| @@ -29,6 +29,7 @@ endif() | |||
| include(GNUInstallDirs) | |||
| include(CheckCXXCompilerFlag) | |||
| include(CheckIPOSupported) | |||
| include(CMakeDependentOption) | |||
| check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) | |||
| @@ -97,6 +98,12 @@ option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) | |||
| option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) | |||
| option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) | |||
| # TODO: add windows support | |||
| # MGE_WITH_CUPTI is user-selectable (default ON) only while every condition in the | |||
| # list below holds: CUDA enabled, imperative runtime built, and neither MSVC nor | |||
| # WIN32. If any condition fails the option is forced to OFF. | |||
| cmake_dependent_option(MGE_WITH_CUPTI "Build with CUPTI" ON | |||
| "MGE_WITH_CUDA;MGE_BUILD_IMPERATIVE_RT;NOT MSVC;NOT WIN32" OFF) | |||
| # Mirror the option into MGB_CUPTI. | |||
| # NOTE(review): presumably this feeds the `#if MGB_CUPTI` checks in the C++ sources | |||
| # through a generated config header -- confirm. | |||
| set(MGB_CUPTI ${MGE_WITH_CUPTI}) | |||
| if(MSVC OR WIN32) | |||
| # FIXME: static link Windows vc runtime with some version from Visual Studio have some | |||
| # runtime issue at some call PATH, for example: _imperative_rt.pyd --> | |||
| @@ -686,6 +693,10 @@ if(MGB_WITH_FLATBUFFERS) | |||
| include(cmake/flatbuffers.cmake) | |||
| endif() | |||
| if(MGE_WITH_CUPTI) | |||
| include(cmake/cupti.cmake) | |||
| endif() | |||
| if(MGE_WITH_CUDA) | |||
| include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | |||
| foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) | |||
| @@ -6,7 +6,7 @@ endif() | |||
| if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | |||
| set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | |||
| endif() | |||
| message("CUDNN ROOT: " ${CUDNN_ROOT_DIR}) | |||
| message(STATUS "CUDNN ROOT: ${CUDNN_ROOT_DIR}") | |||
| if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | |||
| find_library( | |||
| CUDNN_LIBRARY | |||
| @@ -0,0 +1,85 @@ | |||
| # Locate CuPTI (library + headers) inside a CUDA toolkit and expose it as the | |||
| # imported target `libcupti`. | |||
| # | |||
| # Inputs: CUDA_ROOT_DIR (variable or environment), CUDA_PATH / CUDA_BIN_PATH | |||
| # (environment), MGE_CUDA_USE_STATIC, CXX_SUPPORT_GOLD. | |||
| # Outputs: CUPTI_LIBRARY, CUPTI_INCLUDE_DIR, CUPTI_LIBRARY_TYPE, CUPTI_API_VERSION, | |||
| # MGE_CUPTI_USE_STATIC and the imported target `libcupti`. | |||
| # | |||
| # Resolve the toolkit root: an explicit CUDA_ROOT_DIR variable wins, then the first | |||
| # non-empty environment variable in the order listed below. | |||
| if("${CUDA_ROOT_DIR}" STREQUAL "") | |||
| foreach(cupti_env_name CUDA_ROOT_DIR CUDA_PATH CUDA_BIN_PATH) | |||
| if(NOT "$ENV{${cupti_env_name}}" STREQUAL "") | |||
| set(CUDA_ROOT_DIR "$ENV{${cupti_env_name}}") | |||
| break() | |||
| endif() | |||
| endforeach() | |||
| endif() | |||
| if("${CUDA_ROOT_DIR}" STREQUAL "") | |||
| message( | |||
| FATAL_ERROR | |||
| "Can not find CUDA, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH" | |||
| ) | |||
| endif() | |||
| # TODO: find_library(CUDA_ROOT_DIR) in cmake/cuda.cmake | |||
| set(MGE_CUPTI_USE_STATIC ${MGE_CUDA_USE_STATIC}) | |||
| # relates https://stackoverflow.com/questions/67485114 | |||
| # Variables are named (not expanded) inside if() so that an unset or empty | |||
| # MGE_CUDA_USE_STATIC / CXX_SUPPORT_GOLD evaluates to false instead of producing a | |||
| # malformed condition. | |||
| if(MGE_CUDA_USE_STATIC AND CXX_SUPPORT_GOLD) | |||
| message(WARNING "static linking CuPTI with gold may break exception handling,\ | |||
| use shared one instead") | |||
| set(MGE_CUPTI_USE_STATIC OFF) | |||
| endif() | |||
| if(MGE_CUPTI_USE_STATIC) | |||
| find_library( | |||
| CUPTI_LIBRARY | |||
| NAMES libcupti_static.a | |||
| HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI" | |||
| PATH_SUFFIXES lib lib64 | |||
| DOC "CuPTI library.") | |||
| # if(NOT <var>) is true for any *-NOTFOUND value, so no string comparison is needed | |||
| if(NOT CUPTI_LIBRARY) | |||
| message(WARNING "Can not find static CuPTI Library, use shared one instead") | |||
| set(MGE_CUPTI_USE_STATIC OFF) | |||
| endif() | |||
| endif() | |||
| if(MGE_CUPTI_USE_STATIC) | |||
| set(CUPTI_LIBRARY_TYPE STATIC) | |||
| else() | |||
| # find_library only searches again when CUPTI_LIBRARY is unset or *-NOTFOUND | |||
| find_library( | |||
| CUPTI_LIBRARY | |||
| NAMES libcupti.so | |||
| HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI" | |||
| PATH_SUFFIXES lib lib64 | |||
| DOC "CuPTI library.") | |||
| set(CUPTI_LIBRARY_TYPE SHARED) | |||
| endif() | |||
| if(NOT CUPTI_LIBRARY) | |||
| message(FATAL_ERROR "Can not find CuPTI Library") | |||
| endif() | |||
| find_path( | |||
| CUPTI_INCLUDE_DIR | |||
| NAMES cupti.h | |||
| HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI" | |||
| PATH_SUFFIXES include | |||
| DOC "Path to CuPTI include directory.") | |||
| if(NOT CUPTI_INCLUDE_DIR) | |||
| message(FATAL_ERROR "Can not find CuPTI INCLUDE") | |||
| endif() | |||
| # The API version macro is read from cupti_version.h when present, else from cupti.h. | |||
| if(EXISTS "${CUPTI_INCLUDE_DIR}/cupti_version.h") | |||
| file(READ "${CUPTI_INCLUDE_DIR}/cupti_version.h" CUPTI_VERSION_FILE_CONTENTS) | |||
| else() | |||
| file(READ "${CUPTI_INCLUDE_DIR}/cupti.h" CUPTI_VERSION_FILE_CONTENTS) | |||
| endif() | |||
| # Extract the number following `define CUPTI_API_VERSION`; left empty when absent. | |||
| if("${CUPTI_VERSION_FILE_CONTENTS}" MATCHES "define CUPTI_API_VERSION * +([0-9]+)") | |||
| set(CUPTI_API_VERSION "${CMAKE_MATCH_1}") | |||
| else() | |||
| set(CUPTI_API_VERSION "") | |||
| endif() | |||
| # Guard the target so that including this file twice does not fail on redefinition. | |||
| if(NOT TARGET libcupti) | |||
| add_library(libcupti ${CUPTI_LIBRARY_TYPE} IMPORTED) | |||
| set_target_properties( | |||
| libcupti PROPERTIES IMPORTED_LOCATION "${CUPTI_LIBRARY}" | |||
| INTERFACE_INCLUDE_DIRECTORIES "${CUPTI_INCLUDE_DIR}") | |||
| endif() | |||
| message(STATUS "Found CuPTI: ${CUPTI_LIBRARY} (found version: ${CUPTI_API_VERSION})") | |||
| @@ -36,7 +36,7 @@ else() | |||
| PATH_SUFFIXES lib lib64 | |||
| DOC "TRT plugin library.") | |||
| endif() | |||
| message("TRT_LIBRARY" ${TRT_LIBRARY}) | |||
| message(STATUS "TRT_LIBRARY: ${TRT_LIBRARY}") | |||
| if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") | |||
| message( | |||
| FATAL_ERROR | |||
| @@ -51,6 +51,10 @@ if(ANDROID) | |||
| target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) | |||
| endif() | |||
| if(MGE_WITH_CUPTI) | |||
| target_link_libraries(${MODULE_NAME} PRIVATE libcupti) | |||
| endif() | |||
| add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 | |||
| ${PROJECT_BINARY_DIR}/third_party/range-v3) | |||
| target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | |||
| @@ -16,6 +16,10 @@ from weakref import WeakSet | |||
| from .. import _atexit | |||
| from ..core._imperative_rt.core2 import ( | |||
| cupti_available, | |||
| disable_cupti, | |||
| enable_cupti, | |||
| full_sync, | |||
| pop_scope, | |||
| push_scope, | |||
| start_profile, | |||
| @@ -50,13 +54,18 @@ class Profiler(ContextDecorator): | |||
| with profiler: | |||
| # your code here | |||
| # Then open the profile file in chrome timeline window | |||
| """ | |||
| CHROME_TIMELINE = "chrome_timeline.json" | |||
| valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10} | |||
| valid_options = { | |||
| "sample_rate": 0, | |||
| "profile_device": 1, | |||
| "num_tensor_watch": 10, | |||
| "enable_cupti": 0, | |||
| } | |||
| valid_formats = {"chrome_timeline.json", "memory_flow.svg"} | |||
| def __init__( | |||
| @@ -83,6 +92,11 @@ class Profiler(ContextDecorator): | |||
| self._options[opt] = int(kwargs.pop(opt, optval)) | |||
| self._pid = "<PID>" | |||
| self._dump_callback = None | |||
| if self._options.get("enable_cupti", 0): | |||
| if cupti_available(): | |||
| enable_cupti() | |||
| else: | |||
| get_logger().warning("CuPTI unavailable") | |||
| @property | |||
| def path(self): | |||
| @@ -116,7 +130,7 @@ class Profiler(ContextDecorator): | |||
| assert _running_profiler is self | |||
| _running_profiler = None | |||
| sync() | |||
| full_sync() | |||
| self._dump_callback = stop_profile() | |||
| self._pid = os.getpid() | |||
| _living_profilers.add(self) | |||
| @@ -160,6 +174,9 @@ class Profiler(ContextDecorator): | |||
| return func | |||
| def __del__(self): | |||
| # Mirror of the set-up done in __init__: when this profiler was created with the | |||
| # "enable_cupti" option and CuPTI is usable, switch CuPTI off again. | |||
| if self._options.get("enable_cupti", 0): | |||
| if cupti_available(): | |||
| disable_cupti() | |||
| # NOTE(review): self.dump() appears to run for every instance, not only when | |||
| # CuPTI was enabled -- confirm against the original indentation. | |||
| self.dump() | |||
| @@ -11,6 +11,7 @@ | |||
| #include "megbrain/common.h" | |||
| #include "megbrain/dtype.h" | |||
| #include "megbrain/imperative/cpp_cupti.h" | |||
| #include "megbrain/imperative/ops/autogen.h" | |||
| #include "megbrain/imperative/ops/backward_graph.h" | |||
| #include "megbrain/imperative/ops/utility.h" | |||
| @@ -982,6 +983,7 @@ void init_tensor(py::module m) { | |||
| m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> { | |||
| channel->stop_profile(); | |||
| channel->sync(); | |||
| CompNode::sync_all(); | |||
| imperative::Profiler::stop_profile(); | |||
| auto results = std::make_shared<imperative::Profiler::bundle_t>( | |||
| imperative::Profiler::collect()); | |||
| @@ -990,6 +992,9 @@ void init_tensor(py::module m) { | |||
| results = nullptr; | |||
| }; | |||
| }); | |||
| m.def("enable_cupti", &cupti::enable); | |||
| m.def("disable_cupti", &cupti::disable); | |||
| m.def("cupti_available", &cupti::available); | |||
| m.def("sync", [channel]() { | |||
| if (channel->check_available()) { | |||
| channel->sync(); | |||
| @@ -0,0 +1,273 @@ | |||
| #include "megbrain/imperative/cpp_cupti.h" | |||
| #include <cinttypes> | |||
| #include <cstddef> | |||
| #include <cstdlib> | |||
| #include "megbrain/exception.h" | |||
| #include "megbrain/imperative/profiler.h" | |||
| #include "megbrain/imperative/utils/platform.h" | |||
| #include "./profiler/events.h" | |||
| #if MGB_CUPTI | |||
| #include "cupti.h" | |||
| #define CUPTI_CALL(call) \ | |||
| do { \ | |||
| CUptiResult _status = call; \ | |||
| if (_status != CUPTI_SUCCESS) { \ | |||
| const char* errstr; \ | |||
| cuptiGetResultString(_status, &errstr); \ | |||
| mgb_assert(_status == CUPTI_SUCCESS, "cupti error: %s", errstr); \ | |||
| } \ | |||
| } while (0) | |||
| #endif | |||
| namespace mgb::imperative::cupti { | |||
| #if MGB_CUPTI | |||
| namespace { | |||
| CUpti_SubscriberHandle cuptiSubscriber; | |||
| // Callback handed to cuptiSubscribe() in enable(). For each driver/runtime API call | |||
| // it records a profiler event when the call is entered (CUPTI_API_ENTER) and a | |||
| // matching *Finish event when it returns (CUPTI_API_EXIT). | |||
| // userdata: unused here (enable() passes nullptr) | |||
| // domain: callback domain; only DRIVER_API and RUNTIME_API are handled | |||
| // cb_id: id of the API function within the domain | |||
| // cb_info: cast to CUpti_CallbackData for both handled domains | |||
| void cuptiSubscriberCallback( | |||
| void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, | |||
| const void* cb_info) { | |||
| using namespace profiler; | |||
| switch (domain) { | |||
| case CUPTI_CB_DOMAIN_DRIVER_API: { | |||
| auto cb_data = (const CUpti_CallbackData*)cb_info; | |||
| switch (cb_id) { | |||
| // kernel launches carry the kernel symbol name | |||
| case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { | |||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIKernelLaunchEvent, cb_data->correlationId, | |||
| cb_data->symbolName); | |||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIKernelLaunchFinishEvent, cb_data->correlationId, | |||
| cb_data->symbolName); | |||
| } | |||
| break; | |||
| } | |||
| // no break after this empty case: cuMemcpyHtoA falls through into the | |||
| // cuMemcpyAsync handling below | |||
| case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA: { | |||
| } | |||
| case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: { | |||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIMemcpyLaunchEvent, cb_data->correlationId); | |||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIMemcpyLaunchFinishEvent, cb_data->correlationId); | |||
| } | |||
| break; | |||
| } | |||
| // every other driver API call is recorded generically under its function name | |||
| default: { | |||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIDriverEvent, cb_data->correlationId, | |||
| cb_data->functionName); | |||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIDriverFinishEvent, cb_data->correlationId, | |||
| cb_data->functionName); | |||
| } | |||
| } | |||
| } | |||
| break; | |||
| } | |||
| // all runtime API calls are recorded under their function name | |||
| case CUPTI_CB_DOMAIN_RUNTIME_API: { | |||
| auto cb_data = (const CUpti_CallbackData*)cb_info; | |||
| if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIRuntimeEvent, cb_data->correlationId, | |||
| cb_data->functionName); | |||
| } else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
| MGB_RECORD_EVENT( | |||
| CUPTIRuntimeFinishEvent, cb_data->correlationId, | |||
| cb_data->functionName); | |||
| } | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| // Convert one CUPTI activity record into a profiler event. Kernel, memcpy and memset | |||
| // records are handled; every other record kind is ignored. | |||
| void handleActivity(CUpti_Activity* record) { | |||
| using namespace std::chrono_literals; | |||
| // Subtracted from the end time of kernel and memset records (not memcpy). | |||
| // NOTE(review): presumably keeps back-to-back spans from touching in the | |||
| // timeline -- confirm the reason for 16ns. | |||
| auto delta = 16ns; | |||
| switch (record->kind) { | |||
| case CUPTI_ACTIVITY_KIND_KERNEL: | |||
| case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { | |||
| // both kernel kinds are read through the CUpti_ActivityKernel4 layout | |||
| auto kernel = cupti::activity<CUpti_ActivityKernel4>(record); | |||
| MGB_RECORD_EVENT( | |||
| profiler::CUPTIKernelExecuteEvent, kernel->correlationId, | |||
| kernel->name, kernel.stream(), kernel.start(), | |||
| kernel.end() - delta); | |||
| break; | |||
| } | |||
| case CUPTI_ACTIVITY_KIND_MEMCPY: { | |||
| auto memcpy = cupti::activity<CUpti_ActivityMemcpy>(record); | |||
| MGB_RECORD_EVENT( | |||
| profiler::CUPTIMemcpyEvent, memcpy->correlationId, memcpy->srcKind, | |||
| memcpy->dstKind, memcpy->bytes, memcpy.stream(), memcpy.start(), | |||
| memcpy.end()); | |||
| break; | |||
| } | |||
| case CUPTI_ACTIVITY_KIND_MEMSET: { | |||
| auto memset = cupti::activity<CUpti_ActivityMemset>(record); | |||
| MGB_RECORD_EVENT( | |||
| profiler::CUPTIMemsetEvent, memset->correlationId, memset->value, | |||
| memset->bytes, memset.stream(), memset.start(), | |||
| memset.end() - delta); | |||
| break; | |||
| } | |||
| default: | |||
| break; | |||
| } | |||
| } | |||
| // Storage type of one activity buffer: 8 MiB, aligned to ACTIVITY_RECORD_ALIGNMENT. | |||
| using activity_buffer_t = | |||
| std::aligned_storage_t<8 * 1024 * 1024, ACTIVITY_RECORD_ALIGNMENT>; | |||
| // Activity-API callback: allocate a fresh buffer for CUPTI to fill. The buffer is | |||
| // released in bufferCompleted. | |||
| // buffer: receives the start address of the new buffer | |||
| // size: receives the buffer size in bytes | |||
| // maxNumRecords: set to 0 | |||
| void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { | |||
| auto* storage = new activity_buffer_t(); | |||
| *maxNumRecords = 0; | |||
| *size = sizeof(activity_buffer_t); | |||
| *buffer = reinterpret_cast<uint8_t*>(storage); | |||
| } | |||
| // Activity-API callback: CUPTI returns a buffer that bufferRequested allocated. | |||
| // Every record in it is forwarded to handleActivity, then the buffer is freed. | |||
| // ctx, streamId: identify the stream the buffer belongs to; used to query the | |||
| // number of dropped records | |||
| // buffer: buffer start, allocated as an activity_buffer_t | |||
| // size: total buffer size (unused) | |||
| // validSize: number of bytes that contain records; 0 means nothing to read | |||
| // Raises (via CUPTI_CALL / mgb_assert) on an unexpected CUPTI status or when records | |||
| // were dropped. The buffer is released before anything is raised here, so these | |||
| // error paths no longer leak it. | |||
| void bufferCompleted( | |||
| CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, | |||
| size_t validSize) { | |||
| CUptiResult status = CUPTI_SUCCESS; | |||
| size_t dropped = 0; | |||
| if (validSize > 0) { | |||
| CUpti_Activity* record = NULL; | |||
| // walk the records until CUPTI reports anything other than success | |||
| while ((status = cuptiActivityGetNextRecord(buffer, validSize, &record)) == | |||
| CUPTI_SUCCESS) { | |||
| handleActivity(record); | |||
| } | |||
| // CUPTI_ERROR_MAX_LIMIT_REACHED is the regular end-of-buffer signal; only | |||
| // then is the dropped-record count queried | |||
| if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { | |||
| status = cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped); | |||
| } | |||
| } | |||
| // free first, report afterwards | |||
| delete reinterpret_cast<activity_buffer_t*>(buffer); | |||
| CUPTI_CALL(status); | |||
| mgb_assert(dropped == 0, "%zu records dropped", dropped); | |||
| } | |||
| static bool initialized = false; | |||
| } // namespace | |||
| // Report whether CuPTI can be used: true only when the CUPTI version found at | |||
| // runtime equals the CUPTI_API_VERSION this file was compiled against. | |||
| // On mismatch a warning is logged once per process and false is returned. | |||
| // Raises through CUPTI_CALL if cuptiGetVersion itself fails. | |||
| bool available() { | |||
| uint32_t compiletime_version = (CUPTI_API_VERSION); | |||
| uint32_t runtime_version; | |||
| CUPTI_CALL(cuptiGetVersion(&runtime_version)); | |||
| if (compiletime_version != runtime_version) { | |||
| static std::once_flag once; | |||
| std::call_once(once, [&] { | |||
| // argument order follows the format text: runtime version first, then | |||
| // the compile-time one | |||
| mgb_log_warn( | |||
| "CuPTI version %d mismatch against compiletime version %d. " | |||
| "This may caused by user config LD_LIBRARY_PATH " | |||
| "at unix-like env or config PATH at Windows env", | |||
| (int)runtime_version, (int)compiletime_version); | |||
| }); | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| // Switch CuPTI collection on: subscribe the API callback, enable the driver and | |||
| // runtime callback domains, enable the activity kinds listed below and register the | |||
| // activity buffer callbacks. Asserts that CuPTI is not already initialized. | |||
| void enable() { | |||
| // not thread safe | |||
| mgb_assert(!initialized, "cupti already initialized"); | |||
| // callback | |||
| CUPTI_CALL(cuptiSubscribe( | |||
| &cuptiSubscriber, (CUpti_CallbackFunc)cuptiSubscriberCallback, | |||
| (void*)nullptr)); | |||
| const CUpti_CallbackDomain callback_domains[] = { | |||
| CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_CB_DOMAIN_RUNTIME_API}; | |||
| for (CUpti_CallbackDomain callback_domain : callback_domains) { | |||
| CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, callback_domain)); | |||
| } | |||
| // activity (enabled in the listed order) | |||
| const CUpti_ActivityKind activity_kinds[] = { | |||
| CUPTI_ACTIVITY_KIND_DEVICE, | |||
| CUPTI_ACTIVITY_KIND_CONTEXT, | |||
| CUPTI_ACTIVITY_KIND_DRIVER, | |||
| CUPTI_ACTIVITY_KIND_RUNTIME, | |||
| CUPTI_ACTIVITY_KIND_MEMCPY, | |||
| CUPTI_ACTIVITY_KIND_MEMSET, | |||
| CUPTI_ACTIVITY_KIND_NAME, | |||
| CUPTI_ACTIVITY_KIND_MARKER, | |||
| CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, | |||
| CUPTI_ACTIVITY_KIND_OVERHEAD}; | |||
| for (CUpti_ActivityKind activity_kind : activity_kinds) { | |||
| CUPTI_CALL(cuptiActivityEnable(activity_kind)); | |||
| } | |||
| CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); | |||
| initialized = true; | |||
| } | |||
| // Switch CuPTI collection off. Asserts that enable() has run, flushes pending | |||
| // activity records, calls cuptiFinalize() and clears the initialized flag. | |||
| void disable() { | |||
| mgb_assert(initialized, "cupti not initialized yet"); | |||
| // flush before finalizing so that pending records reach bufferCompleted | |||
| flush(); | |||
| CUPTI_CALL(cuptiFinalize()); | |||
| initialized = false; | |||
| } | |||
| // Force CUPTI to hand over all pending activity buffers; does nothing while CuPTI | |||
| // is not initialized. | |||
| void flush() { | |||
| if (!initialized) { | |||
| return; | |||
| } | |||
| CUPTI_CALL(cuptiActivityFlushAll(1)); | |||
| } | |||
| // True between a successful enable() and the next disable(). | |||
| bool enabled() { | |||
| return initialized; | |||
| } | |||
| // Current CUPTI timestamp as a cupti::time_point (nanosecond ticks). | |||
| // Raises through CUPTI_CALL if cuptiGetTimestamp fails. | |||
| time_point clock::now() { | |||
| uint64_t timestamp; | |||
| CUPTI_CALL(cuptiGetTimestamp(&timestamp)); | |||
| // overflow? the unsigned 64-bit value is narrowed to the signed tick type, which | |||
| // only misbehaves for timestamps of 2^63 ns or more | |||
| return time_point(duration((int64_t)timestamp)); | |||
| } | |||
| #else | |||
| // Error thrown by the stub implementations below when the build has no CuPTI | |||
| // support. The message distinguishes a CUDA build with CuPTI switched off from a | |||
| // build without CUDA. | |||
| class CuPTIUnavailableError : public MegBrainError { | |||
| public: | |||
| CuPTIUnavailableError() | |||
| : MegBrainError( | |||
| #if MGB_CUDA | |||
| "CuPTI disabled at compile time" | |||
| #else | |||
| "CuPTI unsupported on non cuda platform" | |||
| #endif | |||
| ) { | |||
| } | |||
| }; | |||
| // Stub API used when MGB_CUPTI is off: queries answer "no", flush() is a no-op and | |||
| // everything that would need CuPTI throws CuPTIUnavailableError. | |||
| bool available() { | |||
| return false; | |||
| } | |||
| void enable() { | |||
| throw CuPTIUnavailableError(); | |||
| } | |||
| void disable() { | |||
| throw CuPTIUnavailableError(); | |||
| } | |||
| // deliberately empty so callers may flush unconditionally | |||
| void flush() {} | |||
| bool enabled() { | |||
| return false; | |||
| } | |||
| time_point clock::now() { | |||
| throw CuPTIUnavailableError(); | |||
| } | |||
| #endif | |||
| } // namespace mgb::imperative::cupti | |||
| @@ -12,7 +12,9 @@ | |||
| #include "megbrain/imperative/profiler.h" | |||
| #include <chrono> | |||
| #include <unordered_map> | |||
| #include "megbrain/imperative/cpp_cupti.h" | |||
| #include "megbrain/imperative/ops/opr_attr.h" | |||
| #include "megbrain/imperative/physical_tensor.h" | |||
| @@ -48,6 +50,21 @@ bool Profiler::sm_profiling = false; | |||
| thread_local Profiler* Profiler::tm_profiler = nullptr; | |||
| std::atomic_size_t Profiler::sm_preferred_capacity; | |||
| // Begin a profiling session: asserts none is running, stores the host start time | |||
| // and sets the profiling flag. When CuPTI is enabled, a CUPTITimestampEvent holding | |||
| // the current CUPTI clock value is recorded as well. | |||
| void Profiler::start_profile() { | |||
| mgb_assert(!sm_profiling); | |||
| sm_start_at = Timer::record_host(); | |||
| sm_profiling = true; | |||
| // recorded after sm_profiling is set | |||
| // NOTE(review): presumably MGB_RECORD_EVENT only records while profiling -- confirm | |||
| if (cupti::enabled()) { | |||
| MGB_RECORD_EVENT(profiler::CUPTITimestampEvent, cupti::clock::now()); | |||
| } | |||
| } | |||
| // End the profiling session: asserts one is running, flushes pending CuPTI | |||
| // activity records, then clears the profiling flag. | |||
| void Profiler::stop_profile() { | |||
| mgb_assert(sm_profiling); | |||
| // flush while sm_profiling is still true | |||
| cupti::flush(); | |||
| sm_profiling = false; | |||
| } | |||
| auto Profiler::get_thread_dict() -> thread_dict_t { | |||
| thread_dict_t thread_dict; | |||
| for (auto&& [tid, profiler] : sm_profilers) { | |||
| @@ -19,6 +19,7 @@ | |||
| #include "nlohmann/json.hpp" | |||
| #include "megbrain/imperative/utils/platform.h" | |||
| #include "megbrain/utils/debug.h" | |||
| #include "./formats.h" | |||
| @@ -198,6 +199,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
| decltype(getpid()) pid = getpid(); | |||
| std::string pid_str = std::to_string(pid); | |||
| ChromeTimelineEventVisitor() {} | |||
| ChromeTraceEvent& new_event( | |||
| std::string name, char ph, size_t tid, profiler::HostTime time) { | |||
| return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( | |||
| @@ -213,8 +216,13 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
| .ts(since_start(current->time)); | |||
| } | |||
| // Create a trace event for something that happened on a CUPTI stream: the stream | |||
| // is mapped to its timeline tid and the CUPTI timestamp to host time, then the | |||
| // event is created through new_event. | |||
| ChromeTraceEvent& new_cupti_event( | |||
| std::string name, char ph, cupti::stream_t stream, | |||
| cupti::time_point timestamp) { | |||
| size_t stream_tid = to_tid(stream); | |||
| profiler::HostTime host_time = time_from_cupti(timestamp); | |||
| return new_event(name, ph, stream_tid, host_time); | |||
| } | |||
| ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | |||
| using namespace std::literals::chrono_literals; | |||
| auto time = since_start(to_device_time(current->time, device)); | |||
| return trace_events.new_event() | |||
| .name(name) | |||
| @@ -391,6 +399,80 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
| auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>( | |||
| current_device_time - current_host_time); | |||
| new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchEvent>) { | |||
| new_host_event(demangle(event.name), 'B'); | |||
| new_host_event(pid_str, 's') | |||
| .id(event.correlation_id) | |||
| .cat("KernelLink") | |||
| .scope(pid_str); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchFinishEvent>) { | |||
| new_host_event(demangle(event.name), 'E'); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIKernelExecuteEvent>) { | |||
| new_cupti_event(demangle(event.name), 'B', event.stream, event.start) | |||
| .arg("execution_time", (event.end - event.start).count()); | |||
| new_cupti_event(pid_str, 'f', event.stream, event.end) | |||
| .id(event.correlation_id) | |||
| .bp('e') | |||
| .cat("KernelLink") | |||
| .scope(pid_str); | |||
| new_cupti_event(demangle(event.name), 'E', event.stream, event.end) | |||
| .arg("execution_time", (event.end - event.start).count()); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchEvent>) { | |||
| new_host_event("Memcpy", 'B'); | |||
| new_host_event(pid_str, 's') | |||
| .id(event.correlation_id) | |||
| .cat("CUPTILink") | |||
| .scope(pid_str); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchFinishEvent>) { | |||
| new_host_event("Memcpy", 'E'); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyEvent>) { | |||
| // Map a CUPTI memory-kind value to its enumerator name; any value outside the | |||
| // table yields "invalid". The check uses >= because an index equal to the number | |||
| // of entries is already one past the end. | |||
| auto memkind2str = [](uint8_t kind) { | |||
| const char* const valid_kinds[] = { | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_PINNED", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_ARRAY", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC", | |||
| "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC"}; | |||
| if (kind >= (sizeof(valid_kinds) / sizeof(valid_kinds[0]))) { | |||
| return "invalid"; | |||
| } | |||
| return valid_kinds[kind]; | |||
| }; | |||
| new_cupti_event("Memcpy", 'B', event.stream, event.start) | |||
| .arg("bytes", imperative::to_string(event.bytes)) | |||
| .arg("src_kind", memkind2str(event.src_kind)) | |||
| .arg("dst_kind", memkind2str(event.dst_kind)); | |||
| new_cupti_event(pid_str, 'f', event.stream, event.start) | |||
| .id(event.correlation_id) | |||
| .bp('e') | |||
| .cat("CUPTILink") | |||
| .scope(pid_str); | |||
| new_cupti_event("Memcpy", 'E', event.stream, event.end) | |||
| .arg("bytes", imperative::to_string(event.bytes)) | |||
| .arg("src_kind", memkind2str(event.src_kind)) | |||
| .arg("dst_kind", memkind2str(event.dst_kind)); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIMemsetEvent>) { | |||
| // Memset span on the stream's timeline row: 'B' at the start timestamp and 'E' | |||
| // at the end timestamp, both carrying the same arguments. | |||
| new_cupti_event("Memset", 'B', event.stream, event.start) | |||
| .arg("value", imperative::to_string(event.value)) | |||
| .arg("bytes", imperative::to_string(event.bytes)); | |||
| new_cupti_event("Memset", 'E', event.stream, event.end) | |||
| .arg("value", imperative::to_string(event.value)) | |||
| .arg("bytes", imperative::to_string(event.bytes)); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeEvent>) { | |||
| new_host_event(event.name, 'B'); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeFinishEvent>) { | |||
| new_host_event(event.name, 'E'); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIDriverEvent>) { | |||
| new_host_event(event.name, 'B'); | |||
| new_host_event(pid_str, 's') | |||
| .id(event.correlation_id) | |||
| .cat("CUPTILink") | |||
| .scope(pid_str); | |||
| } else if constexpr (std::is_same_v<TEvent, CUPTIDriverFinishEvent>) { | |||
| new_host_event(event.name, 'E'); | |||
| } | |||
| } | |||
| @@ -403,7 +485,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
| if (thread_dict.count(host)) { | |||
| trace_events.new_event() | |||
| .name("thread_name") | |||
| .pid('M') | |||
| .ph('M') | |||
| .pid(pid) | |||
| .tid(to_tid(host)) | |||
| .arg("name", thread_dict.at(host)); | |||
| } | |||
| @@ -411,7 +494,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
| for (auto&& device : devices()) { | |||
| trace_events.new_event() | |||
| .name("thread_name") | |||
| .pid('M') | |||
| .ph('M') | |||
| .pid(pid) | |||
| .tid(to_tid(device)) | |||
| .arg("name", device.to_string_logical()); | |||
| } | |||
| @@ -419,7 +503,7 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
| }; | |||
| void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { | |||
| ChromeTimelineEventVisitor visitor; | |||
| ChromeTimelineEventVisitor visitor{}; | |||
| visitor.process_events(result); | |||
| visitor.name_threads(result.thread_dict); | |||
| auto trace_events = std::move(visitor.trace_events); | |||
| @@ -16,6 +16,7 @@ | |||
| #include "../interpreter/stack_manager.h" | |||
| #include "../op_trait.h" | |||
| #include "megbrain/imperative/cpp_cupti.h" | |||
| namespace mgb::imperative::profiler { | |||
| @@ -181,6 +182,60 @@ DEF_DUR_EVENT(HostToDevice, { | |||
| void* device_ptr; | |||
| }); | |||
| // cupti events | |||
| // NOTE(review): DEF_DUR_EVENT(X, ...) appears to declare both XEvent and | |||
| // XFinishEvent (the callback code records e.g. CUPTIKernelLaunchFinishEvent) -- | |||
| // confirm against the macro definition. | |||
| // CUPTI clock value; recorded by Profiler::start_profile when CuPTI is enabled. | |||
| DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; }); | |||
| // Host-side cuLaunchKernel call; name is the kernel symbol name. | |||
| DEF_DUR_EVENT(CUPTIKernelLaunch, { | |||
| uint32_t correlation_id; | |||
| const char* name; | |||
| }); | |||
| // Kernel execution taken from a CUPTI kernel activity record. | |||
| DEF_EVENT(CUPTIKernelExecute, { | |||
| uint32_t correlation_id; | |||
| const char* name; | |||
| cupti::stream_t stream; | |||
| cupti::time_point start; | |||
| cupti::time_point end; | |||
| }); | |||
| // Host-side memcpy driver call (cuMemcpyHtoA / cuMemcpyAsync). | |||
| DEF_DUR_EVENT(CUPTIMemcpyLaunch, { uint32_t correlation_id; }); | |||
| // Memcpy taken from a CUPTI memcpy activity record. | |||
| DEF_EVENT(CUPTIMemcpy, { | |||
| uint32_t correlation_id; | |||
| uint8_t src_kind; | |||
| uint8_t dst_kind; | |||
| uint64_t bytes; | |||
| cupti::stream_t stream; | |||
| cupti::time_point start; | |||
| cupti::time_point end; | |||
| }); | |||
| // Memset taken from a CUPTI memset activity record. | |||
| DEF_EVENT(CUPTIMemset, { | |||
| uint32_t correlation_id; | |||
| uint32_t value; | |||
| uint64_t bytes; | |||
| cupti::stream_t stream; | |||
| cupti::time_point start; | |||
| cupti::time_point end; | |||
| }); | |||
| // NOTE(review): no producer of this event is visible in the CuPTI sources -- confirm use. | |||
| DEF_EVENT(CUPTIUnknownDevice, {}); | |||
| // Any runtime API call; name is the API function name. | |||
| DEF_DUR_EVENT(CUPTIRuntime, { | |||
| uint32_t correlation_id; | |||
| const char* name; | |||
| }); | |||
| // Any driver API call not covered above; name is the API function name. | |||
| DEF_DUR_EVENT(CUPTIDriver, { | |||
| uint32_t correlation_id; | |||
| const char* name; | |||
| }); | |||
| // Associates a CUPTI stream with a CompNode. | |||
| DEF_EVENT(CUPTIIdentifyStream, { | |||
| cupti::stream_t stream; | |||
| CompNode device; | |||
| }); | |||
| #undef DEF_EVENT | |||
| #undef DEF_DUR_EVENT | |||
| @@ -180,10 +180,13 @@ private: | |||
| HostTime m_start_time; | |||
| CompNode::UnorderedMap<size_t> m_device_tid_table; | |||
| std::unordered_map<std::thread::id, size_t> m_host_tid_table; | |||
| std::unordered_map<cupti::stream_t, size_t> m_cupti_tid_table; | |||
| CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> | |||
| m_device_timeline; | |||
| std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | |||
| std::unordered_map<std::string, int64_t> m_counter_table; | |||
| std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp = | |||
| {}; | |||
| protected: | |||
| Profiler::Record* current; | |||
| @@ -191,6 +194,11 @@ protected: | |||
| ProfileTensorState* current_tensor; | |||
| protected: | |||
| // Next free timeline tid: the number of entries already present across the host, | |||
| // device and cupti tid tables. | |||
| size_t next_tid() { | |||
| size_t allocated = m_host_tid_table.size(); | |||
| allocated += m_device_tid_table.size(); | |||
| allocated += m_cupti_tid_table.size(); | |||
| return allocated; | |||
| } | |||
| profiler::Duration since_start(profiler::HostTime time) { | |||
| return time - m_start_time; | |||
| } | |||
| @@ -229,6 +237,10 @@ protected: | |||
| size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } | |||
| // Timeline tid registered for a CUPTI stream; .at() throws std::out_of_range when | |||
| // the stream was never registered in m_cupti_tid_table. | |||
| size_t to_tid(cupti::stream_t cupti_stream) { | |||
| return m_cupti_tid_table.at(cupti_stream); | |||
| } | |||
| SmallVector<std::thread::id> host_threads() { | |||
| SmallVector<std::thread::id> host_threads; | |||
| for (auto&& [host, _] : m_host_tid_table) { | |||
| @@ -254,6 +266,13 @@ protected: | |||
| value += delta; | |||
| } | |||
| // Translate a CUPTI timestamp to host time using the stored (host time, cupti time) | |||
| // anchor pair: host anchor plus the CUPTI time elapsed since the CUPTI anchor. | |||
| // Asserts that the anchor has been recorded. | |||
| profiler::HostTime time_from_cupti(cupti::time_point timestamp) { | |||
| mgb_assert(m_cupti_timestamp.has_value()); | |||
| const auto& [host_anchor, cupti_anchor] = *m_cupti_timestamp; | |||
| auto elapsed = std::chrono::duration_cast<profiler::HostTime::duration>( | |||
| timestamp - cupti_anchor); | |||
| return host_anchor + elapsed; | |||
| } | |||
| public: | |||
| void process_events(Profiler::bundle_t& bundle) { | |||
| m_start_time = bundle.start_at; | |||
| @@ -272,7 +291,11 @@ public: | |||
| TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, | |||
| AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, | |||
| ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, | |||
| HostToDeviceFinishEvent> | |||
| HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, | |||
| CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, | |||
| CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, | |||
| CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent, | |||
| CUPTIDriverFinishEvent, CUPTIMemsetEvent> | |||
| converter; | |||
| auto for_each_entry = [&](auto&& handler) { | |||
| @@ -289,7 +312,9 @@ public: | |||
| std::shared_ptr<CompNode::Event> device; | |||
| }; | |||
| CompNode::UnorderedMap<DeviceStartPair> device_start_table; | |||
| std::unordered_map<cupti::stream_t, CompNode> cupti_stream_table; | |||
| // record device time | |||
| for_each_entry([&](auto&& event) { | |||
| using T = std::decay_t<decltype(event)>; | |||
| if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | |||
| @@ -313,8 +338,7 @@ public: | |||
| // register host threads | |||
| for_each_entry([&](auto&& event) { | |||
| if (!m_host_tid_table.count(current->tid)) { | |||
| m_host_tid_table[current->tid] = { | |||
| m_device_tid_table.size() + m_host_tid_table.size()}; | |||
| m_host_tid_table[current->tid] = next_tid(); | |||
| } | |||
| }); | |||
| @@ -340,14 +364,39 @@ public: | |||
| } else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||
| auto& tensor = m_tensors[event.tensor_id]; | |||
| if (!m_device_tid_table.count(event.device)) { | |||
| m_device_tid_table[event.device] = { | |||
| m_device_tid_table.size() + m_host_tid_table.size()}; | |||
| m_device_tid_table[event.device] = next_tid(); | |||
| } | |||
| tensor.device = event.device; | |||
| tensor.layout = event.layout; | |||
| } | |||
| }); | |||
| for_each_entry([&](auto&& event) { | |||
| using T = std::decay_t<decltype(event)>; | |||
| if constexpr (std::is_same_v<T, CUPTIIdentifyStreamEvent>) { | |||
| if (!m_cupti_tid_table.count(event.stream)) { | |||
| m_cupti_tid_table[event.stream] = | |||
| m_device_tid_table.at(event.device); | |||
| } | |||
| } | |||
| }); | |||
| // record cupti streams | |||
| for_each_entry([&](auto&& event) { | |||
| using T = std::decay_t<decltype(event)>; | |||
| if constexpr ( | |||
| std::is_same_v<T, CUPTIKernelExecuteEvent> || | |||
| std::is_same_v<T, CUPTIMemcpyEvent> || | |||
| std::is_same_v<T, CUPTIMemsetEvent>) { | |||
| if (!m_cupti_tid_table.count(event.stream)) { | |||
| m_cupti_tid_table[event.stream] = next_tid(); | |||
| } | |||
| } else if constexpr (std::is_same_v<T, CUPTITimestampEvent>) { | |||
| mgb_assert(!m_cupti_timestamp.has_value()); | |||
| m_cupti_timestamp.emplace(current->time, event.timestamp); | |||
| } | |||
| }); | |||
| // replay execution | |||
| using namespace std::placeholders; | |||
| for_each_entry([&](auto&& event) { | |||
| @@ -0,0 +1,25 @@ | |||
| #include "megbrain/imperative/utils/platform.h" | |||
| #ifdef __GNUG__ | |||
| #include <cxxabi.h> | |||
| #include <cstdlib> | |||
| #include <memory> | |||
| #endif | |||
| using namespace mgb; | |||
| using namespace imperative; | |||
| /* | |||
| * Turn a mangled C++ symbol name into its readable form. | |||
| * With GNU-compatible compilers this goes through abi::__cxa_demangle; when | |||
| * demangling fails, or on other compilers, the input is returned unchanged. See | |||
| * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname | |||
| */ | |||
| std::string mgb::imperative::demangle(std::string mangled) { | |||
| #ifdef __GNUG__ | |||
| int status = -1; | |||
| char* raw = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status); | |||
| // the returned buffer is malloc-allocated; the guard frees it on every path | |||
| std::unique_ptr<char, void (*)(void*)> guard{raw, std::free}; | |||
| if (status != 0) { | |||
| return mangled; | |||
| } | |||
| return std::string(raw); | |||
| #else | |||
| return mangled; | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,86 @@ | |||
| #pragma once | |||
| #include <chrono> | |||
| #include <ctime> | |||
| #include "megbrain/common.h" | |||
| #include "megbrain/imperative/utils/to_string.h" | |||
| namespace mgb::imperative::cupti { | |||
| // Clock type following the std::chrono clock member layout, with nanosecond | |||
| // resolution. now() is only declared here; its definition lives elsewhere. | |||
| struct clock { | |||
| using duration = std::chrono::nanoseconds; | |||
| using rep = duration::rep; | |||
| using period = duration::period; | |||
| using time_point = std::chrono::time_point<clock>; | |||
| // constexpr static members are implicitly inline since C++17, so odr-using | |||
| // is_steady needs no out-of-class definition (a plain `static const bool` would) | |||
| static constexpr bool is_steady = false; | |||
| static time_point now() /* noexcept */; | |||
| }; | |||
| using time_point = clock::time_point; | |||
| using duration = clock::duration; | |||
| // Aggregate holding a numeric device id; two values are equal when the ids match. | |||
| struct device_t { | |||
| uint32_t device_id; | |||
| bool operator==(const device_t& other) const { | |||
| return other.device_id == device_id; | |||
| } | |||
| }; | |||
| // A context id combined with the inherited device id; equal only when both match. | |||
| struct context_t : device_t { | |||
| uint32_t context_id; | |||
| bool operator==(const context_t& other) const { | |||
| if (!device_t::operator==(other)) { | |||
| return false; | |||
| } | |||
| return other.context_id == context_id; | |||
| } | |||
| }; | |||
| // A stream id combined with the inherited context and device ids; equal only when | |||
| // all three match. | |||
| struct stream_t : context_t { | |||
| uint32_t stream_id; | |||
| bool operator==(const stream_t& other) const { | |||
| if (!context_t::operator==(other)) { | |||
| return false; | |||
| } | |||
| return other.stream_id == stream_id; | |||
| } | |||
| }; | |||
| // Free-function interface of the cupti namespace; only declared in this header. | |||
| // NOTE(review): the per-function notes are inferred from the names alone -- confirm | |||
| // against the implementation. | |||
| bool available();  // presumably: whether CUPTI can be used in this build/runtime | |||
| void enable();     // presumably: start CUPTI activity collection | |||
| void disable();    // presumably: stop CUPTI activity collection | |||
| void flush();      // presumably: flush buffered activity records | |||
| bool enabled();    // presumably: whether collection is currently on | |||
| // Typed view over a raw activity record passed in as void*. The wrapper has no | |||
| // destructor and never frees the record. Accessors read the record's start, end, | |||
| // deviceId, contextId and streamId fields and convert them to the cupti clock and | |||
| // id types; operator-> gives direct access to the underlying record. | |||
| template <typename TActivity> | |||
| struct activity { | |||
| private: | |||
| TActivity* m_ptr; | |||
| public: | |||
| // intentionally implicit: callers may pass a raw record pointer directly | |||
| activity(void* ptr) : m_ptr(static_cast<TActivity*>(ptr)) {} | |||
| time_point start() const { | |||
| duration since_epoch(m_ptr->start); | |||
| return time_point(since_epoch); | |||
| } | |||
| time_point end() const { | |||
| duration since_epoch(m_ptr->end); | |||
| return time_point(since_epoch); | |||
| } | |||
| device_t device() const { | |||
| device_t dev{m_ptr->deviceId}; | |||
| return dev; | |||
| } | |||
| context_t context() const { | |||
| context_t ctx{device(), m_ptr->contextId}; | |||
| return ctx; | |||
| } | |||
| stream_t stream() const { | |||
| stream_t strm{context(), m_ptr->streamId}; | |||
| return strm; | |||
| } | |||
| TActivity* operator->() const { return m_ptr; } | |||
| }; | |||
| } // namespace mgb::imperative::cupti | |||
| // Hash for stream_t so it can key unordered containers. Mixes device_id, | |||
| // context_id and stream_id -- the same fields stream_t::operator== compares -- so | |||
| // equal stream ids on different devices/contexts no longer always collide. | |||
| // Declared with `struct` to match the class-key of the primary std::hash template. | |||
| template <> | |||
| struct std::hash<mgb::imperative::cupti::stream_t> { | |||
| size_t operator()(const mgb::imperative::cupti::stream_t& value) const noexcept { | |||
| size_t seed = value.device_id; | |||
| // boost::hash_combine-style mixing, applied once per remaining field | |||
| seed ^= value.context_id + 0x9e3779b9u + (seed << 6) + (seed >> 2); | |||
| seed ^= value.stream_id + 0x9e3779b9u + (seed << 6) + (seed >> 2); | |||
| return seed; | |||
| } | |||
| }; | |||
| @@ -194,16 +194,9 @@ public: | |||
| // Returns the current value of the static flag sm_profiling. | |||
| static bool is_profiling() { return sm_profiling; } | |||
| // NOTE(review): this is a diff hunk (-194,16 +194,9) without +/- markers; the | |||
| // inline bodies below appear to be the removed versions and the bare declarations | |||
| // that follow them the replacements -- confirm against the real header. | |||
| // Inline version: asserts profiling is off, stores Timer::record_host() in | |||
| // sm_start_at, then sets sm_profiling. | |||
| static void start_profile() { | |||
| mgb_assert(!sm_profiling); | |||
| sm_start_at = Timer::record_host(); | |||
| sm_profiling = true; | |||
| } | |||
| static void start_profile(); | |||
| // Inline version: asserts profiling is on, then clears sm_profiling. | |||
| static void stop_profile() { | |||
| mgb_assert(sm_profiling); | |||
| sm_profiling = false; | |||
| } | |||
| static void stop_profile(); | |||
| static thread_dict_t get_thread_dict(); | |||
| @@ -0,0 +1,9 @@ | |||
| #pragma once | |||
| #include <string> | |||
| namespace mgb::imperative { | |||
| // Returns the demangled form of a C++ ABI symbol name (e.g. typeid(T).name()). | |||
| // When demangling fails, or the compiler does not define __GNUG__, the input is | |||
| // returned unchanged. | |||
| std::string demangle(std::string mangled); | |||
| }  // namespace mgb::imperative | |||
| @@ -37,6 +37,10 @@ if(MGE_WITH_CUDA) | |||
| list(APPEND LINK_LIBS cudart) | |||
| endif() | |||
| # Append libcupti to LINK_LIBS when MGE_WITH_CUPTI is enabled. | |||
| # NOTE(review): `libcupti` is presumably a target provided by cmake/cupti.cmake -- confirm. | |||
| if(MGE_WITH_CUPTI) | |||
| list(APPEND LINK_LIBS libcupti) | |||
| endif() | |||
| # Append megray to LINK_LIBS when MGE_WITH_DISTRIBUTED is enabled. | |||
| if(MGE_WITH_DISTRIBUTED) | |||
| list(APPEND LINK_LIBS megray) | |||
| endif() | |||
| @@ -61,11 +61,11 @@ echo "Build with ${SDK_NAME}" | |||
| if [ $SDK_NAME == "cu101" ];then | |||
| CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | |||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||
| BUILD_GCC8="ON" | |||
| REQUIR_CUDA_VERSION="10010" | |||
| REQUIR_CUDNN_VERSION="7.6.3" | |||
| REQUIR_TENSORRT_VERSION="6.0.1.5" | |||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||
| BUILD_GCC8="ON" | |||
| REQUIR_CUDA_VERSION="10010" | |||
| REQUIR_CUDNN_VERSION="7.6.3" | |||
| REQUIR_TENSORRT_VERSION="6.0.1.5" | |||
| REQUIR_CUBLAS_VERSION="10.2.1.243" | |||
| elif [ $SDK_NAME == "cu102_JetsonNano" ];then | |||
| @@ -87,6 +87,12 @@ elif [ $SDK_NAME == "cu102_JetsonNano" ];then | |||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
| # On aarch64 only, prepend libcupti.so.10.2 to the colon-separated copy list. | |||
| # "${machine}" is quoted so an empty or unset value compares as "" instead of | |||
| # making `[` abort with "unary operator expected". | |||
| if [ "${machine}" == "aarch64" ];then | |||
| CUDA_COPY_LIB_LIST="\ | |||
| ${CUDA_LIB_DIR}/libcupti.so.10.2:\ | |||
| ${CUDA_COPY_LIB_LIST}" | |||
| fi | |||
| EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " | |||
| elif [ $SDK_NAME == "cu111" ];then | |||
| @@ -118,6 +124,12 @@ elif [ $SDK_NAME == "cu111" ];then | |||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
| # On aarch64 only, prepend libcupti.so.11.1 to the colon-separated copy list. | |||
| # "${machine}" is quoted so an empty or unset value compares as "" instead of | |||
| # making `[` abort with "unary operator expected". | |||
| if [ "${machine}" == "aarch64" ];then | |||
| CUDA_COPY_LIB_LIST="\ | |||
| ${CUDA_LIB_DIR}/libcupti.so.11.1:\ | |||
| ${CUDA_COPY_LIB_LIST}" | |||
| fi | |||
| if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then | |||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " | |||
| else | |||
| @@ -152,9 +164,9 @@ elif [ $SDK_NAME == "cu112" ];then | |||
| -gencode arch=compute_86,code=sm_86 \ | |||
| -gencode arch=compute_86,code=compute_86\" " | |||
| REQUIR_CUDA_VERSION="11020" | |||
| REQUIR_CUDNN_VERSION="8.0.4" | |||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
| REQUIR_CUDA_VERSION="11020" | |||
| REQUIR_CUDNN_VERSION="8.0.4" | |||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
| REQUIR_CUBLAS_VERSION="11.3.1.68" | |||
| elif [ $SDK_NAME == "cpu" ];then | |||
| @@ -35,6 +35,7 @@ | |||
| #cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION | |||
| #cmakedefine01 MGB_IS_DEV | |||
| #cmakedefine01 MGB_CUSTOM_OP | |||
| // 1 when the CMake variable MGB_CUPTI is true (it is set from MGE_WITH_CUPTI), else 0 | |||
| #cmakedefine01 MGB_CUPTI | |||
| // DNN related flags | |||
| // Platform macro's | |||
| #cmakedefine01 MEGDNN_WITH_CUDA | |||