| @@ -39,6 +39,9 @@ option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF) | |||||
| option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) | option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) | ||||
| option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON) | option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON) | ||||
| option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) | option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) | ||||
| option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON) | |||||
| option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF) | |||||
| option(MGE_WITH_CUDNN_SHARED "Build MegEngine with shared CUDNN." OFF) | |||||
| option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) | option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) | ||||
| option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) | option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) | ||||
| option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF) | option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF) | ||||
| @@ -55,6 +58,14 @@ option(MGE_BUILD_SDK "Build load_and_run" ON) | |||||
| option(MGE_INFERENCE_ONLY "Build inference only library." OFF) | option(MGE_INFERENCE_ONLY "Build inference only library." OFF) | ||||
| option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support." ON) | option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support." ON) | ||||
| option(MGE_WITH_ROCM "Enable ROCM support" OFF) | option(MGE_WITH_ROCM "Enable ROCM support" OFF) | ||||
| option(MGE_WITH_LARGE_ARCHIVE "Enable large archive link support." OFF) | |||||
| if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) | |||||
| set(MGE_WITH_ANY_CUDA_STUB ON) | |||||
| else() | |||||
| set(MGE_WITH_ANY_CUDA_STUB OFF) | |||||
| endif() | |||||
| if(NOT ${MGE_BIN_REDUCE} STREQUAL "") | if(NOT ${MGE_BIN_REDUCE} STREQUAL "") | ||||
| message(STATUS "build with BIN REDUCE") | message(STATUS "build with BIN REDUCE") | ||||
| @@ -205,14 +216,24 @@ else() | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if(MGE_WITH_CUDA) | |||||
| include(cmake/cudnn.cmake) | |||||
| if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||||
| message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON") | |||||
| set(MGE_WITH_LARGE_ARCHIVE ON) | |||||
| endif() | |||||
| endif() | |||||
| CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD) | CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD) | ||||
| if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32) | |||||
| if(MGE_WITH_LARGE_ARCHIVE) | |||||
| message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") | |||||
| set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") | |||||
| elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE) | |||||
| message(STATUS "Using GNU gold linker.") | message(STATUS "Using GNU gold linker.") | ||||
| set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") | |||||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||||
| set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||||
| set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||||
| set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") | |||||
| endif() | endif() | ||||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||||
| set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||||
| set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||||
| if(NOT MGE_WITH_JIT) | if(NOT MGE_WITH_JIT) | ||||
| if(MGE_WITH_HALIDE) | if(MGE_WITH_HALIDE) | ||||
| @@ -353,11 +374,28 @@ if(MGE_WITH_CUDA) | |||||
| if(NOT MGE_ENABLE_EXCEPTIONS) | if(NOT MGE_ENABLE_EXCEPTIONS) | ||||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") | ||||
| endif() | endif() | ||||
| if(NOT MGE_CUDA_GENCODE) | if(NOT MGE_CUDA_GENCODE) | ||||
| if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") | if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") | ||||
| set(MEGDNN_THREADS_512 0) | set(MEGDNN_THREADS_512 0) | ||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||||
| if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||||
| message(WARNING "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") | |||||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") | |||||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") | set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") | ||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") | set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") | ||||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | ||||
| @@ -385,7 +423,6 @@ if(MGE_WITH_CUDA) | |||||
| endif() | endif() | ||||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") | ||||
| include(cmake/cudnn.cmake) | |||||
| if(MGE_WITH_TRT) | if(MGE_WITH_TRT) | ||||
| include(cmake/tensorrt.cmake) | include(cmake/tensorrt.cmake) | ||||
| endif() | endif() | ||||
| @@ -394,12 +431,30 @@ if(MGE_WITH_CUDA) | |||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) | list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) | ||||
| message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") | message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") | ||||
| else() | |||||
| if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) | |||||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer myelin_compiler_static myelin_executor_static myelin_pattern_runtime_static myelin_pattern_library_static -Wl,--no-whole-archive) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer -Wl,--no-whole-archive) | |||||
| endif() | |||||
| endif() | |||||
| endif() | |||||
| if("${CUDNN_VERSION}" STREQUAL "7.5.0") | |||||
| if(MSVC OR WIN32) | |||||
| message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | ||||
| list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) | |||||
| else() | else() | ||||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) | |||||
| message(STATUS "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html") | |||||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | |||||
| endif() | endif() | ||||
| else() | else() | ||||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | |||||
| if(MSVC OR WIN32) | |||||
| message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | |||||
| list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS libcudnn) | |||||
| endif() | |||||
| endif() | endif() | ||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) | list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) | ||||
| @@ -447,15 +502,37 @@ if(MGE_WITH_CUDA) | |||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | ||||
| list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) | list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) | ||||
| endif() | endif() | ||||
| list(APPEND MGE_CUDA_LIBS cudart) | |||||
| endif() | |||||
| if(NOT MGE_WITH_CUDA_STUB) | |||||
| if(MSVC OR WIN32) | |||||
| list(APPEND MGE_CUDA_LIBS cuda.lib) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS cuda) | |||||
| endif() | |||||
| endif() | |||||
| if(NOT MGE_WITH_NVRTC_STUB) | |||||
| if(MSVC OR WIN32) | |||||
| list(APPEND MGE_CUDA_LIBS nvrtc.lib) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS nvrtc) | |||||
| endif() | |||||
| endif() | |||||
| if(MGE_WITH_ANY_CUDA_STUB) | |||||
| add_subdirectory(dnn/cuda-stub) | |||||
| list(APPEND MGE_CUDA_LIBS cuda-stub) | |||||
| endif() | endif() | ||||
| add_subdirectory(dnn/cuda-stub) | |||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub) | |||||
| list(APPEND MGE_CUDA_LIBS nvrtc.lib) | |||||
| else() | else() | ||||
| list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) | |||||
| list(APPEND MGE_CUDA_LIBS nvToolsExt) | |||||
| endif() | endif() | ||||
| set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") | |||||
| set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") | |||||
| endif() | endif() | ||||
| if(MGE_WITH_CAMBRICON) | if(MGE_WITH_CAMBRICON) | ||||
| @@ -800,6 +877,9 @@ if(TARGET _imperative_rt) | |||||
| COMMAND ${CMAKE_COMMAND} -E create_symlink | COMMAND ${CMAKE_COMMAND} -E create_symlink | ||||
| ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | ||||
| COMMAND ${CMAKE_COMMAND} -E create_symlink | |||||
| ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py | |||||
| ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py | |||||
| DEPENDS _imperative_rt | DEPENDS _imperative_rt | ||||
| VERBATIM | VERBATIM | ||||
| ) | ) | ||||
| @@ -863,3 +943,9 @@ if(MGE_WITH_JIT_MLIR) | |||||
| add_subdirectory(tools/mlir/mgb-opt) | add_subdirectory(tools/mlir/mgb-opt) | ||||
| add_subdirectory(tools/mlir/mgb-file-check) | add_subdirectory(tools/mlir/mgb-file-check) | ||||
| endif() | endif() | ||||
| if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||||
| message(WARNING "Statically linking CUDNN8 for many SM architectures is not workable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75\" ") | |||||
| endif() | |||||
| @@ -11,7 +11,7 @@ if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | |||||
| set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | ||||
| endif() | endif() | ||||
| if(MGE_CUDA_USE_STATIC) | |||||
| if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | |||||
| find_library(CUDNN_LIBRARY | find_library(CUDNN_LIBRARY | ||||
| NAMES libcudnn_static.a cudnn.lib | NAMES libcudnn_static.a cudnn.lib | ||||
| PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} | PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} | ||||
| @@ -42,7 +42,12 @@ if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND") | |||||
| message(FATAL_ERROR "Can not find CuDNN Library") | message(FATAL_ERROR "Can not find CuDNN Library") | ||||
| endif() | endif() | ||||
| file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) | |||||
| if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h) | |||||
| file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) | |||||
| else() | |||||
| file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) | |||||
| endif() | |||||
| string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" | string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" | ||||
| CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | ||||
| string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" | string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" | ||||
| @@ -55,7 +60,9 @@ string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" | |||||
| CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | ||||
| string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" | string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" | ||||
| CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") | CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") | ||||
| set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}) | |||||
| set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION}) | |||||
| if(MGE_CUDA_USE_STATIC) | if(MGE_CUDA_USE_STATIC) | ||||
| add_library(libcudnn STATIC IMPORTED) | add_library(libcudnn STATIC IMPORTED) | ||||
| @@ -1,11 +1,21 @@ | |||||
| file (GLOB_RECURSE SOURCES src/*.cpp) | |||||
| file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp) | |||||
| file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) | |||||
| if(MGE_WITH_CUDA_STUB) | |||||
| list(APPEND STUB_SRC ${CUDA_STUB}) | |||||
| endif() | |||||
| if(MGE_WITH_NVRTC_STUB) | |||||
| list(APPEND STUB_SRC ${NVRTC_STUB}) | |||||
| endif() | |||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| add_library (cuda-stub STATIC ${SOURCES}) | |||||
| add_library (cuda-stub STATIC ${STUB_SRC}) | |||||
| else() | else() | ||||
| add_library (cuda-stub SHARED ${SOURCES}) | |||||
| add_library (cuda-stub SHARED ${STUB_SRC}) | |||||
| endif() | endif() | ||||
| set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) | |||||
| set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub) | |||||
| target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) | target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) | ||||
| if (MSVC OR WIN32) | if (MSVC OR WIN32) | ||||
| target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) | target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) | ||||
| @@ -0,0 +1,109 @@ | |||||
| #if defined(_WIN32) | |||||
| #include <windows.h> | |||||
| #define RTLD_LAZY 0 | |||||
| static void* dlopen(const char* file, int) { | |||||
| return static_cast<void*>(LoadLibraryA(file)); | |||||
| } | |||||
| static char* dlerror() { | |||||
| const char* errmsg = "dlerror not available on windows"; | |||||
| return const_cast<char*>(errmsg); | |||||
| } | |||||
| static void* dlsym(void* handle, const char* name) { | |||||
| FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||||
| return reinterpret_cast<void*>(symbol); | |||||
| } | |||||
| #else | |||||
| #include <dlfcn.h> | |||||
| #include <unistd.h> | |||||
| #endif | |||||
| #include <cstdlib> | |||||
| #include <sstream> | |||||
| #include <string> | |||||
| #include <vector> | |||||
| static std::vector<std::string> split_string(const std::string& s, char delim) { | |||||
| std::vector<std::string> elems; | |||||
| std::stringstream ss(s); | |||||
| std::string item; | |||||
| while (std::getline(ss, item, delim)) { | |||||
| elems.push_back(item); | |||||
| } | |||||
| return elems; | |||||
| } | |||||
| static std::vector<std::string> get_env_dir(const char* env_name) { | |||||
| const char* env_p = std::getenv(env_name); | |||||
| std::vector<std::string> env_dir; | |||||
| if (env_p) { | |||||
| env_dir = split_string(env_p, ':'); | |||||
| } | |||||
| return env_dir; | |||||
| } | |||||
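`get_env_dir()` treats the variable as a colon-separated directory list, mirroring `LD_LIBRARY_PATH` semantics. A small usage sketch, assuming the `get_env_dir` helper above is in scope and using hypothetical paths:

```cpp
#include <cstdio>
#include <cstdlib>

// Assumes get_env_dir() from dlopen_helper.h above; the paths are made up.
int main() {
    setenv("CUDA_TK_PATH", "/usr/local/cuda/lib64:/opt/cuda/lib64", 1);
    for (const auto& dir : get_env_dir("CUDA_TK_PATH"))
        std::printf("%s\n", dir.c_str());  // prints the two entries in order
    return 0;
}
```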
| static void* try_open_handle(std::vector<std::string> dir_vec, | |||||
| std::string default_so_name) { | |||||
| void* handle = nullptr; | |||||
| for (auto& tk_path : dir_vec) { | |||||
| handle = dlopen((tk_path + "/" + default_so_name).c_str(), RTLD_LAZY); | |||||
| if (handle) { | |||||
| break; | |||||
| } | |||||
| } | |||||
| return handle; | |||||
| } | |||||
| static void* try_open_handle(const char** so_vec, int nr_so) { | |||||
| void* handle = nullptr; | |||||
| for (int i = 0; i < nr_so; ++i) { | |||||
| handle = dlopen(so_vec[i], RTLD_LAZY); | |||||
| if (handle) { | |||||
| break; | |||||
| } | |||||
| } | |||||
| return handle; | |||||
| } | |||||
| static void* get_library_handle() { | |||||
| std::vector<std::string> cuda_tk_dir = get_env_dir("CUDA_TK_PATH"); | |||||
| std::vector<std::string> ld_dir = get_env_dir("LD_LIBRARY_PATH"); | |||||
| void* handle = nullptr; | |||||
| if (!handle) { | |||||
| handle = try_open_handle(ld_dir, default_so_name); | |||||
| } | |||||
| if (!handle) { | |||||
| handle = try_open_handle(cuda_tk_dir, default_so_name); | |||||
| } | |||||
| if (!handle) { | |||||
| handle = try_open_handle(default_so_paths, | |||||
| sizeof(default_so_paths) / sizeof(char*)); | |||||
| } | |||||
| if (!handle) { | |||||
| handle = try_open_handle(extra_so_paths, | |||||
| sizeof(extra_so_paths) / sizeof(char*)); | |||||
| } | |||||
| if (!handle) { | |||||
| LOGE("Failed to load %s API library", g_default_api_name); | |||||
| return nullptr; | |||||
| } | |||||
| return handle; | |||||
| } | |||||
| static void log_failed_load(int func_idx) { | |||||
| LOGE("failed to load %s func: %s", g_default_api_name, | |||||
| g_func_name[func_idx]); | |||||
| } | |||||
| static void* resolve_library_func(void* handle, const char* func) { | |||||
| if (!handle) { | |||||
| LOGE("%s handle should not be nullptr!", g_default_api_name); | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = dlsym(handle, func); | |||||
| if (!ret) { | |||||
| LOGE("failed to load %s func: %s", g_default_api_name, func); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
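Taken together, `get_library_handle()` establishes the search order — directories from `LD_LIBRARY_PATH`, then `CUDA_TK_PATH`, then the platform default name, then the hard-coded fallback paths — and `resolve_library_func()` wraps `dlsym` with error logging. A minimal standalone sketch of the same dlopen/dlsym flow (`libfoo.so` and `foo_version` are hypothetical, not part of this change):

```cpp
#include <dlfcn.h>
#include <cstdio>

// Build with: g++ sketch.cpp -ldl
int main() {
    // Lazy binding, as in try_open_handle(); failure yields nullptr.
    void* handle = dlopen("libfoo.so", RTLD_LAZY);
    if (!handle) {
        std::fprintf(stderr, "err: %s\n", dlerror());
        return 1;
    }
    using foo_version_t = int (*)();
    auto fn = reinterpret_cast<foo_version_t>(dlsym(handle, "foo_version"));
    if (!fn) {
        std::fprintf(stderr, "err: %s\n", dlerror());
        return 1;
    }
    std::printf("foo version: %d\n", fn());
    dlclose(handle);
    return 0;
}
```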
| @@ -3,36 +3,14 @@ | |||||
| #include <cstdio> | #include <cstdio> | ||||
| #define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | #define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | ||||
| extern "C" { | extern "C" { | ||||
| #include <cuda.h> | |||||
| #include "cuda.h" | |||||
| } | } | ||||
| #include <cudaProfiler.h> | |||||
| #include "cudaProfiler.h" | |||||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | ||||
| #if defined(_WIN32) | |||||
| #include <windows.h> | |||||
| #define RTLD_LAZY 0 | |||||
| static void* dlopen(const char* file, int) { | |||||
| return static_cast<void*>(LoadLibraryA(file)); | |||||
| } | |||||
| static void* dlerror() { | |||||
| const char* errmsg = "dlerror not aviable in windows"; | |||||
| return const_cast<char*>(errmsg); | |||||
| } | |||||
| static void* dlsym(void* handle, const char* name) { | |||||
| FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||||
| return reinterpret_cast<void*>(symbol); | |||||
| } | |||||
| #else | |||||
| #include <dlfcn.h> | |||||
| #include <unistd.h> | |||||
| #endif | |||||
| static void log_failed_load(int func_idx); | static void log_failed_load(int func_idx); | ||||
| namespace { | namespace { | ||||
| template <typename T> | template <typename T> | ||||
| @@ -42,68 +20,63 @@ CUresult on_init_failed(int func_idx) { | |||||
| log_failed_load(func_idx); | log_failed_load(func_idx); | ||||
| return CUDA_ERROR_UNKNOWN; | return CUDA_ERROR_UNKNOWN; | ||||
| } | } | ||||
| } | |||||
| } // namespace | |||||
| #define _WRAPLIB_API_CALL CUDAAPI | #define _WRAPLIB_API_CALL CUDAAPI | ||||
| #define _WRAPLIB_CALLBACK CUDA_CB | #define _WRAPLIB_CALLBACK CUDA_CB | ||||
| #include "./libcuda-wrap.h" | |||||
| #if CUDA_VERSION == 10010 | |||||
| #include "./libcuda-wrap_10.1.h" | |||||
| #elif CUDA_VERSION == 10020 | |||||
| #include "./libcuda-wrap_10.2.h" | |||||
| #elif CUDA_VERSION == 11010 | |||||
| #include "./libcuda-wrap_11.1.h" | |||||
| #elif CUDA_VERSION == 11020 | |||||
| #include "./libcuda-wrap_11.2.h" | |||||
| #else | |||||
| #error "cuda stub not support this cuda version, you can close cuda stub to passby" | |||||
| #endif | |||||
| #undef _WRAPLIB_CALLBACK | #undef _WRAPLIB_CALLBACK | ||||
| #undef _WRAPLIB_API_CALL | #undef _WRAPLIB_API_CALL | ||||
| static const char* default_so_name = | |||||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||||
| "nvcuda.dll"; | |||||
| #elif defined(__APPLE__) || defined(__MACOSX) | |||||
| "libcuda.dylib"; | |||||
| #else | |||||
| "libcuda.so.1"; | |||||
| #endif | |||||
| // Harvested from cuda_drvapi_dynlink.c | // Harvested from cuda_drvapi_dynlink.c | ||||
| static const char* default_so_paths[] = { | static const char* default_so_paths[] = { | ||||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | ||||
| "nvcuda.dll", | |||||
| #elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX) | |||||
| "nvcuda.dll", | |||||
| #elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \ | |||||
| defined(__MACOSX) | |||||
| #if defined(__APPLE__) || defined(__MACOSX) | #if defined(__APPLE__) || defined(__MACOSX) | ||||
| "/usr/local/cuda/lib/libcuda.dylib", | |||||
| "/usr/local/cuda/lib/libcuda.dylib", | |||||
| #elif defined(__ANDROID__) | #elif defined(__ANDROID__) | ||||
| #if defined (__aarch64__) | |||||
| "/system/vendor/lib64/libcuda.so", | |||||
| #if defined(__aarch64__) | |||||
| "/system/vendor/lib64/libcuda.so", | |||||
| #elif defined(__arm__) | #elif defined(__arm__) | ||||
| "/system/vendor/lib/libcuda.so", | |||||
| "/system/vendor/lib/libcuda.so", | |||||
| #endif | #endif | ||||
| #else | #else | ||||
| "libcuda.so.1", | |||||
| // In case some users does not have correct search path configured in | |||||
| // /etc/ld.so.conf | |||||
| "/usr/lib/x86_64-linux-gnu/libcuda.so", | |||||
| "/usr/local/nvidia/lib64/libcuda.so", | |||||
| "libcuda.so.1", | |||||
| #endif | #endif | ||||
| #else | #else | ||||
| #error "Unknown platform" | #error "Unknown platform" | ||||
| #endif | #endif | ||||
| }; | }; | ||||
| static void* get_library_handle() { | |||||
| void* handle = nullptr; | |||||
| for (size_t i = 0; i < (sizeof(default_so_paths) / sizeof(char*)); i++) { | |||||
| handle = dlopen(default_so_paths[i], RTLD_LAZY); | |||||
| if (handle) { | |||||
| break; | |||||
| } | |||||
| } | |||||
| if (!handle) { | |||||
| LOGE("Failed to load CUDA Driver API library"); | |||||
| return nullptr; | |||||
| } | |||||
| return handle; | |||||
| } | |||||
| static void log_failed_load(int func_idx) { | |||||
| LOGE("failed to load cuda func: %s", g_func_name[func_idx]); | |||||
| } | |||||
| static const char* extra_so_paths[] = { | |||||
| "/usr/lib/x86_64-linux-gnu/libcuda.so", | |||||
| "/usr/local/nvidia/lib64/libcuda.so", | |||||
| }; | |||||
| static void* resolve_library_func(void* handle, const char* func) { | |||||
| if (!handle) { | |||||
| LOGE("handle should not be nullptr!"); | |||||
| return nullptr; | |||||
| } | |||||
| auto ret = dlsym(handle, func); | |||||
| if (!ret) { | |||||
| LOGE("failed to load cuda func: %s", func); | |||||
| } | |||||
| return ret; | |||||
| } | |||||
| static const char* g_default_api_name = "cuda"; | |||||
| #include "./dlopen_helper.h" | |||||
| @@ -0,0 +1,367 @@ | |||||
| // generated by wraplib.py | |||||
| // --- begin functions to be implemented | |||||
| #ifndef _WRAPLIB_API_CALL | |||||
| #define _WRAPLIB_API_CALL | |||||
| #endif | |||||
| #ifndef _WRAPLIB_CALLBACK | |||||
| #define _WRAPLIB_CALLBACK | |||||
| #endif | |||||
| #ifndef ON_ENTRY | |||||
| #define ON_ENTRY(x) | |||||
| #endif | |||||
| static void* get_library_handle(); | |||||
| static void* resolve_library_func(void*, const char*); | |||||
| namespace { | |||||
| template <typename T> | |||||
| T on_init_failed(int func_idx); | |||||
| } | |||||
| // --- end functions to be implemented | |||||
| #include <cstddef> | |||||
| #include <mutex> | |||||
| extern "C" { | |||||
| const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0, | |||||
| const char* arg1, | |||||
| const char* arg2, int arg3, | |||||
| const char* const* arg4, | |||||
| const char* const* arg5); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1, | |||||
| const char* const* arg2); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0, | |||||
| size_t* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0, | |||||
| size_t* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, char* arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0, | |||||
| const char* const arg1); | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0, | |||||
| const char* const arg1, | |||||
| const char** arg2); | |||||
| } | |||||
| static void load_library(); | |||||
| static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_init( | |||||
| nvrtcResult arg0) { | |||||
| load_library(); | |||||
| return nvrtcGetErrorString(arg0); | |||||
| } | |||||
| static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_error(nvrtcResult) { | |||||
| return on_init_failed<const char*>(0); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_init(int* arg0, int* arg1) { | |||||
| load_library(); | |||||
| return nvrtcVersion(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_error(int*, int*) { | |||||
| return on_init_failed<nvrtcResult>(1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_init(int* arg0) { | |||||
| load_library(); | |||||
| return nvrtcGetNumSupportedArchs(arg0); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_error(int*) { | |||||
| return on_init_failed<nvrtcResult>(2); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_init(int* arg0) { | |||||
| load_library(); | |||||
| return nvrtcGetSupportedArchs(arg0); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_error(int*) { | |||||
| return on_init_failed<nvrtcResult>(3); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram_init( | |||||
| nvrtcProgram* arg0, const char* arg1, const char* arg2, int arg3, | |||||
| const char* const* arg4, const char* const* arg5) { | |||||
| load_library(); | |||||
| return nvrtcCreateProgram(arg0, arg1, arg2, arg3, arg4, arg5); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcCreateProgram_error(nvrtcProgram*, const char*, const char*, int, | |||||
| const char* const*, const char* const*) { | |||||
| return on_init_failed<nvrtcResult>(4); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcDestroyProgram_init(nvrtcProgram* arg0) { | |||||
| load_library(); | |||||
| return nvrtcDestroyProgram(arg0); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram_error(nvrtcProgram*) { | |||||
| return on_init_failed<nvrtcResult>(5); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcCompileProgram_init(nvrtcProgram arg0, int arg1, const char* const* arg2) { | |||||
| load_library(); | |||||
| return nvrtcCompileProgram(arg0, arg1, arg2); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcCompileProgram_error(nvrtcProgram, int, const char* const*) { | |||||
| return on_init_failed<nvrtcResult>(6); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_init(nvrtcProgram arg0, | |||||
| size_t* arg1) { | |||||
| load_library(); | |||||
| return nvrtcGetPTXSize(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_error(nvrtcProgram, | |||||
| size_t*) { | |||||
| return on_init_failed<nvrtcResult>(7); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_init(nvrtcProgram arg0, | |||||
| char* arg1) { | |||||
| load_library(); | |||||
| return nvrtcGetPTX(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_error(nvrtcProgram, char*) { | |||||
| return on_init_failed<nvrtcResult>(8); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_init(nvrtcProgram arg0, | |||||
| size_t* arg1) { | |||||
| load_library(); | |||||
| return nvrtcGetCUBINSize(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_error(nvrtcProgram, | |||||
| size_t*) { | |||||
| return on_init_failed<nvrtcResult>(9); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_init(nvrtcProgram arg0, | |||||
| char* arg1) { | |||||
| load_library(); | |||||
| return nvrtcGetCUBIN(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_error(nvrtcProgram, char*) { | |||||
| return on_init_failed<nvrtcResult>(10); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcGetProgramLogSize_init(nvrtcProgram arg0, size_t* arg1) { | |||||
| load_library(); | |||||
| return nvrtcGetProgramLogSize(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize_error(nvrtcProgram, | |||||
| size_t*) { | |||||
| return on_init_failed<nvrtcResult>(11); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_init(nvrtcProgram arg0, | |||||
| char* arg1) { | |||||
| load_library(); | |||||
| return nvrtcGetProgramLog(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_error(nvrtcProgram, | |||||
| char*) { | |||||
| return on_init_failed<nvrtcResult>(12); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcAddNameExpression_init(nvrtcProgram arg0, const char* const arg1) { | |||||
| load_library(); | |||||
| return nvrtcAddNameExpression(arg0, arg1); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcAddNameExpression_error(nvrtcProgram, const char* const) { | |||||
| return on_init_failed<nvrtcResult>(13); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName_init( | |||||
| nvrtcProgram arg0, const char* const arg1, const char** arg2) { | |||||
| load_library(); | |||||
| return nvrtcGetLoweredName(arg0, arg1, arg2); | |||||
| } | |||||
| static nvrtcResult _WRAPLIB_API_CALL | |||||
| nvrtcGetLoweredName_error(nvrtcProgram, const char* const, const char**) { | |||||
| return on_init_failed<nvrtcResult>(14); | |||||
| } | |||||
| static constexpr size_t NR_FUNC = 15; | |||||
| static void* g_func_table[NR_FUNC] = {(void*)(&nvrtcGetErrorString_init), | |||||
| (void*)(&nvrtcVersion_init), | |||||
| (void*)(&nvrtcGetNumSupportedArchs_init), | |||||
| (void*)(&nvrtcGetSupportedArchs_init), | |||||
| (void*)(&nvrtcCreateProgram_init), | |||||
| (void*)(&nvrtcDestroyProgram_init), | |||||
| (void*)(&nvrtcCompileProgram_init), | |||||
| (void*)(&nvrtcGetPTXSize_init), | |||||
| (void*)(&nvrtcGetPTX_init), | |||||
| (void*)(&nvrtcGetCUBINSize_init), | |||||
| (void*)(&nvrtcGetCUBIN_init), | |||||
| (void*)(&nvrtcGetProgramLogSize_init), | |||||
| (void*)(&nvrtcGetProgramLog_init), | |||||
| (void*)(&nvrtcAddNameExpression_init), | |||||
| (void*)(&nvrtcGetLoweredName_init)}; | |||||
| static void* g_func_table_error[NR_FUNC] = { | |||||
| (void*)(&nvrtcGetErrorString_error), | |||||
| (void*)(&nvrtcVersion_error), | |||||
| (void*)(&nvrtcGetNumSupportedArchs_error), | |||||
| (void*)(&nvrtcGetSupportedArchs_error), | |||||
| (void*)(&nvrtcCreateProgram_error), | |||||
| (void*)(&nvrtcDestroyProgram_error), | |||||
| (void*)(&nvrtcCompileProgram_error), | |||||
| (void*)(&nvrtcGetPTXSize_error), | |||||
| (void*)(&nvrtcGetPTX_error), | |||||
| (void*)(&nvrtcGetCUBINSize_error), | |||||
| (void*)(&nvrtcGetCUBIN_error), | |||||
| (void*)(&nvrtcGetProgramLogSize_error), | |||||
| (void*)(&nvrtcGetProgramLog_error), | |||||
| (void*)(&nvrtcAddNameExpression_error), | |||||
| (void*)(&nvrtcGetLoweredName_error)}; | |||||
| static const char* const g_func_name[NR_FUNC] = {"nvrtcGetErrorString", | |||||
| "nvrtcVersion", | |||||
| "nvrtcGetNumSupportedArchs", | |||||
| "nvrtcGetSupportedArchs", | |||||
| "nvrtcCreateProgram", | |||||
| "nvrtcDestroyProgram", | |||||
| "nvrtcCompileProgram", | |||||
| "nvrtcGetPTXSize", | |||||
| "nvrtcGetPTX", | |||||
| "nvrtcGetCUBINSize", | |||||
| "nvrtcGetCUBIN", | |||||
| "nvrtcGetProgramLogSize", | |||||
| "nvrtcGetProgramLog", | |||||
| "nvrtcAddNameExpression", | |||||
| "nvrtcGetLoweredName"}; | |||||
| static void load_library() { | |||||
| static bool done = false; | |||||
| static std::mutex mtx; | |||||
| std::lock_guard<std::mutex> lg{mtx}; | |||||
| if (done) | |||||
| return; | |||||
| void* handle = get_library_handle(); | |||||
| for (size_t i = 0; i < NR_FUNC; ++i) { | |||||
| void* func; | |||||
| if (!handle) { | |||||
| func = nullptr; | |||||
| } else { | |||||
| func = resolve_library_func(handle, g_func_name[i]); | |||||
| } | |||||
| if (!func) { | |||||
| func = g_func_table_error[i]; | |||||
| } | |||||
| __atomic_store_n(g_func_table + i, func, __ATOMIC_RELAXED); | |||||
| } | |||||
| done = true; | |||||
| } | |||||
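The generated file is a lazy-binding jump table: each public entry point initially routes through its `*_init` trampoline, whose first call runs `load_library()` under a mutex, replaces every `g_func_table` slot with the `dlsym`-resolved address (or the `*_error` fallback that logs and returns an error code), and then re-dispatches. A reduced sketch of the mechanism with one hypothetical function, `bar_add`, standing in for the fifteen nvrtc entries:

```cpp
#include <cstdio>
#include <mutex>

extern "C" int bar_add(int a, int b);              // the exported symbol

static int bar_add_error(int, int) { return -1; }  // installed if loading fails
static int bar_add_init(int a, int b);             // first-call trampoline

static int (*g_bar_add)(int, int) = &bar_add_init;

static void load_library() {
    static std::mutex mtx;
    static bool done = false;
    std::lock_guard<std::mutex> lg{mtx};
    if (done) return;
    // The real code dlopen()s the target library and dlsym()s each name
    // here; this sketch pretends resolution failed.
    g_bar_add = &bar_add_error;
    done = true;
}

static int bar_add_init(int a, int b) {
    load_library();          // resolve once, then go through the table
    return g_bar_add(a, b);
}

extern "C" int bar_add(int a, int b) { return g_bar_add(a, b); }

int main() { std::printf("bar_add -> %d\n", bar_add(1, 2)); }  // prints -1
```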
| const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0) { | |||||
| typedef const char*(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcResult); | |||||
| ON_ENTRY(nvrtcGetErrorString); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[0]); | |||||
| return f(arg0); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*, int*); | |||||
| ON_ENTRY(nvrtcVersion); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[1]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*); | |||||
| ON_ENTRY(nvrtcGetNumSupportedArchs); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[2]); | |||||
| return f(arg0); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*); | |||||
| ON_ENTRY(nvrtcGetSupportedArchs); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[3]); | |||||
| return f(arg0); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0, | |||||
| const char* arg1, | |||||
| const char* arg2, int arg3, | |||||
| const char* const* arg4, | |||||
| const char* const* arg5) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)( | |||||
| nvrtcProgram*, const char*, const char*, int, const char* const*, | |||||
| const char* const*); | |||||
| ON_ENTRY(nvrtcCreateProgram); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[4]); | |||||
| return f(arg0, arg1, arg2, arg3, arg4, arg5); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram*); | |||||
| ON_ENTRY(nvrtcDestroyProgram); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[5]); | |||||
| return f(arg0); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1, | |||||
| const char* const* arg2) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, int, | |||||
| const char* const*); | |||||
| ON_ENTRY(nvrtcCompileProgram); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[6]); | |||||
| return f(arg0, arg1, arg2); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||||
| ON_ENTRY(nvrtcGetPTXSize); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[7]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||||
| ON_ENTRY(nvrtcGetPTX); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[8]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0, | |||||
| size_t* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||||
| ON_ENTRY(nvrtcGetCUBINSize); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[9]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||||
| ON_ENTRY(nvrtcGetCUBIN); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[10]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0, | |||||
| size_t* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||||
| ON_ENTRY(nvrtcGetProgramLogSize); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[11]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, | |||||
| char* arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||||
| ON_ENTRY(nvrtcGetProgramLog); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[12]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0, | |||||
| const char* const arg1) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, | |||||
| const char* const); | |||||
| ON_ENTRY(nvrtcAddNameExpression); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[13]); | |||||
| return f(arg0, arg1); | |||||
| } | |||||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0, | |||||
| const char* const arg1, | |||||
| const char** arg2) { | |||||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)( | |||||
| nvrtcProgram, const char* const, const char**); | |||||
| ON_ENTRY(nvrtcGetLoweredName); | |||||
| f_ptr_t f = (f_ptr_t)(g_func_table[14]); | |||||
| return f(arg0, arg1, arg2); | |||||
| } | |||||
| @@ -0,0 +1,75 @@ | |||||
| /** | |||||
| * \file dnn/cuda-stub/src/libnvrtc.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma GCC visibility push(default) | |||||
| #include <cstdio> | |||||
| #define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | |||||
| #include "./nvrtc_type.h" | |||||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||||
| static void log_failed_load(int func_idx); | |||||
| namespace { | |||||
| template <typename T> | |||||
| T on_init_failed(int func_idx); | |||||
| template <> | |||||
| nvrtcResult on_init_failed(int func_idx) { | |||||
| log_failed_load(func_idx); | |||||
| return NVRTC_ERROR_INTERNAL_ERROR; | |||||
| } | |||||
| template <> | |||||
| const char* on_init_failed(int func_idx) { | |||||
| log_failed_load(func_idx); | |||||
| return "load lib failed"; | |||||
| } | |||||
| } // namespace | |||||
| #include "./libnvrtc-wrap.h" | |||||
| static const char* default_so_name = | |||||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||||
| "nvrtc.dll"; | |||||
| #elif defined(__APPLE__) || defined(__MACOSX) | |||||
| "libnvrtc.dylib"; | |||||
| #else | |||||
| "libnvrtc.so"; | |||||
| #endif | |||||
| static const char* default_so_paths[] = { | |||||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||||
| "nvrtc.dll", | |||||
| #elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \ | |||||
| defined(__MACOSX) | |||||
| #if defined(__APPLE__) || defined(__MACOSX) | |||||
| "/usr/local/cuda/lib/libnvrtc.dylib", | |||||
| #elif defined(__ANDROID__) | |||||
| #if defined(__aarch64__) | |||||
| "/system/vendor/lib64/libnvrtc.so", | |||||
| #elif defined(__arm__) | |||||
| "/system/vendor/lib/libnvrtc.so", | |||||
| #endif | |||||
| #else | |||||
| "libnvrtc.so", | |||||
| // In case some users do not have the correct search path configured in | |||||
| // /etc/ld.so.conf | |||||
| "/usr/lib/x86_64-linux-gnu/libnvrtc.so", | |||||
| "/usr/local/nvidia/lib64/libnvrtc.so", | |||||
| "/usr/local/cuda/lib64/libnvrtc.so", | |||||
| #endif | |||||
| #else | |||||
| #error "Unknown platform" | |||||
| #endif | |||||
| }; | |||||
| static const char* extra_so_paths[] = {}; | |||||
| static const char* g_default_api_name = "nvrtc"; | |||||
| #include "./dlopen_helper.h" | |||||
| @@ -0,0 +1,27 @@ | |||||
| #pragma once | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif /* __cplusplus */ | |||||
| #include <stdlib.h> | |||||
| typedef enum { | |||||
| NVRTC_SUCCESS = 0, | |||||
| NVRTC_ERROR_OUT_OF_MEMORY = 1, | |||||
| NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, | |||||
| NVRTC_ERROR_INVALID_INPUT = 3, | |||||
| NVRTC_ERROR_INVALID_PROGRAM = 4, | |||||
| NVRTC_ERROR_INVALID_OPTION = 5, | |||||
| NVRTC_ERROR_COMPILATION = 6, | |||||
| NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, | |||||
| NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, | |||||
| NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, | |||||
| NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, | |||||
| NVRTC_ERROR_INTERNAL_ERROR = 11 | |||||
| } nvrtcResult; | |||||
| typedef struct _nvrtcProgram *nvrtcProgram; | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif /* __cplusplus */ | |||||
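With the stub in place, callers keep using the ordinary NVRTC API; a real `libnvrtc` is only resolved on first use. A hedged usage sketch — it assumes the real `nvrtc.h` header, links against `cuda-stub` rather than `nvrtc`, and picks `compute_61` arbitrarily:

```cpp
#include <cstdio>
#include <string>
#include <nvrtc.h>

int main() {
    const char* src = "__global__ void noop() {}";
    nvrtcProgram prog;
    if (nvrtcCreateProgram(&prog, src, "noop.cu", 0, nullptr, nullptr) !=
        NVRTC_SUCCESS)
        return 1;
    const char* opts[] = {"--gpu-architecture=compute_61"};
    nvrtcResult res = nvrtcCompileProgram(prog, 1, opts);
    if (res == NVRTC_SUCCESS) {
        size_t n = 0;
        nvrtcGetPTXSize(prog, &n);   // n includes the trailing '\0'
        std::string ptx(n, '\0');
        nvrtcGetPTX(prog, &ptx[0]);
        std::printf("compiled %zu bytes of PTX\n", n);
    } else {
        std::printf("nvrtc error: %s\n", nvrtcGetErrorString(res));
    }
    nvrtcDestroyProgram(&prog);
    return res == NVRTC_SUCCESS ? 0 : 1;
}
```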
| @@ -26,6 +26,7 @@ static inline CUBLASLTMatmulDesc::SizeArgs from_local_size_args( | |||||
| return {handle, transA, transB, | return {handle, transA, transB, | ||||
| args.layout_a, args.layout_b, args.layout_c}; | args.layout_a, args.layout_b, args.layout_c}; | ||||
| } | } | ||||
| bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | ||||
| const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
| auto cublasLt_args = from_local_size_args(args); | auto cublasLt_args = from_local_size_args(args); | ||||
| @@ -35,6 +36,7 @@ bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | |||||
| .is_available(cublasLt_args, INT_MAX); | .is_available(cublasLt_args, INT_MAX); | ||||
| return res; | return res; | ||||
| } | } | ||||
| size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | ||||
| const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
| auto cublasLt_args = from_local_size_args(args); | auto cublasLt_args = from_local_size_args(args); | ||||
| @@ -43,6 +45,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | |||||
| desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo); | desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo); | ||||
| return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes(); | return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes(); | ||||
| } | } | ||||
| void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | ||||
| const ExecArgs& args) const { | const ExecArgs& args) const { | ||||
| auto cublasLt_args = from_local_size_args(args); | auto cublasLt_args = from_local_size_args(args); | ||||
| @@ -89,6 +92,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||||
| desc.layout_c, &algo, ws_bundle.get(0), | desc.layout_c, &algo, ws_bundle.get(0), | ||||
| ws_bundle.get_size(0), stream)); | ws_bundle.get_size(0), stream)); | ||||
| }; | }; | ||||
| auto batched_igemm = [&]() { | auto batched_igemm = [&]() { | ||||
| auto zero = handle->zero_device(); | auto zero = handle->zero_device(); | ||||
| auto one = handle->one_device(); | auto one = handle->one_device(); | ||||
| @@ -133,6 +137,18 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||||
| }; | }; | ||||
| ws_bundle.set(args.workspace.raw_ptr); | ws_bundle.set(args.workspace.raw_ptr); | ||||
| #if CUDA_VERSION >= 11000 | |||||
| if (desc.dt_compute == CUBLAS_COMPUTE_32I) { | |||||
| batched_igemm(); | |||||
| } else if (desc.dt_compute == CUBLAS_COMPUTE_16F) { | |||||
| batched_hgemm(); | |||||
| } else if (desc.dt_compute == CUBLAS_COMPUTE_32F) { | |||||
| batched_sgemm(); | |||||
| } else { | |||||
| megdnn_throw( | |||||
| megdnn_mangle("compute_type must be int32/float16/float32")); | |||||
| } | |||||
| #else | |||||
| if (desc.dt_compute == CUDA_R_32I) { | if (desc.dt_compute == CUDA_R_32I) { | ||||
| batched_igemm(); | batched_igemm(); | ||||
| } else if (desc.dt_compute == CUDA_R_16F) { | } else if (desc.dt_compute == CUDA_R_16F) { | ||||
| @@ -143,5 +159,6 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||||
| megdnn_throw( | megdnn_throw( | ||||
| megdnn_mangle("compute_type must be int32/float16/float32")); | megdnn_mangle("compute_type must be int32/float16/float32")); | ||||
| } | } | ||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -156,6 +156,9 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const { | |||||
| case param::ConvBias::NonlineMode::IDENTITY: | case param::ConvBias::NonlineMode::IDENTITY: | ||||
| nonlinear_mode_str = "IDENTITY"; | nonlinear_mode_str = "IDENTITY"; | ||||
| break; | break; | ||||
| case param::ConvBias::NonlineMode::H_SWISH: | |||||
| nonlinear_mode_str = "H_SWISH"; | |||||
| break; | |||||
| default: | default: | ||||
| megdnn_throw("invalid conv bias nonlinear mode"); | megdnn_throw("invalid conv bias nonlinear mode"); | ||||
| } | } | ||||
| @@ -165,6 +165,23 @@ void TensorDesc::set(const TensorLayout& layout, | |||||
| } | } | ||||
| } | } | ||||
| std::string TensorDesc::to_string() { | |||||
| cudnnDataType_t data_type; | |||||
| int n; | |||||
| int c; | |||||
| int h; | |||||
| int w; | |||||
| int n_stride; | |||||
| int c_stride; | |||||
| int h_stride; | |||||
| int w_stride; | |||||
| cudnn_check(cudnnGetTensor4dDescriptor(desc, &data_type, &n, &c, &h, &w, | |||||
| &n_stride, &c_stride, &h_stride, | |||||
| &w_stride)); | |||||
| return ssprintf("<dtype_%d, %d,%d,%d,%d(%d,%d,%d,%d)>", data_type, n, c, h, | |||||
| w, n_stride, c_stride, h_stride, w_stride); | |||||
| } | |||||
| template <typename Param> | template <typename Param> | ||||
| FilterDesc<Param>::FilterDesc() { | FilterDesc<Param>::FilterDesc() { | ||||
| cudnn_check(cudnnCreateFilterDescriptor(&desc)); | cudnn_check(cudnnCreateFilterDescriptor(&desc)); | ||||
| @@ -175,6 +192,20 @@ FilterDesc<Param>::~FilterDesc() { | |||||
| cudnn_check(cudnnDestroyFilterDescriptor(desc)); | cudnn_check(cudnnDestroyFilterDescriptor(desc)); | ||||
| } | } | ||||
| template <typename Param> | |||||
| std::string FilterDesc<Param>::to_string() { | |||||
| cudnnDataType_t data_type; | |||||
| cudnnTensorFormat_t format; | |||||
| int k; | |||||
| int c; | |||||
| int h; | |||||
| int w; | |||||
| cudnn_check(cudnnGetFilter4dDescriptor(desc, &data_type, &format, &k, &c, | |||||
| &h, &w)); | |||||
| return ssprintf("<dtype_%d, format_%d, %d,%d,%d,%d>", data_type,format, k, c, h, | |||||
| w); | |||||
| } | |||||
| template <typename Param> | template <typename Param> | ||||
| void FilterDesc<Param>::set( | void FilterDesc<Param>::set( | ||||
| const typename ConvolutionBase<Param>::CanonizedFilterMeta& | const typename ConvolutionBase<Param>::CanonizedFilterMeta& | ||||
| @@ -30,6 +30,7 @@ class TensorDesc { | |||||
| //! default layout is nchw | //! default layout is nchw | ||||
| void set(const TensorLayout& layout, const param::Convolution::Format = | void set(const TensorLayout& layout, const param::Convolution::Format = | ||||
| param::Convolution::Format::NCHW); | param::Convolution::Format::NCHW); | ||||
| std::string to_string(); | |||||
| ~TensorDesc(); | ~TensorDesc(); | ||||
| cudnnTensorDescriptor_t desc; | cudnnTensorDescriptor_t desc; | ||||
| }; | }; | ||||
| @@ -39,6 +40,7 @@ class FilterDesc { | |||||
| public: | public: | ||||
| FilterDesc(); | FilterDesc(); | ||||
| void set(const typename ConvolutionBase<Param>::CanonizedFilterMeta &meta); | void set(const typename ConvolutionBase<Param>::CanonizedFilterMeta &meta); | ||||
| std::string to_string(); | |||||
| ~FilterDesc(); | ~FilterDesc(); | ||||
| cudnnFilterDescriptor_t desc; | cudnnFilterDescriptor_t desc; | ||||
| }; | }; | ||||
| @@ -25,6 +25,10 @@ using namespace cuda; | |||||
| #define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF | #define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF | ||||
| #endif | #endif | ||||
| #if CUDA_VERSION < 11000 | |||||
| #define CUBLAS_COMPUTE_32I CUDA_R_32I | |||||
| #endif | |||||
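The define above papers over an API split: CUDA 11 moved cuBLAS compute types from `cudaDataType_t` to the new `cublasComputeType_t`, so aliasing the new name to the old enumerator lets one call site compile against both toolkit generations. A standalone sketch of the idiom — the `sgemm_ex` wrapper is illustrative, not MegEngine code:

```cpp
#include <cuda.h>
#include <cublas_v2.h>

#if CUDA_VERSION < 11000
// Pre-11.0 toolkits: cublasGemmEx() takes a cudaDataType_t compute type.
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#endif

// C = alpha * A(m x k) * B(k x n) + beta * C, all column-major float32.
cublasStatus_t sgemm_ex(cublasHandle_t h, int m, int n, int k,
                        const float* alpha, const float* A, const float* B,
                        const float* beta, float* C) {
    return cublasGemmEx(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha,
                        A, CUDA_R_32F, m, B, CUDA_R_32F, k, beta,
                        C, CUDA_R_32F, m, CUBLAS_COMPUTE_32F,
                        CUBLAS_GEMM_DEFAULT);
}
```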
| bool MatrixMulForwardImpl::AlgoCuBlas::is_available( | bool MatrixMulForwardImpl::AlgoCuBlas::is_available( | ||||
| const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
| if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) | if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) | ||||
| @@ -117,7 +121,7 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { | |||||
| args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, | args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, | ||||
| CUDA_R_8I, args.tensor_a.layout.stride[0], zero, | CUDA_R_8I, args.tensor_a.layout.stride[0], zero, | ||||
| args.tensor_c.raw_ptr, CUDA_R_32I, | args.tensor_c.raw_ptr, CUDA_R_32I, | ||||
| args.tensor_c.layout.stride[0], CUDA_R_32I, CUBLAS_GEMM_DFALT)); | |||||
| args.tensor_c.layout.stride[0], CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT)); | |||||
| }; | }; | ||||
| // Note that cublas takes column-major matrices as inputs, | // Note that cublas takes column-major matrices as inputs, | ||||
| @@ -6,10 +6,11 @@ | |||||
| * | * | ||||
| * Unless required by applicable law or agreed to in writing, | * Unless required by applicable law or agreed to in writing, | ||||
| * software distributed under the License is distributed on an | * software distributed under the License is distributed on an | ||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | */ | ||||
| #include "src/cuda/matrix_mul/cublasLt_wrapper.h" | |||||
| #include "src/common/utils.h" | #include "src/common/utils.h" | ||||
| #include "src/cuda/matrix_mul/cublasLt_wrapper.h" | |||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #if CUDA_VERSION >= 10010 | #if CUDA_VERSION >= 10010 | ||||
| @@ -33,6 +34,7 @@ static cudaDataType_t to_cuda_dtype(DType tp) { | |||||
| } | } | ||||
| } | } | ||||
| #if CUDA_VERSION >= 11000 | |||||
| static cublasComputeType_t to_cublas_compute_type(DType tp) { | static cublasComputeType_t to_cublas_compute_type(DType tp) { | ||||
| switch (tp.enumv()) { | switch (tp.enumv()) { | ||||
| case DTypeEnum::Float16: | case DTypeEnum::Float16: | ||||
| @@ -43,10 +45,11 @@ static cublasComputeType_t to_cublas_compute_type(DType tp) { | |||||
| case DTypeEnum::QuantizedS32: | case DTypeEnum::QuantizedS32: | ||||
| return CUBLAS_COMPUTE_32I; | return CUBLAS_COMPUTE_32I; | ||||
| default: | default: | ||||
| megdnn_throw(megdnn_mangle( | |||||
| "dtype must be float16/float32/int32/Qs32")); | |||||
| megdnn_throw( | |||||
| megdnn_mangle("dtype must be float16/float32/int32/Qs32")); | |||||
| } | } | ||||
| } | } | ||||
| #endif | |||||
| static const char* cuda_type_to_str(cudaDataType_t tp) { | static const char* cuda_type_to_str(cudaDataType_t tp) { | ||||
| switch (tp) { | switch (tp) { | ||||
| @@ -106,9 +109,15 @@ void CUBLASLTMatmulDesc::set(const SizeArgs& args, bool batched) { | |||||
| dt_b = to_cuda_dtype(args.layout_b.dtype); | dt_b = to_cuda_dtype(args.layout_b.dtype); | ||||
| dt_a = to_cuda_dtype(args.layout_a.dtype); | dt_a = to_cuda_dtype(args.layout_a.dtype); | ||||
| dt_c = to_cuda_dtype(args.layout_c.dtype); | dt_c = to_cuda_dtype(args.layout_c.dtype); | ||||
| dt_compute = to_cublas_compute_type(args.layout_c.dtype); | |||||
| megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision"); | megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision"); | ||||
| #if CUDA_VERSION >= 11000 | |||||
| dt_compute = to_cublas_compute_type(args.layout_c.dtype); | |||||
| cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute, dt_c)); | cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute, dt_c)); | ||||
| #else | |||||
| dt_compute = dt_c; | |||||
| cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute)); | |||||
| #endif | |||||
| cublas_check(cublasLtMatmulDescSetAttribute( | cublas_check(cublasLtMatmulDescSetAttribute( | ||||
| matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm))); | matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm))); | ||||
| @@ -262,8 +271,7 @@ WorkspaceBundle CUBLASLTMatmulDesc::get_workspace_bundle( | |||||
| dt_c == CUDA_R_32I ? layout_trans_b : layout_b, | dt_c == CUDA_R_32I ? layout_trans_b : layout_b, | ||||
| dt_c == CUDA_R_32I ? layout_trans_a : layout_a, | dt_c == CUDA_R_32I ? layout_trans_a : layout_a, | ||||
| dt_c == CUDA_R_32I ? layout_trans_c : layout_c, | dt_c == CUDA_R_32I ? layout_trans_c : layout_c, | ||||
| dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, | |||||
| &result); | |||||
| dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, &result); | |||||
| // return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed | // return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed | ||||
| if (status != CUBLAS_STATUS_SUCCESS) | if (status != CUBLAS_STATUS_SUCCESS) | ||||
| return {nullptr, {}}; | return {nullptr, {}}; | ||||
| @@ -48,7 +48,11 @@ struct CUBLASLTMatmulDesc { | |||||
| bool is_batched; | bool is_batched; | ||||
| cublasLtMatmulDesc_t matmul_desc; | cublasLtMatmulDesc_t matmul_desc; | ||||
| cudaDataType_t dt_a, dt_b, dt_c; | cudaDataType_t dt_a, dt_b, dt_c; | ||||
| #if CUDA_VERSION >= 11000 | |||||
| cublasComputeType_t dt_compute; | cublasComputeType_t dt_compute; | ||||
| #else | |||||
| cudaDataType_t dt_compute; | |||||
| #endif | |||||
| cublasLtMatrixLayout_t layout_a, layout_b, layout_c; | cublasLtMatrixLayout_t layout_a, layout_b, layout_c; | ||||
| cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c; | cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c; | ||||
| size_t workspace_a, workspace_b, workspace_c; | size_t workspace_a, workspace_b, workspace_c; | ||||
| @@ -128,7 +128,23 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { | |||||
| stream)); | stream)); | ||||
| cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); | cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); | ||||
| }; | }; | ||||
| switch(desc.dt_compute) { | |||||
| #if CUDA_VERSION >= 11000 | |||||
| switch (desc.dt_compute) { | |||||
| case CUBLAS_COMPUTE_16F: | |||||
| hgemm(); | |||||
| break; | |||||
| case CUBLAS_COMPUTE_32F: | |||||
| sgemm(); | |||||
| break; | |||||
| case CUBLAS_COMPUTE_32I: | |||||
| igemm(); | |||||
| break; | |||||
| default: | |||||
| megdnn_throw(megdnn_mangle( | |||||
| "compute type must be float16/float32/int32")); | |||||
| } | |||||
| #else | |||||
| switch (desc.dt_compute) { | |||||
| case CUDA_R_16F: | case CUDA_R_16F: | ||||
| hgemm(); | hgemm(); | ||||
| break; | break; | ||||
| @@ -139,8 +155,10 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { | |||||
| igemm(); | igemm(); | ||||
| break; | break; | ||||
| default: | default: | ||||
| megdnn_throw(megdnn_mangle("compute type must be float16/float32/int32")); | |||||
| megdnn_throw(megdnn_mangle( | |||||
| "compute type must be float16/float32/int32")); | |||||
| } | } | ||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -309,6 +309,9 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args, | |||||
| arg.f / (1e12); | arg.f / (1e12); | ||||
| TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | ||||
| filter{arg.co, arg.ci, arg.f, arg.f}; | filter{arg.co, arg.ci, arg.f, arg.f}; | ||||
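| // `algo` may be null when no algorithm name was provided; substitute a | |||||
| // placeholder so the printf below does not dereference a null string. | |||||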
| if (!algo) { | |||||
| algo = "no_name"; | |||||
| } | |||||
| printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, " | printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, " | ||||
| "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f " | "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f " | ||||
| "%.2fTops, " | "%.2fTops, " | ||||
| @@ -1,2 +1,3 @@ | |||||
| Makefile | Makefile | ||||
| /test/imperative_test | /test/imperative_test | ||||
| python/megengine/version.py | |||||
| @@ -70,3 +70,8 @@ add_custom_command( | |||||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt | COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt | ||||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt | COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt | ||||
| ) | ) | ||||
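| # Stamp the built package with version and git commit info so the shipped | |||||
| # megengine/version.py reflects the exact source state (see gen_version.py). | |||||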
| add_custom_command( | |||||
| TARGET ${MODULE_NAME} POST_BUILD | |||||
| COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py | |||||
| ) | |||||
| @@ -0,0 +1,31 @@ | |||||
| import argparse | |||||
| import os | |||||
| import subprocess | |||||
| def get_git_commit(src_dir): | |||||
| try: | |||||
| return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=src_dir).decode('ascii').strip() | |||||
| except Exception: | |||||
| return 'unknown' | |||||
| def get_mge_version(version_txt_path): | |||||
| v = {} | |||||
| with open(version_txt_path) as fp: | |||||
| exec(fp.read(), v) | |||||
| return v | |||||
| if __name__ == "__main__": | |||||
| parser = argparse.ArgumentParser(description="generate version.py to build path") | |||||
| parser.add_argument("--output", type=str, required=True) | |||||
| args = parser.parse_args() | |||||
| python_dir = os.path.dirname(__file__) | |||||
| version_txt_path = os.path.join(python_dir, 'version_template.py') | |||||
| commit_id = get_git_commit(python_dir) | |||||
| mge_ver_map = get_mge_version(version_txt_path) | |||||
| mge_ver = mge_ver_map['__version__'] if '__version__' in mge_ver_map else 'unknown' | |||||
| mge_intl = mge_ver_map['__internal__'] if '__internal__' in mge_ver_map else False | |||||
| with open(args.output, 'w') as f: | |||||
| f.write("__version__ = '{}'\n".format(mge_ver)) | |||||
| f.write("git_version = {}\n".format(repr(commit_id))) | |||||
| if mge_intl: | |||||
| f.write("__internal__ = True\n") | |||||
| @@ -0,0 +1,10 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| # | |||||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, | |||||
| # software distributed under the License is distributed on an | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| __version__ = "1.3.0.dev" | |||||
| @@ -8,7 +8,7 @@ | |||||
| ```bash | ```bash | ||||
| 1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env | 1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env | ||||
| 2: cd ./scripts/whl/manylinux2010 | |||||
| 2: cd ./scripts/whl/manylinux2014 | |||||
| 3: ./build_image.sh | 3: ./build_image.sh | ||||
| ``` | ``` | ||||
| @@ -56,24 +56,25 @@ | |||||
| ``` | ``` | ||||
| # How to build | # How to build | ||||
| Note: make sure the git repo is mounted in the docker container; do not use `git submodule update --init` to init the megbrain repo | |||||
| ## Build for linux | ## Build for linux | ||||
| * MegBrain delivers `wheel` package with `manylinux2010` tag defined in [PEP-571](https://www.python.org/dev/peps/pep-0571/). | |||||
| * MegBrain delivers the `wheel` package with the `manylinux2014` tag defined in [PEP-599](https://www.python.org/dev/peps/pep-0599/). | |||||
| commands: | commands: | ||||
| ```bash | ```bash | ||||
| export CUDA_ROOT_DIR=/path/to/cuda | export CUDA_ROOT_DIR=/path/to/cuda | ||||
| export CUDNN_ROOT_DIR=/path/to/cudnn | export CUDNN_ROOT_DIR=/path/to/cudnn | ||||
| export TENSORRT_ROOT_DIR=/path/to/tensorrt | export TENSORRT_ROOT_DIR=/path/to/tensorrt | ||||
| ./scripts/whl/manylinux2010/build_wheel.sh | |||||
| ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
| ``` | ``` | ||||
| * You can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, use the `ALL_PYTHON` environment variable, e.g.: | * You can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, use the `ALL_PYTHON` environment variable, e.g.: | ||||
| ```bash | ```bash | ||||
| ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||||
| ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
| ``` | ``` | ||||
| * If you want to build a CPU-only version, set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.: | * If you want to build a CPU-only version, set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.: | ||||
| ```bash | ```bash | ||||
| BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||||
| BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
| ``` | ``` | ||||
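| These switches compose; for instance, a hypothetical CPU-only build for Python 3.8 only (assuming the same cu101 setup as above) would be: | |||||
| ```bash | |||||
| BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="38" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
| ``` | |||||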
| ## Build for MacOS | ## Build for MacOS | ||||
| @@ -0,0 +1,15 @@ | |||||
| FROM quay.io/pypa/manylinux2014_x86_64:2020-12-31-56195b3 | |||||
| ENV UID=1024 \ | |||||
| PATH=${PATH}:/usr/local/cuda/bin \ | |||||
| LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||||
| LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||||
| CPATH=${CPATH}:/usr/local/cuda/include:/opt/cudnn/include:/opt/tensorrt/include | |||||
| ARG platform | |||||
| COPY setup_mirror.sh . | |||||
| RUN ./setup_mirror.sh "$platform" | |||||
| ADD init_image.sh /tmp | |||||
| RUN /tmp/init_image.sh && rm -f /tmp/init_image.sh | |||||
| @@ -0,0 +1,5 @@ | |||||
| #!/bin/bash -e | |||||
| cd $(dirname $0) | |||||
| docker build -t env_manylinux2014:latest . | |||||
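| # The Dockerfile accepts a `platform` build-arg (handled by setup_mirror.sh); | |||||
| # a hypothetical internal build would pass it explicitly: | |||||
| #   docker build --build-arg platform=brainpp -t env_manylinux2014:latest . | |||||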
| @@ -0,0 +1,217 @@ | |||||
| #!/bin/bash | |||||
| set -e | |||||
| CWD=$(dirname $0) | |||||
| BASEDIR=$(readlink -f ${CWD}/../../..) | |||||
| OUTPUTDIR=$(readlink -f ${CWD}/output) | |||||
| USERID=$(id -u) | |||||
| TMPFS_ARGS="--tmpfs /tmp:exec" | |||||
| local_path=$(dirname $(readlink -f $0)) | |||||
| CUDNN_LIB_DIR="/opt/cudnn/lib64/" | |||||
| CUDA_LIB_DIR="/usr/local/cuda/lib64/" | |||||
| CUDA_SDK="unknown" | |||||
| function usage() { | |||||
| echo "use '-sdk cu111' to specify cuda toolkit config, also support cu101, cu112" | |||||
| } | |||||
| while [ "$1" != "" ]; do | |||||
| case $1 in | |||||
| -sdk) | |||||
| shift | |||||
| CUDA_SDK=$1 | |||||
| shift | |||||
| ;; | |||||
| *) | |||||
| usage | |||||
| exit 1 | |||||
| esac | |||||
| done | |||||
| echo "Build with ${CUDA_SDK}" | |||||
| if [ $CUDA_SDK == "cu101" ];then | |||||
| COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF" | |||||
| OUT_DIR="cu101" | |||||
| BUILD_GCC8="ON" | |||||
| REQUIR_CUDA_VERSION="10010" | |||||
| REQUIR_CUDNN_VERSION="7.6.3" | |||||
| REQUIR_TENSORRT_VERSION="6.0.1.5" | |||||
| elif [ $CUDA_SDK == "cu111" ];then | |||||
| COPY_LIB_LIST="\ | |||||
| ${CUDA_LIB_DIR}/libnvrtc.so.11.1:\ | |||||
| ${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||||
| ${CUDA_LIB_DIR}/libcublas.so.11:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\ | |||||
| -gencode arch=compute_61,code=sm_61 \ | |||||
| arch=compute_70,code=sm_70 \ | |||||
| arch=compute_75,code=sm_75 \ | |||||
| arch=compute_80,code=sm_80 \ | |||||
| arch=compute_86,code=sm_86 \ | |||||
| arch=compute_86,code=compute_86" | |||||
| OUT_DIR="cu111" | |||||
| REQUIR_CUDA_VERSION="11010" | |||||
| REQUIR_CUDNN_VERSION="8.0.5" | |||||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
| elif [ $CUDA_SDK == "cu112" ];then | |||||
| COPY_LIB_LIST="\ | |||||
| ${CUDA_LIB_DIR}/libnvrtc.so.11.2:\ | |||||
| ${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||||
| ${CUDA_LIB_DIR}/libcublas.so.11:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \ | |||||
| -gencode arch=compute_61,code=sm_61 \ | |||||
| arch=compute_70,code=sm_70 \ | |||||
| arch=compute_75,code=sm_75 \ | |||||
| arch=compute_80,code=sm_80 \ | |||||
| arch=compute_86,code=sm_86 \ | |||||
| arch=compute_86,code=compute_86" | |||||
| OUT_DIR="cu112" | |||||
| REQUIR_CUDA_VERSION="11020" | |||||
| REQUIR_CUDNN_VERSION="8.0.5" | |||||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
| else | |||||
| echo "no support sdk ${CUDA_SDK}, please set by '-sdk cu111'" | |||||
| exit -1 | |||||
| fi | |||||
| BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||||
| if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||||
| then | |||||
| BUILD_WHL_CPU_ONLY="OFF" | |||||
| fi | |||||
| echo ${BASEDIR} | |||||
| pushd ${BASEDIR}/third_party >/dev/null | |||||
| ./prepare.sh | |||||
| popd >/dev/null | |||||
| cd ${CWD} | |||||
| mkdir -p ${OUTPUTDIR} | |||||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
| if [[ -z ${CUDA_ROOT_DIR} ]]; then | |||||
| echo "Environment variable CUDA_ROOT_DIR not set." | |||||
| exit -1 | |||||
| fi | |||||
| if [[ -z ${CUDNN_ROOT_DIR} ]]; then | |||||
| echo "Environment variable CUDNN_ROOT_DIR not set." | |||||
| exit -1 | |||||
| fi | |||||
| if [[ -z ${TENSORRT_ROOT_DIR} ]]; then | |||||
| echo "Environment variable TENSORRT_ROOT_DIR not set." | |||||
| exit -1 | |||||
| fi | |||||
| ## YOU SHOULD MODIFY THE CUDA VERSION CHECKS BELOW WHEN UPGRADING | |||||
| CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/} | |||||
| CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/} | |||||
| TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/} | |||||
| CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h | |||||
| if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||||
| CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn_version.h | |||||
| else | |||||
| CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h | |||||
| fi | |||||
| TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h | |||||
| if [ ! -e $CUDA_VERSION_PATH ] ; then | |||||
| echo "file $CUDA_VERSION_PATH does not exist" | |||||
| echo "please check the environment: CUDA-$REQUIR_CUDA_VERSION is required" | |||||
| exit -1 | |||||
| fi | |||||
| if [ ! -e $CUDNN_VERSION_PATH ] ; then | |||||
| echo "file $CUDNN_VERSION_PATH does not exist" | |||||
| echo "please check the environment: CUDNN-V$REQUIR_CUDNN_VERSION is required" | |||||
| exit -1 | |||||
| fi | |||||
| if [ ! -e $TENSORRT_VERSION_PATH ] ; then | |||||
| echo "file $TENSORRT_VERSION_PATH does not exist" | |||||
| echo "please check the environment: TensorRT-$REQUIR_TENSORRT_VERSION is required" | |||||
| exit -1 | |||||
| fi | |||||
| CUDA_VERSION_CONTEXT=$(head -300 ${CUDA_VERSION_PATH}) | |||||
| CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH}) | |||||
| TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH}) | |||||
| if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||||
| CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define CUDA_VERSION * +([0-9]+)") | |||||
| else | |||||
| CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)") | |||||
| fi | |||||
| CUDA_VERSION=${CUDA_API_VERSION:0-5} | |||||
| echo CUDA_VERSION:$CUDA_VERSION | |||||
| CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)") | |||||
| CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)") | |||||
| CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)") | |||||
| CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1} | |||||
| echo CUDNN_VERSION:$CUDNN_VERSION | |||||
| TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)") | |||||
| TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)") | |||||
| TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)") | |||||
| TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)") | |||||
| TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1} | |||||
| echo TENSORRT_VERSION:$TENSORRT_VERSION | |||||
| if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then | |||||
| echo "please check the environment: CUDA-$REQUIR_CUDA_VERSION is required" | |||||
| exit -1 | |||||
| fi | |||||
| if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then | |||||
| echo "please check the environment: CUDNN-V$REQUIR_CUDNN_VERSION is required" | |||||
| exit -1 | |||||
| fi | |||||
| if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then | |||||
| echo "please check the environment: TensorRT-$REQUIR_TENSORRT_VERSION is required" | |||||
| exit -1 | |||||
| fi | |||||
| fi | |||||
| if [[ -z ${BUILD_GCC8} ]];then | |||||
| BUILD_GCC8=OFF | |||||
| fi | |||||
| if [ "$BUILD_GCC8" == "ON" ];then | |||||
| run_cmd="scl enable devtoolset-8 /home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||||
| else | |||||
| run_cmd="/home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||||
| fi | |||||
| docker run --rm -it $TMPFS_ARGS \ | |||||
| -e UID=${USERID} \ | |||||
| -e LOCAL_VERSION=${LOCAL_VERSION} \ | |||||
| -e BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} \ | |||||
| -e ALL_PYTHON="${ALL_PYTHON}" \ | |||||
| -e EXTRA_CMAKE_FLAG="$EXTRA_CMAKE_FLAG" \ | |||||
| -e COPY_LIB_LIST="$COPY_LIB_LIST" \ | |||||
| -e OUT_DIR="$OUT_DIR" \ | |||||
| -v ${CUDA_ROOT_DIR}:/usr/local/cuda \ | |||||
| -v ${CUDNN_ROOT_DIR}:/opt/cudnn \ | |||||
| -v ${TENSORRT_ROOT_DIR}:/opt/tensorrt \ | |||||
| -v ${BASEDIR}:/home/code \ | |||||
| -v ${OUTPUTDIR}:/home/output:rw \ | |||||
| env_manylinux2014:latest /bin/bash -c "$run_cmd" | |||||
| @@ -0,0 +1,136 @@ | |||||
| #!/bin/bash -ex | |||||
| function handle_strip() { | |||||
| echo "now handle strip $1" | |||||
| objcopy --only-keep-debug $1 $1.dbg | |||||
| strip -s $1 | |||||
| objcopy --add-gnu-debuglink=$1.dbg $1 | |||||
| rm $1.dbg | |||||
| } | |||||
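| # Example (path hypothetical): detach debug info from a built library while | |||||
| # keeping a .gnu_debuglink reference for later symbolization: | |||||
| #   handle_strip ./build/src/libmegengine_export.so | |||||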
| function full_copy_so(){ | |||||
| lib_path=$1 | |||||
| dst_dir=$2 | |||||
| append_rpath=$3 | |||||
| lib_name=$(basename $lib_path) | |||||
| cp $lib_path $dst_dir/$lib_name | |||||
| if [ "$append_rpath" != "" ];then | |||||
| ori_rpath=$(patchelf --print-rpath $dst_dir/$lib_name) | |||||
| if [ "$ori_rpath" != "" ];then | |||||
| patchelf --set-rpath "$ori_rpath:$append_rpath" $dst_dir/$lib_name | |||||
| else | |||||
| patchelf --set-rpath "$append_rpath" $dst_dir/$lib_name | |||||
| fi | |||||
| fi | |||||
| } | |||||
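| # Example (hypothetical): stage a cuDNN library and append '$ORIGIN/.' to its | |||||
| # rpath so the dynamic loader searches next to the copied file: | |||||
| #   full_copy_so ${CUDNN_LIB_DIR}/libcudnn.so.8 ${LIBS_DIR} '$ORIGIN/.' | |||||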
| function patch_elf_depend_lib() { | |||||
| echo "handle common depend lib" | |||||
| LIBS_DIR=${BUILD_DIR}/staging/megengine/core/lib | |||||
| mkdir -p ${LIBS_DIR} | |||||
| cp /usr/lib64/libatomic.so.1 ${LIBS_DIR} | |||||
| patchelf --remove-rpath ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||||
| patchelf --force-rpath --set-rpath '$ORIGIN/lib' ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||||
| cp ${BUILD_DIR}/src/libmegengine_export.so ${LIBS_DIR} | |||||
| patchelf --remove-rpath ${LIBS_DIR}/libmegengine_export.so | |||||
| patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so | |||||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
| echo "handle cuda lib" | |||||
| cp ${BUILD_DIR}/dnn/cuda-stub/libcuda_stub.so ${LIBS_DIR} | |||||
| cp /usr/local/cuda/lib64/libnvToolsExt.so.1 ${LIBS_DIR} | |||||
| IFS=: read -a lib_name_array <<<"$COPY_LIB_LIST" | |||||
| append_rpath='$ORIGIN/.' | |||||
| for lib_name in ${lib_name_array[@]};do | |||||
| full_copy_so $lib_name ${LIBS_DIR} $append_rpath | |||||
| done | |||||
| fi | |||||
| } | |||||
| ALL_PYTHON=${ALL_PYTHON} | |||||
| if [[ -z ${ALL_PYTHON} ]] | |||||
| then | |||||
| ALL_PYTHON="35m 36m 37m 38" | |||||
| fi | |||||
| BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||||
| if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||||
| then | |||||
| BUILD_WHL_CPU_ONLY="OFF" | |||||
| fi | |||||
| SRC_DIR=$(readlink -f "`dirname $0`/../../../") | |||||
| BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
| BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||||
| fi | |||||
| NEW_LIB_PATH=core/lib | |||||
| for ver in ${ALL_PYTHON} | |||||
| do | |||||
| USE_AUDITWHEEL="ON" | |||||
| python_ver=${ver:0:2} | |||||
| MAJOR=${python_ver:0:1} | |||||
| MINOR=${ver:1} | |||||
| PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/ | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} ${EXTRA_CMAKE_FLAG}" | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo" | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${PYTHON_DIR}" | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3" | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_LIBRARY=${PYTHON_DIR}lib/" | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}" | |||||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DMGE_WITH_ATLAS=ON" | |||||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
| ${SRC_DIR}/scripts/cmake-build/host_build.sh -c -t -r | |||||
| else | |||||
| ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r | |||||
| fi | |||||
| cd ${BUILD_DIR} | |||||
| rm -rf staging | |||||
| mkdir -p staging | |||||
| cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ | |||||
| handle_strip ${BUILD_DIR}/src/libmegengine_export.so | |||||
| cd ${BUILD_DIR}/staging/megengine/core | |||||
| handle_strip _imperative_rt.so | |||||
| mkdir -p lib/ucx | |||||
| if [ ${USE_AUDITWHEEL} = "OFF" ]; then | |||||
| patch_elf_depend_lib | |||||
| fi | |||||
| cd ${BUILD_DIR}/staging/ | |||||
| ${PYTHON_DIR}/bin/python setup.py bdist_wheel | |||||
| cd /home/output | |||||
| if [ ${USE_AUDITWHEEL} = "ON" ]; then | |||||
| LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl | |||||
| else | |||||
| mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR} | |||||
| cd ${BUILD_DIR}/staging/dist/ | |||||
| org_whl_name=`ls Meg*${ver}*.whl` | |||||
| compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'` | |||||
| echo "org whl name: ${org_whl_name}" | |||||
| echo "comapt whl name: ${compat_whl_name}" | |||||
| mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR}/${compat_whl_name} | |||||
| cd /home/output | |||||
| fi | |||||
| chown -R ${UID}:${UID} . | |||||
| # compat for root-less docker env to remove output at host side | |||||
| chmod -R 777 . | |||||
| echo "python $ver done" | |||||
| done | |||||
| @@ -0,0 +1,70 @@ | |||||
| #!/bin/bash -e | |||||
| GET_PIP_URL='https://bootstrap.pypa.io/get-pip.py' | |||||
| SWIG_URL='https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz?use_mirror=autoselect' | |||||
| LLVM_URL='https://github.com/llvm-mirror/llvm/archive/release_60.tar.gz' | |||||
| CLANG_URL='https://github.com/llvm-mirror/clang/archive/release_60.tar.gz' | |||||
| yum install -y pcre-devel devtoolset-9-libatomic-devel.x86_64 | |||||
| yum install -y devtoolset-8 devtoolset-8-libatomic-devel.x86_64 | |||||
| for ver in 35m 36m 37m 38 | |||||
| do | |||||
| python_ver=${ver:0:2} | |||||
| curl ${GET_PIP_URL} | /opt/python/cp${python_ver}-cp${ver}/bin/python - \ | |||||
| --no-cache-dir --only-binary :all: | |||||
| /opt/python/cp${python_ver}-cp${ver}/bin/pip install \ | |||||
| --no-cache-dir --only-binary :all: numpy==1.18.1 setuptools==46.1.3 | |||||
| done | |||||
| pushd /home >/dev/null | |||||
| echo "Install swig" | |||||
| curl -sSL ${SWIG_URL} | tar xz | |||||
| pushd swig-3.0.12 >/dev/null | |||||
| mkdir build | |||||
| pushd build >/dev/null | |||||
| ../configure | |||||
| make -j$(nproc) | |||||
| make install | |||||
| popd >/dev/null | |||||
| popd >/dev/null | |||||
| rm -rf swig-3.0.12 | |||||
| echo "Install llvm" | |||||
| curl -sSL ${LLVM_URL} | tar xz | |||||
| pushd llvm-release_60 >/dev/null | |||||
| mkdir build | |||||
| pushd build >/dev/null | |||||
| cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||||
| -DCMAKE_BUILD_TYPE=Release | |||||
| make -j$(nproc) | |||||
| make install | |||||
| popd >/dev/null | |||||
| popd >/dev/null | |||||
| rm -rf llvm-release_60 | |||||
| echo "Install clang" | |||||
| curl -sSL ${CLANG_URL} | tar xz | |||||
| pushd clang-release_60 >/dev/null | |||||
| mkdir build | |||||
| pushd build >/dev/null | |||||
| cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||||
| -DCMAKE_BUILD_TYPE=Release | |||||
| make -j$(nproc) | |||||
| make install | |||||
| popd >/dev/null | |||||
| popd >/dev/null | |||||
| rm -rf clang-release_60 | |||||
| popd >/dev/null | |||||
| pushd /tmp >/dev/null | |||||
| curl -sSL https://github.com/NixOS/patchelf/archive/0.12.tar.gz | tar xz | |||||
| pushd /tmp/patchelf-0.12 >/dev/null | |||||
| sed -i '331s/32/64/' ./src/patchelf.cc | |||||
| ./bootstrap.sh && ./configure && make install-strip | |||||
| popd | |||||
| rm -rf /tmp/patchelf-0.12 | |||||
| popd | |||||
| yum clean all | |||||
| @@ -0,0 +1,65 @@ | |||||
| #!/bin/bash | |||||
| set -e | |||||
| function set_tuna_yum_mirror() { | |||||
| cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak | |||||
| local repo=/etc/yum.repos.d/CentOS-Base.repo | |||||
| local plugin=/etc/yum/pluginconf.d/fastestmirror.conf | |||||
| sed -i "s/mirrorlist=/#mirrorlist=/g" $repo | |||||
| sed -i "s/#baseurl/baseurl/g" $repo | |||||
| sed -i "s/mirror.centos.org/mirrors.tuna.tsinghua.edu.cn/g" $repo | |||||
| sed -i "s/http/https/g" $repo | |||||
| sed -i "s/enabled=1/enabled=0/g" $plugin | |||||
| yum clean all | |||||
| # Builds on brainpp are unable to pull epel repo metadata, so disable it | |||||
| # https://unix.stackexchange.com/questions/148144/unable-to-pull-epel-repository-metadata | |||||
| yum --disablerepo="epel" update nss | |||||
| yum makecache | |||||
| } | |||||
| function set_epel() { | |||||
| mv /etc/yum.repos.d/epel.repo /etc/yum.repos.d/epel.repo.backup | |||||
| mv /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel-testing.repo.backup | |||||
| curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo | |||||
| } | |||||
| function set_yum_mirror() { | |||||
| mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.backup | |||||
| curl -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo | |||||
| yum makecache | |||||
| } | |||||
| function set_pip_mirror() { | |||||
| cat > /etc/pip.conf <<EOF | |||||
| [global] | |||||
| timeout = 180 | |||||
| index-url = https://mirrors.aliyun.com/pypi/simple | |||||
| extra-index-url = | |||||
| http://mirrors.i.brainpp.cn/pypi/simple/ | |||||
| http://pypi.i.brainpp.cn/brain/dev/+simple | |||||
| https://pypi.tuna.tsinghua.edu.cn/simple | |||||
| trusted-host = | |||||
| mirrors.i.brainpp.cn | |||||
| pypi.i.brainpp.cn | |||||
| pypi.tuna.tsinghua.edu.cn | |||||
| mirrors.aliyun.com | |||||
| EOF | |||||
| } | |||||
| function main() { | |||||
| local platform=$1 | |||||
| case $platform in | |||||
| brainpp) | |||||
| set_epel | |||||
| set_yum_mirror | |||||
| set_pip_mirror | |||||
| ;; | |||||
| *) | |||||
| echo "No setup required" | |||||
| ;; | |||||
| esac | |||||
| } | |||||
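| # Usage (as invoked from the Dockerfile): ./setup_mirror.sh "$platform"; | |||||
| # any platform other than 'brainpp' leaves the image untouched. | |||||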
| main "$@" | |||||
| @@ -81,8 +81,9 @@ void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp, | |||||
| init(opr, inp.shape()); | init(opr, inp.shape()); | ||||
| auto inp_ptr = inp.ptr<float>(); | auto inp_ptr = inp.ptr<float>(); | ||||
| auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace.raw_ptr()), | |||||
| dev_rm_mask = reinterpret_cast<uint64_t*>( | |||||
| void* workspace_ptr = workspace.raw_ptr(); | |||||
| auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr), | |||||
| dev_rm_mask = reinterpret_cast<uint64_t*>( | |||||
| workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); | workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); | ||||
| auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()), | auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()), | ||||
| out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>()); | out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>()); | ||||
| @@ -27,6 +27,9 @@ | |||||
| #include "megbrain/gopt/inference.h" | #include "megbrain/gopt/inference.h" | ||||
| #include "megbrain/gopt/misc.h" | #include "megbrain/gopt/misc.h" | ||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||||
| using namespace mgb; | using namespace mgb; | ||||
| using namespace gopt; | using namespace gopt; | ||||
| using namespace cg; | using namespace cg; | ||||
| @@ -1749,6 +1752,7 @@ void mgb::tensorrt::transform_dest_vars_inplace( | |||||
| optimizer.apply_inplace(dest_vars); | optimizer.apply_inplace(dest_vars); | ||||
| } | } | ||||
| #pragma GCC diagnostic pop | |||||
| #endif | #endif | ||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | ||||
| @@ -20,7 +20,8 @@ | |||||
| #include "megbrain/utils/debug.h" | #include "megbrain/utils/debug.h" | ||||
| #if MGB_ENABLE_TENSOR_RT | #if MGB_ENABLE_TENSOR_RT | ||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||||
| #include "megbrain/tensorrt/tensorrt_opr.h" | #include "megbrain/tensorrt/tensorrt_opr.h" | ||||
| #include "make_trt_net.h" | #include "make_trt_net.h" | ||||
| @@ -111,7 +112,8 @@ intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() { | |||||
| host_b = range_gen({1, 8, 1, 1}); | host_b = range_gen({1, 8, 1, 1}); | ||||
| { | { | ||||
| float* ptr = reinterpret_cast<float*>(host_w->raw_ptr()); | |||||
| void* w_ptr = host_w->raw_ptr(); | |||||
| float* ptr = reinterpret_cast<float*>(w_ptr); | |||||
| ptr[0] = -127*1.1f; | ptr[0] = -127*1.1f; | ||||
| ptr[1] = 127*1.1f; | ptr[1] = 127*1.1f; | ||||
| } | } | ||||
| @@ -362,6 +364,7 @@ intl::ConcatConvTensorRTNetwork::create_trt_network(bool has_batch_dim) { | |||||
| return std::make_pair(builder, network); | return std::make_pair(builder, network); | ||||
| } | } | ||||
| #pragma GCC diagnostic pop | |||||
| #endif // MGB_ENABLE_TENSOR_RT | #endif // MGB_ENABLE_TENSOR_RT | ||||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | ||||