| @@ -39,6 +39,9 @@ option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF) | |||
| option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) | |||
| option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON) | |||
| option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) | |||
| option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON) | |||
| option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF) | |||
| option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF) | |||
| option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) | |||
| option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) | |||
| option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF) | |||
| @@ -55,6 +58,14 @@ option(MGE_BUILD_SDK "Build load_and_run" ON) | |||
| option(MGE_INFERENCE_ONLY "Build inference only library." OFF) | |||
| option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support." ON) | |||
| option(MGE_WITH_ROCM "Enable ROCM support" OFF) | |||
| option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF) | |||
| if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) | |||
| set(MGE_WITH_ANY_CUDA_STUB ON) | |||
| else() | |||
| set(MGE_WITH_ANY_CUDA_STUB OFF) | |||
| endif() | |||
| if(NOT ${MGE_BIN_REDUCE} STREQUAL "") | |||
| message(STATUS "build with BIN REDUCE") | |||
| @@ -205,14 +216,24 @@ else() | |||
| endif() | |||
| endif() | |||
| if(MGE_WITH_CUDA) | |||
| include(cmake/cudnn.cmake) | |||
| if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||
| message(WARNING "Statically linking cuDNN 8 automatically enables MGE_WITH_LARGE_ARCHIVE=ON") | |||
| set(MGE_WITH_LARGE_ARCHIVE ON) | |||
| endif() | |||
| endif() | |||
| CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD) | |||
| if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32) | |||
| if(MGE_WITH_LARGE_ARCHIVE) | |||
| message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") | |||
| set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") | |||
| elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE) | |||
| message(STATUS "Using GNU gold linker.") | |||
| set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
| set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
| set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
| set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") | |||
| endif() | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
| set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
| set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
| if(NOT MGE_WITH_JIT) | |||
| if(MGE_WITH_HALIDE) | |||
| @@ -353,11 +374,28 @@ if(MGE_WITH_CUDA) | |||
| if(NOT MGE_ENABLE_EXCEPTIONS) | |||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") | |||
| endif() | |||
| if(NOT MGE_CUDA_GENCODE) | |||
| if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") | |||
| set(MEGDNN_THREADS_512 0) | |||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||
| if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||
| message(WARNING "Statically linking cuDNN 8 with many SM architectures does not work; only sm_61, sm_70 and sm_75 are enabled by default, and MGE_WITH_LARGE_ARCHIVE=ON is enabled") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") | |||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") | |||
| elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") | |||
| set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
| @@ -385,7 +423,6 @@ if(MGE_WITH_CUDA) | |||
| endif() | |||
| set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") | |||
| include(cmake/cudnn.cmake) | |||
| if(MGE_WITH_TRT) | |||
| include(cmake/tensorrt.cmake) | |||
| endif() | |||
| @@ -394,12 +431,30 @@ if(MGE_WITH_CUDA) | |||
| if(MSVC OR WIN32) | |||
| list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) | |||
| message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") | |||
| else() | |||
| if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) | |||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer myelin_compiler_static myelin_executor_static myelin_pattern_runtime_static myelin_pattern_library_static -Wl,--no-whole-archive) | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer -Wl,--no-whole-archive) | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if("${CUDNN_VERSION}" STREQUAL "7.5.0") | |||
| if(MSVC OR WIN32) | |||
| message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | |||
| list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) | |||
| message(STATUS "cudnn 7.5.0 has a bug in cudnnConvolutionBiasActivationForward; --whole-archive is needed as a workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html") | |||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | |||
| endif() | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | |||
| if(MSVC OR WIN32) | |||
| message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | |||
| list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS libcudnn) | |||
| endif() | |||
| endif() | |||
| if(MSVC OR WIN32) | |||
| list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) | |||
| @@ -447,15 +502,37 @@ if(MGE_WITH_CUDA) | |||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | |||
| list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) | |||
| endif() | |||
| list(APPEND MGE_CUDA_LIBS cudart) | |||
| endif() | |||
| if(NOT MGE_WITH_CUDA_STUB) | |||
| if(MSVC OR WIN32) | |||
| list(APPEND MGE_CUDA_LIBS cuda.lib) | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS cuda) | |||
| endif() | |||
| endif() | |||
| if(NOT MGE_WITH_NVRTC_STUB) | |||
| if(MSVC OR WIN32) | |||
| list(APPEND MGE_CUDA_LIBS nvrtc.lib) | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS nvrtc) | |||
| endif() | |||
| endif() | |||
| if(MGE_WITH_ANY_CUDA_STUB) | |||
| add_subdirectory(dnn/cuda-stub) | |||
| list(APPEND MGE_CUDA_LIBS cuda-stub) | |||
| endif() | |||
| add_subdirectory(dnn/cuda-stub) | |||
| if(MSVC OR WIN32) | |||
| list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub) | |||
| list(APPEND MGE_CUDA_LIBS nvrtc.lib) | |||
| else() | |||
| list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) | |||
| list(APPEND MGE_CUDA_LIBS nvToolsExt) | |||
| endif() | |||
| set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") | |||
| set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") | |||
| endif() | |||
| if(MGE_WITH_CAMBRICON) | |||
| @@ -800,6 +877,9 @@ if(TARGET _imperative_rt) | |||
| COMMAND ${CMAKE_COMMAND} -E create_symlink | |||
| ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | |||
| COMMAND ${CMAKE_COMMAND} -E create_symlink | |||
| ${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py | |||
| DEPENDS _imperative_rt | |||
| VERBATIM | |||
| ) | |||
| @@ -863,3 +943,9 @@ if(MGE_WITH_JIT_MLIR) | |||
| add_subdirectory(tools/mlir/mgb-opt) | |||
| add_subdirectory(tools/mlir/mgb-file-check) | |||
| endif() | |||
| if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||
| message(WARNING "Statically linking cuDNN 8 with many SM architectures does not work; please use -DMGE_WITH_CUDNN_SHARED=ON, or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") | |||
| endif() | |||
| @@ -11,7 +11,7 @@ if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | |||
| set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | |||
| endif() | |||
| if(MGE_CUDA_USE_STATIC) | |||
| if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | |||
| find_library(CUDNN_LIBRARY | |||
| NAMES libcudnn_static.a cudnn.lib | |||
| PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} | |||
| @@ -42,7 +42,12 @@ if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND") | |||
| message(FATAL_ERROR "Can not find CuDNN Library") | |||
| endif() | |||
| file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) | |||
| if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h) | |||
| file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) | |||
| else() | |||
| file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) | |||
| endif() | |||
| string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" | |||
| CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | |||
| string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" | |||
| @@ -55,7 +60,9 @@ string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" | |||
| CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | |||
| string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" | |||
| CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") | |||
| set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}) | |||
| set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION}) | |||
| if(MGE_CUDA_USE_STATIC) | |||
| add_library(libcudnn STATIC IMPORTED) | |||
| @@ -1,11 +1,21 @@ | |||
| file (GLOB_RECURSE SOURCES src/*.cpp) | |||
| file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp) | |||
| file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) | |||
| if(MGE_WITH_CUDA_STUB) | |||
| list(APPEND STUB_SRC ${CUDA_STUB}) | |||
| endif() | |||
| if(MGE_WITH_NVRTC_STUB) | |||
| list(APPEND STUB_SRC ${NVRTC_STUB}) | |||
| endif() | |||
| if(MSVC OR WIN32) | |||
| add_library (cuda-stub STATIC ${SOURCES}) | |||
| add_library (cuda-stub STATIC ${STUB_SRC}) | |||
| else() | |||
| add_library (cuda-stub SHARED ${SOURCES}) | |||
| add_library (cuda-stub SHARED ${STUB_SRC}) | |||
| endif() | |||
| set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) | |||
| set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub) | |||
| target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) | |||
| if (MSVC OR WIN32) | |||
| target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) | |||
| @@ -0,0 +1,109 @@ | |||
| #if defined(_WIN32) | |||
| #include <windows.h> | |||
| #define RTLD_LAZY 0 | |||
| static void* dlopen(const char* file, int) { | |||
| return static_cast<void*>(LoadLibraryA(file)); | |||
| } | |||
| static void* dlerror() { | |||
| const char* errmsg = "dlerror not available on windows"; | |||
| return const_cast<char*>(errmsg); | |||
| } | |||
| static void* dlsym(void* handle, const char* name) { | |||
| FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||
| return reinterpret_cast<void*>(symbol); | |||
| } | |||
| #else | |||
| #include <dlfcn.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| #include <cstdlib>  // std::getenv | |||
| #include <sstream> | |||
| #include <string> | |||
| #include <vector> | |||
| static std::vector<std::string> split_string(const std::string& s, char delim) { | |||
| std::vector<std::string> elems; | |||
| std::stringstream ss(s); | |||
| std::string item; | |||
| while (std::getline(ss, item, delim)) { | |||
| elems.push_back(item); | |||
| } | |||
| return elems; | |||
| } | |||
| static std::vector<std::string> get_env_dir(const char* env_name) { | |||
| const char* env_p = std::getenv(env_name); | |||
| std::vector<std::string> env_dir; | |||
| if (env_p) { | |||
| env_dir = split_string(env_p, ':'); | |||
| } | |||
| return env_dir; | |||
| } | |||
| static void* try_open_handle(std::vector<std::string> dir_vec, | |||
| std::string default_so_name) { | |||
| void* handle = nullptr; | |||
| for (auto& tk_path : dir_vec) { | |||
| handle = dlopen((tk_path + "/" + default_so_name).c_str(), RTLD_LAZY); | |||
| if (handle) { | |||
| break; | |||
| } | |||
| } | |||
| return handle; | |||
| } | |||
| static void* try_open_handle(const char** so_vec, int nr_so) { | |||
| void* handle = nullptr; | |||
| for (int i = 0; i < nr_so; ++i) { | |||
| handle = dlopen(so_vec[i], RTLD_LAZY); | |||
| if (handle) { | |||
| break; | |||
| } | |||
| } | |||
| return handle; | |||
| } | |||
| static void* get_library_handle() { | |||
| std::vector<std::string> cuda_tk_dir = get_env_dir("CUDA_TK_PATH"); | |||
| std::vector<std::string> ld_dir = get_env_dir("LD_LIBRARY_PATH"); | |||
| void* handle = nullptr; | |||
| if (!handle) { | |||
| handle = try_open_handle(ld_dir, default_so_name); | |||
| } | |||
| if (!handle) { | |||
| handle = try_open_handle(cuda_tk_dir, default_so_name); | |||
| } | |||
| if (!handle) { | |||
| handle = try_open_handle(default_so_paths, | |||
| sizeof(default_so_paths) / sizeof(char*)); | |||
| } | |||
| if (!handle) { | |||
| handle = try_open_handle(extra_so_paths, | |||
| sizeof(extra_so_paths) / sizeof(char*)); | |||
| } | |||
| if (!handle) { | |||
| LOGE("Failed to load %s API library", g_default_api_name); | |||
| return nullptr; | |||
| } | |||
| return handle; | |||
| } | |||
| static void log_failed_load(int func_idx) { | |||
| LOGE("failed to load %s func: %s", g_default_api_name, | |||
| g_func_name[func_idx]); | |||
| } | |||
| static void* resolve_library_func(void* handle, const char* func) { | |||
| if (!handle) { | |||
| LOGE("%s handle should not be nullptr!", g_default_api_name); | |||
| return nullptr; | |||
| } | |||
| auto ret = dlsym(handle, func); | |||
| if (!ret) { | |||
| LOGE("failed to load %s func: %s", g_default_api_name, func); | |||
| } | |||
| return ret; | |||
| } | |||
| @@ -3,36 +3,14 @@ | |||
| #include <cstdio> | |||
| #define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | |||
| extern "C" { | |||
| #include <cuda.h> | |||
| #include "cuda.h" | |||
| } | |||
| #include <cudaProfiler.h> | |||
| #include "cudaProfiler.h" | |||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
| #if defined(_WIN32) | |||
| #include <windows.h> | |||
| #define RTLD_LAZY 0 | |||
| static void* dlopen(const char* file, int) { | |||
| return static_cast<void*>(LoadLibraryA(file)); | |||
| } | |||
| static void* dlerror() { | |||
| const char* errmsg = "dlerror not available on windows"; | |||
| return const_cast<char*>(errmsg); | |||
| } | |||
| static void* dlsym(void* handle, const char* name) { | |||
| FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||
| return reinterpret_cast<void*>(symbol); | |||
| } | |||
| #else | |||
| #include <dlfcn.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| static void log_failed_load(int func_idx); | |||
| namespace { | |||
| template <typename T> | |||
| @@ -42,68 +20,63 @@ CUresult on_init_failed(int func_idx) { | |||
| log_failed_load(func_idx); | |||
| return CUDA_ERROR_UNKNOWN; | |||
| } | |||
| } | |||
| } // namespace | |||
| #define _WRAPLIB_API_CALL CUDAAPI | |||
| #define _WRAPLIB_CALLBACK CUDA_CB | |||
| #include "./libcuda-wrap.h" | |||
| #if CUDA_VERSION == 10010 | |||
| #include "./libcuda-wrap_10.1.h" | |||
| #elif CUDA_VERSION == 10020 | |||
| #include "./libcuda-wrap_10.2.h" | |||
| #elif CUDA_VERSION == 11010 | |||
| #include "./libcuda-wrap_11.1.h" | |||
| #elif CUDA_VERSION == 11020 | |||
| #include "./libcuda-wrap_11.2.h" | |||
| #else | |||
| #error "cuda stub does not support this CUDA version; disable the CUDA stub to bypass it" | |||
| #endif | |||
| #undef _WRAPLIB_CALLBACK | |||
| #undef _WRAPLIB_API_CALL | |||
| static const char* default_so_name = | |||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
| "nvcuda.dll"; | |||
| #elif defined(__APPLE__) || defined(__MACOSX) | |||
| "libcuda.dylib"; | |||
| #else | |||
| "libcuda.so.1"; | |||
| #endif | |||
| // Harvested from cuda_drvapi_dynlink.c | |||
| static const char* default_so_paths[] = { | |||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
| "nvcuda.dll", | |||
| #elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX) | |||
| "nvcuda.dll", | |||
| #elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \ | |||
| defined(__MACOSX) | |||
| #if defined(__APPLE__) || defined(__MACOSX) | |||
| "/usr/local/cuda/lib/libcuda.dylib", | |||
| "/usr/local/cuda/lib/libcuda.dylib", | |||
| #elif defined(__ANDROID__) | |||
| #if defined (__aarch64__) | |||
| "/system/vendor/lib64/libcuda.so", | |||
| #if defined(__aarch64__) | |||
| "/system/vendor/lib64/libcuda.so", | |||
| #elif defined(__arm__) | |||
| "/system/vendor/lib/libcuda.so", | |||
| "/system/vendor/lib/libcuda.so", | |||
| #endif | |||
| #else | |||
| "libcuda.so.1", | |||
| // In case some users do not have a correct search path configured in | |||
| // /etc/ld.so.conf | |||
| "/usr/lib/x86_64-linux-gnu/libcuda.so", | |||
| "/usr/local/nvidia/lib64/libcuda.so", | |||
| "libcuda.so.1", | |||
| #endif | |||
| #else | |||
| #error "Unknown platform" | |||
| #endif | |||
| }; | |||
| static void* get_library_handle() { | |||
| void* handle = nullptr; | |||
| for (size_t i = 0; i < (sizeof(default_so_paths) / sizeof(char*)); i++) { | |||
| handle = dlopen(default_so_paths[i], RTLD_LAZY); | |||
| if (handle) { | |||
| break; | |||
| } | |||
| } | |||
| if (!handle) { | |||
| LOGE("Failed to load CUDA Driver API library"); | |||
| return nullptr; | |||
| } | |||
| return handle; | |||
| } | |||
| static void log_failed_load(int func_idx) { | |||
| LOGE("failed to load cuda func: %s", g_func_name[func_idx]); | |||
| } | |||
| static const char* extra_so_paths[] = { | |||
| "/usr/lib/x86_64-linux-gnu/libcuda.so", | |||
| "/usr/local/nvidia/lib64/libcuda.so", | |||
| }; | |||
| static void* resolve_library_func(void* handle, const char* func) { | |||
| if (!handle) { | |||
| LOGE("handle should not be nullptr!"); | |||
| return nullptr; | |||
| } | |||
| auto ret = dlsym(handle, func); | |||
| if (!ret) { | |||
| LOGE("failed to load cuda func: %s", func); | |||
| } | |||
| return ret; | |||
| } | |||
| static const char* g_default_api_name = "cuda"; | |||
| #include "./dlopen_helper.h" | |||
| @@ -0,0 +1,367 @@ | |||
| // generated by wraplib.py | |||
| // --- begin functions to be implemented | |||
| #ifndef _WRAPLIB_API_CALL | |||
| #define _WRAPLIB_API_CALL | |||
| #endif | |||
| #ifndef _WRAPLIB_CALLBACK | |||
| #define _WRAPLIB_CALLBACK | |||
| #endif | |||
| #ifndef ON_ENTRY | |||
| #define ON_ENTRY(x) | |||
| #endif | |||
| static void* get_library_handle(); | |||
| static void* resolve_library_func(void*, const char*); | |||
| namespace { | |||
| template <typename T> | |||
| T on_init_failed(int func_idx); | |||
| } | |||
| // --- end functions to be implemented | |||
| #include <cstddef> | |||
| #include <mutex> | |||
| extern "C" { | |||
| const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0, | |||
| const char* arg1, | |||
| const char* arg2, int arg3, | |||
| const char* const* arg4, | |||
| const char* const* arg5); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1, | |||
| const char* const* arg2); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0, | |||
| size_t* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0, | |||
| size_t* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, char* arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0, | |||
| const char* const arg1); | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0, | |||
| const char* const arg1, | |||
| const char** arg2); | |||
| } | |||
| static void load_library(); | |||
| static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_init( | |||
| nvrtcResult arg0) { | |||
| load_library(); | |||
| return nvrtcGetErrorString(arg0); | |||
| } | |||
| static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_error(nvrtcResult) { | |||
| return on_init_failed<const char*>(0); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_init(int* arg0, int* arg1) { | |||
| load_library(); | |||
| return nvrtcVersion(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_error(int*, int*) { | |||
| return on_init_failed<nvrtcResult>(1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_init(int* arg0) { | |||
| load_library(); | |||
| return nvrtcGetNumSupportedArchs(arg0); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_error(int*) { | |||
| return on_init_failed<nvrtcResult>(2); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_init(int* arg0) { | |||
| load_library(); | |||
| return nvrtcGetSupportedArchs(arg0); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_error(int*) { | |||
| return on_init_failed<nvrtcResult>(3); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram_init( | |||
| nvrtcProgram* arg0, const char* arg1, const char* arg2, int arg3, | |||
| const char* const* arg4, const char* const* arg5) { | |||
| load_library(); | |||
| return nvrtcCreateProgram(arg0, arg1, arg2, arg3, arg4, arg5); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcCreateProgram_error(nvrtcProgram*, const char*, const char*, int, | |||
| const char* const*, const char* const*) { | |||
| return on_init_failed<nvrtcResult>(4); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcDestroyProgram_init(nvrtcProgram* arg0) { | |||
| load_library(); | |||
| return nvrtcDestroyProgram(arg0); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram_error(nvrtcProgram*) { | |||
| return on_init_failed<nvrtcResult>(5); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcCompileProgram_init(nvrtcProgram arg0, int arg1, const char* const* arg2) { | |||
| load_library(); | |||
| return nvrtcCompileProgram(arg0, arg1, arg2); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcCompileProgram_error(nvrtcProgram, int, const char* const*) { | |||
| return on_init_failed<nvrtcResult>(6); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_init(nvrtcProgram arg0, | |||
| size_t* arg1) { | |||
| load_library(); | |||
| return nvrtcGetPTXSize(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_error(nvrtcProgram, | |||
| size_t*) { | |||
| return on_init_failed<nvrtcResult>(7); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_init(nvrtcProgram arg0, | |||
| char* arg1) { | |||
| load_library(); | |||
| return nvrtcGetPTX(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_error(nvrtcProgram, char*) { | |||
| return on_init_failed<nvrtcResult>(8); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_init(nvrtcProgram arg0, | |||
| size_t* arg1) { | |||
| load_library(); | |||
| return nvrtcGetCUBINSize(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_error(nvrtcProgram, | |||
| size_t*) { | |||
| return on_init_failed<nvrtcResult>(9); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_init(nvrtcProgram arg0, | |||
| char* arg1) { | |||
| load_library(); | |||
| return nvrtcGetCUBIN(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_error(nvrtcProgram, char*) { | |||
| return on_init_failed<nvrtcResult>(10); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcGetProgramLogSize_init(nvrtcProgram arg0, size_t* arg1) { | |||
| load_library(); | |||
| return nvrtcGetProgramLogSize(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize_error(nvrtcProgram, | |||
| size_t*) { | |||
| return on_init_failed<nvrtcResult>(11); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_init(nvrtcProgram arg0, | |||
| char* arg1) { | |||
| load_library(); | |||
| return nvrtcGetProgramLog(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_error(nvrtcProgram, | |||
| char*) { | |||
| return on_init_failed<nvrtcResult>(12); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcAddNameExpression_init(nvrtcProgram arg0, const char* const arg1) { | |||
| load_library(); | |||
| return nvrtcAddNameExpression(arg0, arg1); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcAddNameExpression_error(nvrtcProgram, const char* const) { | |||
| return on_init_failed<nvrtcResult>(13); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName_init( | |||
| nvrtcProgram arg0, const char* const arg1, const char** arg2) { | |||
| load_library(); | |||
| return nvrtcGetLoweredName(arg0, arg1, arg2); | |||
| } | |||
| static nvrtcResult _WRAPLIB_API_CALL | |||
| nvrtcGetLoweredName_error(nvrtcProgram, const char* const, const char**) { | |||
| return on_init_failed<nvrtcResult>(14); | |||
| } | |||
| static constexpr size_t NR_FUNC = 15; | |||
| static void* g_func_table[NR_FUNC] = {(void*)(&nvrtcGetErrorString_init), | |||
| (void*)(&nvrtcVersion_init), | |||
| (void*)(&nvrtcGetNumSupportedArchs_init), | |||
| (void*)(&nvrtcGetSupportedArchs_init), | |||
| (void*)(&nvrtcCreateProgram_init), | |||
| (void*)(&nvrtcDestroyProgram_init), | |||
| (void*)(&nvrtcCompileProgram_init), | |||
| (void*)(&nvrtcGetPTXSize_init), | |||
| (void*)(&nvrtcGetPTX_init), | |||
| (void*)(&nvrtcGetCUBINSize_init), | |||
| (void*)(&nvrtcGetCUBIN_init), | |||
| (void*)(&nvrtcGetProgramLogSize_init), | |||
| (void*)(&nvrtcGetProgramLog_init), | |||
| (void*)(&nvrtcAddNameExpression_init), | |||
| (void*)(&nvrtcGetLoweredName_init)}; | |||
| static void* g_func_table_error[NR_FUNC] = { | |||
| (void*)(&nvrtcGetErrorString_error), | |||
| (void*)(&nvrtcVersion_error), | |||
| (void*)(&nvrtcGetNumSupportedArchs_error), | |||
| (void*)(&nvrtcGetSupportedArchs_error), | |||
| (void*)(&nvrtcCreateProgram_error), | |||
| (void*)(&nvrtcDestroyProgram_error), | |||
| (void*)(&nvrtcCompileProgram_error), | |||
| (void*)(&nvrtcGetPTXSize_error), | |||
| (void*)(&nvrtcGetPTX_error), | |||
| (void*)(&nvrtcGetCUBINSize_error), | |||
| (void*)(&nvrtcGetCUBIN_error), | |||
| (void*)(&nvrtcGetProgramLogSize_error), | |||
| (void*)(&nvrtcGetProgramLog_error), | |||
| (void*)(&nvrtcAddNameExpression_error), | |||
| (void*)(&nvrtcGetLoweredName_error)}; | |||
| static const char* const g_func_name[NR_FUNC] = {"nvrtcGetErrorString", | |||
| "nvrtcVersion", | |||
| "nvrtcGetNumSupportedArchs", | |||
| "nvrtcGetSupportedArchs", | |||
| "nvrtcCreateProgram", | |||
| "nvrtcDestroyProgram", | |||
| "nvrtcCompileProgram", | |||
| "nvrtcGetPTXSize", | |||
| "nvrtcGetPTX", | |||
| "nvrtcGetCUBINSize", | |||
| "nvrtcGetCUBIN", | |||
| "nvrtcGetProgramLogSize", | |||
| "nvrtcGetProgramLog", | |||
| "nvrtcAddNameExpression", | |||
| "nvrtcGetLoweredName"}; | |||
| static void load_library() { | |||
| static bool done = false; | |||
| static std::mutex mtx; | |||
| std::lock_guard<std::mutex> lg{mtx}; | |||
| if (done) | |||
| return; | |||
| void* handle = get_library_handle(); | |||
| for (size_t i = 0; i < NR_FUNC; ++i) { | |||
| void* func; | |||
| if (!handle) { | |||
| func = nullptr; | |||
| } else { | |||
| func = resolve_library_func(handle, g_func_name[i]); | |||
| } | |||
| if (!func) { | |||
| func = g_func_table_error[i]; | |||
| } | |||
| __atomic_store_n(g_func_table + i, func, __ATOMIC_RELAXED); | |||
| } | |||
| done = true; | |||
| } | |||
| const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0) { | |||
| typedef const char*(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcResult); | |||
| ON_ENTRY(nvrtcGetErrorString); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[0]); | |||
| return f(arg0); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*, int*); | |||
| ON_ENTRY(nvrtcVersion); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[1]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*); | |||
| ON_ENTRY(nvrtcGetNumSupportedArchs); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[2]); | |||
| return f(arg0); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*); | |||
| ON_ENTRY(nvrtcGetSupportedArchs); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[3]); | |||
| return f(arg0); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0, | |||
| const char* arg1, | |||
| const char* arg2, int arg3, | |||
| const char* const* arg4, | |||
| const char* const* arg5) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)( | |||
| nvrtcProgram*, const char*, const char*, int, const char* const*, | |||
| const char* const*); | |||
| ON_ENTRY(nvrtcCreateProgram); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[4]); | |||
| return f(arg0, arg1, arg2, arg3, arg4, arg5); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram*); | |||
| ON_ENTRY(nvrtcDestroyProgram); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[5]); | |||
| return f(arg0); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1, | |||
| const char* const* arg2) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, int, | |||
| const char* const*); | |||
| ON_ENTRY(nvrtcCompileProgram); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[6]); | |||
| return f(arg0, arg1, arg2); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||
| ON_ENTRY(nvrtcGetPTXSize); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[7]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||
| ON_ENTRY(nvrtcGetPTX); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[8]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0, | |||
| size_t* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||
| ON_ENTRY(nvrtcGetCUBINSize); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[9]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||
| ON_ENTRY(nvrtcGetCUBIN); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[10]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0, | |||
| size_t* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||
| ON_ENTRY(nvrtcGetProgramLogSize); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[11]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, | |||
| char* arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||
| ON_ENTRY(nvrtcGetProgramLog); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[12]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0, | |||
| const char* const arg1) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, | |||
| const char* const); | |||
| ON_ENTRY(nvrtcAddNameExpression); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[13]); | |||
| return f(arg0, arg1); | |||
| } | |||
| nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0, | |||
| const char* const arg1, | |||
| const char** arg2) { | |||
| typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)( | |||
| nvrtcProgram, const char* const, const char**); | |||
| ON_ENTRY(nvrtcGetLoweredName); | |||
| f_ptr_t f = (f_ptr_t)(g_func_table[14]); | |||
| return f(arg0, arg1, arg2); | |||
| } | |||
| @@ -0,0 +1,75 @@ | |||
| /** | |||
| * \file dnn/cuda-stub/src/libnvrtc.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma GCC visibility push(default) | |||
| #include <cstdio> | |||
| #define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | |||
| #include "./nvrtc_type.h" | |||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
| static void log_failed_load(int func_idx); | |||
| namespace { | |||
| template <typename T> | |||
| T on_init_failed(int func_idx); | |||
| template <> | |||
| nvrtcResult on_init_failed(int func_idx) { | |||
| log_failed_load(func_idx); | |||
| return NVRTC_ERROR_INTERNAL_ERROR; | |||
| } | |||
| template <> | |||
| const char* on_init_failed(int func_idx) { | |||
| log_failed_load(func_idx); | |||
| return "load lib failed"; | |||
| } | |||
| } // namespace | |||
| #include "./libnvrtc-wrap.h" | |||
| static const char* default_so_name = | |||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
| "nvrtc.dll"; | |||
| #elif defined(__APPLE__) || defined(__MACOSX) | |||
| "libnvrtc.dylib"; | |||
| #else | |||
| "libnvrtc.so"; | |||
| #endif | |||
| static const char* default_so_paths[] = { | |||
| #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
| "nvrtc.dll", | |||
| #elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \ | |||
| defined(__MACOSX) | |||
| #if defined(__APPLE__) || defined(__MACOSX) | |||
| "/usr/local/cuda/lib/libnvrtc.dylib", | |||
| #elif defined(__ANDROID__) | |||
| #if defined(__aarch64__) | |||
| "/system/vendor/lib64/libnvrtc.so", | |||
| #elif defined(__arm__) | |||
| "/system/vendor/lib/libnvrtc.so", | |||
| #endif | |||
| #else | |||
| "libnvrtc.so", | |||
| // In case some users do not have a correct search path configured in | |||
| // /etc/ld.so.conf | |||
| "/usr/lib/x86_64-linux-gnu/libnvrtc.so", | |||
| "/usr/local/nvidia/lib64/libnvrtc.so", | |||
| "/usr/local/cuda/lib64/libnvrtc.so", | |||
| #endif | |||
| #else | |||
| #error "Unknown platform" | |||
| #endif | |||
| }; | |||
| static const char* extra_so_paths[] = {}; | |||
| static const char* g_default_api_name = "nvrtc"; | |||
| #include "./dlopen_helper.h" | |||
| @@ -0,0 +1,27 @@ | |||
| #pragma once | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif /* __cplusplus */ | |||
| #include <stdlib.h> | |||
| typedef enum { | |||
| NVRTC_SUCCESS = 0, | |||
| NVRTC_ERROR_OUT_OF_MEMORY = 1, | |||
| NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, | |||
| NVRTC_ERROR_INVALID_INPUT = 3, | |||
| NVRTC_ERROR_INVALID_PROGRAM = 4, | |||
| NVRTC_ERROR_INVALID_OPTION = 5, | |||
| NVRTC_ERROR_COMPILATION = 6, | |||
| NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, | |||
| NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, | |||
| NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, | |||
| NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, | |||
| NVRTC_ERROR_INTERNAL_ERROR = 11 | |||
| } nvrtcResult; | |||
| typedef struct _nvrtcProgram *nvrtcProgram; | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif /* __cplusplus */ | |||
| @@ -26,6 +26,7 @@ static inline CUBLASLTMatmulDesc::SizeArgs from_local_size_args( | |||
| return {handle, transA, transB, | |||
| args.layout_a, args.layout_b, args.layout_c}; | |||
| } | |||
| bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | |||
| const SizeArgs& args) const { | |||
| auto cublasLt_args = from_local_size_args(args); | |||
| @@ -35,6 +36,7 @@ bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | |||
| .is_available(cublasLt_args, INT_MAX); | |||
| return res; | |||
| } | |||
| size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | |||
| const SizeArgs& args) const { | |||
| auto cublasLt_args = from_local_size_args(args); | |||
| @@ -43,6 +45,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | |||
| desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo); | |||
| return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes(); | |||
| } | |||
| void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
| const ExecArgs& args) const { | |||
| auto cublasLt_args = from_local_size_args(args); | |||
| @@ -89,6 +92,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
| desc.layout_c, &algo, ws_bundle.get(0), | |||
| ws_bundle.get_size(0), stream)); | |||
| }; | |||
| auto batched_igemm = [&]() { | |||
| auto zero = handle->zero_device(); | |||
| auto one = handle->one_device(); | |||
| @@ -133,6 +137,18 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
| }; | |||
| ws_bundle.set(args.workspace.raw_ptr); | |||
| #if CUDA_VERSION >= 11000 | |||
| if (desc.dt_compute == CUBLAS_COMPUTE_32I) { | |||
| batched_igemm(); | |||
| } else if (desc.dt_compute == CUBLAS_COMPUTE_16F) { | |||
| batched_hgemm(); | |||
| } else if (desc.dt_compute == CUBLAS_COMPUTE_32F) { | |||
| batched_sgemm(); | |||
| } else { | |||
| megdnn_throw( | |||
| megdnn_mangle("compute_type must be int32/float16/float32")); | |||
| } | |||
| #else | |||
| if (desc.dt_compute == CUDA_R_32I) { | |||
| batched_igemm(); | |||
| } else if (desc.dt_compute == CUDA_R_16F) { | |||
| @@ -143,5 +159,6 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
| megdnn_throw( | |||
| megdnn_mangle("compute_type must be int32/float16/float32")); | |||
| } | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -156,6 +156,9 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const { | |||
| case param::ConvBias::NonlineMode::IDENTITY: | |||
| nonlinear_mode_str = "IDENTITY"; | |||
| break; | |||
| case param::ConvBias::NonlineMode::H_SWISH: | |||
| nonlinear_mode_str = "H_SWISH"; | |||
| break; | |||
| default: | |||
| megdnn_throw("invalid conv bias nonlinear mode"); | |||
| } | |||
| @@ -165,6 +165,23 @@ void TensorDesc::set(const TensorLayout& layout, | |||
| } | |||
| } | |||
| std::string TensorDesc::to_string() { | |||
| cudnnDataType_t data_type; | |||
| int n; | |||
| int c; | |||
| int h; | |||
| int w; | |||
| int n_stride; | |||
| int c_stride; | |||
| int h_stride; | |||
| int w_stride; | |||
| cudnn_check(cudnnGetTensor4dDescriptor(desc, &data_type, &n, &c, &h, &w, | |||
| &n_stride, &c_stride, &h_stride, | |||
| &w_stride)); | |||
| return ssprintf("<dtype_%d, %d,%d,%d,%d(%d,%d,%d,%d)>", data_type, n, c, h, | |||
| w, n_stride, c_stride, h_stride, w_stride); | |||
| } | |||
| template <typename Param> | |||
| FilterDesc<Param>::FilterDesc() { | |||
| cudnn_check(cudnnCreateFilterDescriptor(&desc)); | |||
| @@ -175,6 +192,20 @@ FilterDesc<Param>::~FilterDesc() { | |||
| cudnn_check(cudnnDestroyFilterDescriptor(desc)); | |||
| } | |||
| template <typename Param> | |||
| std::string FilterDesc<Param>::to_string() { | |||
| cudnnDataType_t data_type; | |||
| cudnnTensorFormat_t format; | |||
| int k; | |||
| int c; | |||
| int h; | |||
| int w; | |||
| cudnn_check(cudnnGetFilter4dDescriptor(desc, &data_type, &format, &k, &c, | |||
| &h, &w)); | |||
| return ssprintf("<dtype_%d, format_%d, %d,%d,%d,%d>", data_type, format, k, c, h, w); | |||
| } | |||
| template <typename Param> | |||
| void FilterDesc<Param>::set( | |||
| const typename ConvolutionBase<Param>::CanonizedFilterMeta& | |||
| @@ -30,6 +30,7 @@ class TensorDesc { | |||
| //! default layout is nchw | |||
| void set(const TensorLayout& layout, const param::Convolution::Format = | |||
| param::Convolution::Format::NCHW); | |||
| std::string to_string(); | |||
| ~TensorDesc(); | |||
| cudnnTensorDescriptor_t desc; | |||
| }; | |||
| @@ -39,6 +40,7 @@ class FilterDesc { | |||
| public: | |||
| FilterDesc(); | |||
| void set(const typename ConvolutionBase<Param>::CanonizedFilterMeta &meta); | |||
| std::string to_string(); | |||
| ~FilterDesc(); | |||
| cudnnFilterDescriptor_t desc; | |||
| }; | |||
| @@ -25,6 +25,10 @@ using namespace cuda; | |||
| #define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF | |||
| #endif | |||
| #if CUDA_VERSION < 11000 | |||
| #define CUBLAS_COMPUTE_32I CUDA_R_32I | |||
| #endif | |||
| bool MatrixMulForwardImpl::AlgoCuBlas::is_available( | |||
| const SizeArgs& args) const { | |||
| if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) | |||
| @@ -117,7 +121,7 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { | |||
| args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, | |||
| CUDA_R_8I, args.tensor_a.layout.stride[0], zero, | |||
| args.tensor_c.raw_ptr, CUDA_R_32I, | |||
| args.tensor_c.layout.stride[0], CUDA_R_32I, CUBLAS_GEMM_DFALT)); | |||
| args.tensor_c.layout.stride[0], CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT)); | |||
| }; | |||
| // Note that cublas takes column-major matrices as inputs, | |||
| @@ -6,10 +6,11 @@ | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "src/cuda/matrix_mul/cublasLt_wrapper.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/cuda/matrix_mul/cublasLt_wrapper.h" | |||
| #include "src/cuda/utils.h" | |||
| #if CUDA_VERSION >= 10010 | |||
| @@ -33,6 +34,7 @@ static cudaDataType_t to_cuda_dtype(DType tp) { | |||
| } | |||
| } | |||
| #if CUDA_VERSION >= 11000 | |||
| static cublasComputeType_t to_cublas_compute_type(DType tp) { | |||
| switch (tp.enumv()) { | |||
| case DTypeEnum::Float16: | |||
| @@ -43,10 +45,11 @@ static cublasComputeType_t to_cublas_compute_type(DType tp) { | |||
| case DTypeEnum::QuantizedS32: | |||
| return CUBLAS_COMPUTE_32I; | |||
| default: | |||
| megdnn_throw(megdnn_mangle( | |||
| "dtype must be float16/float32/int32/Qs32")); | |||
| megdnn_throw( | |||
| megdnn_mangle("dtype must be float16/float32/int32/Qs32")); | |||
| } | |||
| } | |||
| #endif | |||
| static const char* cuda_type_to_str(cudaDataType_t tp) { | |||
| switch (tp) { | |||
| @@ -106,9 +109,15 @@ void CUBLASLTMatmulDesc::set(const SizeArgs& args, bool batched) { | |||
| dt_b = to_cuda_dtype(args.layout_b.dtype); | |||
| dt_a = to_cuda_dtype(args.layout_a.dtype); | |||
| dt_c = to_cuda_dtype(args.layout_c.dtype); | |||
| dt_compute = to_cublas_compute_type(args.layout_c.dtype); | |||
| megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision"); | |||
| #if CUDA_VERSION >= 11000 | |||
| dt_compute = to_cublas_compute_type(args.layout_c.dtype); | |||
| cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute, dt_c)); | |||
| #else | |||
| dt_compute = dt_c; | |||
| cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute)); | |||
| #endif | |||
| cublas_check(cublasLtMatmulDescSetAttribute( | |||
| matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm))); | |||
| @@ -262,8 +271,7 @@ WorkspaceBundle CUBLASLTMatmulDesc::get_workspace_bundle( | |||
| dt_c == CUDA_R_32I ? layout_trans_b : layout_b, | |||
| dt_c == CUDA_R_32I ? layout_trans_a : layout_a, | |||
| dt_c == CUDA_R_32I ? layout_trans_c : layout_c, | |||
| dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, | |||
| &result); | |||
| dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, &result); | |||
| // return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed | |||
| if (status != CUBLAS_STATUS_SUCCESS) | |||
| return {nullptr, {}}; | |||
| @@ -48,7 +48,11 @@ struct CUBLASLTMatmulDesc { | |||
| bool is_batched; | |||
| cublasLtMatmulDesc_t matmul_desc; | |||
| cudaDataType_t dt_a, dt_b, dt_c; | |||
| #if CUDA_VERSION >= 11000 | |||
| cublasComputeType_t dt_compute; | |||
| #else | |||
| cudaDataType_t dt_compute; | |||
| #endif | |||
| cublasLtMatrixLayout_t layout_a, layout_b, layout_c; | |||
| cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c; | |||
| size_t workspace_a, workspace_b, workspace_c; | |||
| @@ -128,7 +128,23 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { | |||
| stream)); | |||
| cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); | |||
| }; | |||
| switch(desc.dt_compute) { | |||
| #if CUDA_VERSION >= 11000 | |||
| switch (desc.dt_compute) { | |||
| case CUBLAS_COMPUTE_16F: | |||
| hgemm(); | |||
| break; | |||
| case CUBLAS_COMPUTE_32F: | |||
| sgemm(); | |||
| break; | |||
| case CUBLAS_COMPUTE_32I: | |||
| igemm(); | |||
| break; | |||
| default: | |||
| megdnn_throw(megdnn_mangle( | |||
| "compute type must be float16/float32/int32")); | |||
| } | |||
| #else | |||
| switch (desc.dt_compute) { | |||
| case CUDA_R_16F: | |||
| hgemm(); | |||
| break; | |||
| @@ -139,8 +155,10 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { | |||
| igemm(); | |||
| break; | |||
| default: | |||
| megdnn_throw(megdnn_mangle("compute type must be float16/float32/int32")); | |||
| megdnn_throw(megdnn_mangle( | |||
| "compute type must be float16/float32/int32")); | |||
| } | |||
| #endif | |||
| } | |||
| #endif | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -309,6 +309,9 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args, | |||
| arg.f / (1e12); | |||
| TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | |||
| filter{arg.co, arg.ci, arg.f, arg.f}; | |||
| if (!algo){ | |||
| algo = "no_name"; | |||
| } | |||
| printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, " | |||
| "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f " | |||
| "%.2fTops, " | |||
| @@ -1,2 +1,3 @@ | |||
| Makefile | |||
| /test/imperative_test | |||
| python/megengine/version.py | |||
| @@ -70,3 +70,8 @@ add_custom_command( | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt | |||
| ) | |||
| add_custom_command( | |||
| TARGET ${MODULE_NAME} POST_BUILD | |||
| COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py | |||
| ) | |||
| @@ -0,0 +1,31 @@ | |||
| import argparse | |||
| import os | |||
| import subprocess | |||
| def get_git_commit(src_dir): | |||
| try: | |||
| return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=src_dir).decode('ascii').strip() | |||
| except Exception: | |||
| return 'unknown' | |||
| def get_mge_version(version_txt_path): | |||
| v = {} | |||
| with open(version_txt_path) as fp: | |||
| exec(fp.read(), v) | |||
| return v | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser(description="generate version.py to build path") | |||
| parser.add_argument("--output", type=str, required=True) | |||
| args = parser.parse_args() | |||
| python_dir = os.path.dirname(__file__) | |||
| version_txt_path = os.path.join(python_dir, 'version_template.py') | |||
| commit_id = get_git_commit(python_dir) | |||
| mge_ver_map = get_mge_version(version_txt_path) | |||
| mge_ver = mge_ver_map['__version__'] if '__version__' in mge_ver_map else 'unknown' | |||
| mge_intl = mge_ver_map['__internal__'] if '__internal__' in mge_ver_map else False | |||
| with open(args.output, 'w') as f: | |||
| f.write("__version__ = '{}'\n".format(mge_ver)) | |||
| f.write("git_version = {}\n".format(repr(commit_id))) | |||
| if mge_intl: | |||
| f.write("__internal__ = True\n") | |||
| @@ -0,0 +1,10 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| # | |||
| # Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| __version__ = "1.3.0.dev" | |||
| @@ -8,7 +8,7 @@ | |||
| ```bash | |||
| 1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env | |||
| 2: cd ./scripts/whl/manylinux2010 | |||
| 2: cd ./scripts/whl/manylinux2014 | |||
| 3: ./build_image.sh | |||
| ``` | |||
| @@ -56,24 +56,25 @@ | |||
| ``` | |||
| # How to build | |||
| Note: make sure the git repo is mounted in the docker container; do not use `git submodule update --init` to init the megbrain repo | |||
| ## Build for linux | |||
| * MegBrain delivers `wheel` package with `manylinux2010` tag defined in [PEP-571](https://www.python.org/dev/peps/pep-0571/). | |||
| * MegBrain delivers `wheel` packages with the `manylinux2014` tag defined in [PEP-599](https://www.python.org/dev/peps/pep-0599/). | |||
| commands: | |||
| ```bash | |||
| export CUDA_ROOT_DIR=/path/to/cuda | |||
| export CUDNN_ROOT_DIR=/path/to/cudnn | |||
| export TENSORRT_ROOT_DIR=/path/to/tensorrt | |||
| ./scripts/whl/manylinux2010/build_wheel.sh | |||
| ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||
| ``` | |||
| * You can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, you can use the `ALL_PYTHON` environment variable, e.g.: | |||
| ```bash | |||
| ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||
| ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||
| ``` | |||
| * If you just want to build the CPU-only version, you can set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.: | |||
| ```bash | |||
| BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||
| BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||
| ``` | |||
| ## Build for MacOS | |||
| @@ -0,0 +1,15 @@ | |||
| FROM quay.io/pypa/manylinux2014_x86_64:2020-12-31-56195b3 | |||
| ENV UID=1024 \ | |||
| PATH=${PATH}:/usr/local/cuda/bin \ | |||
| LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||
| LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||
| CPATH=${CPATH}:/usr/local/cuda/include:/opt/cudnn/include:/opt/tensorrt/include | |||
| ARG platform | |||
| COPY setup_mirror.sh . | |||
| RUN ./setup_mirror.sh "$platform" | |||
| ADD init_image.sh /tmp | |||
| RUN /tmp/init_image.sh && rm -f /tmp/init_image.sh | |||
| @@ -0,0 +1,5 @@ | |||
| #!/bin/bash -e | |||
| cd $(dirname $0) | |||
| docker build -t env_manylinux2014:latest . | |||
| @@ -0,0 +1,217 @@ | |||
| #!/bin/bash | |||
| set -e | |||
| CWD=$(dirname $0) | |||
| BASEDIR=$(readlink -f ${CWD}/../../..) | |||
| OUTPUTDIR=$(readlink -f ${CWD}/output) | |||
| USERID=$(id -u) | |||
| TMPFS_ARGS="--tmpfs /tmp:exec" | |||
| local_path=$(dirname $(readlink -f $0)) | |||
| CUDNN_LIB_DIR="/opt/cudnn/lib64/" | |||
| CUDA_LIB_DIR="/usr/local/cuda/lib64/" | |||
| CUDA_SDK="unknown" | |||
| function usage() { | |||
| echo "use '-sdk cu111' to specify the CUDA toolkit config; cu101 and cu112 are also supported" | |||
| } | |||
| while [ "$1" != "" ]; do | |||
| case $1 in | |||
| -sdk) | |||
| shift | |||
| CUDA_SDK=$1 | |||
| shift | |||
| ;; | |||
| *) | |||
| usage | |||
| exit 1 | |||
| esac | |||
| done | |||
| echo "Build with ${CUDA_SDK}" | |||
| if [ $CUDA_SDK == "cu101" ];then | |||
| COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | |||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF" | |||
| OUT_DIR="cu101" | |||
| BUILD_GCC8="ON" | |||
| REQUIR_CUDA_VERSION="10010" | |||
| REQUIR_CUDNN_VERSION="7.6.3" | |||
| REQUIR_TENSORRT_VERSION="6.0.1.5" | |||
| elif [ $CUDA_SDK == "cu111" ];then | |||
| COPY_LIB_LIST="\ | |||
| ${CUDA_LIB_DIR}/libnvrtc.so.11.1:\ | |||
| ${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||
| ${CUDA_LIB_DIR}/libcublas.so.11:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\ | |||
| -gencode arch=compute_61,code=sm_61 \ | |||
| arch=compute_70,code=sm_70 \ | |||
| arch=compute_75,code=sm_75 \ | |||
| arch=compute_80,code=sm_80 \ | |||
| arch=compute_86,code=sm_86 \ | |||
| arch=compute_86,code=compute_86" | |||
| OUT_DIR="cu111" | |||
| REQUIR_CUDA_VERSION="11010" | |||
| REQUIR_CUDNN_VERSION="8.0.5" | |||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
| elif [ $CUDA_SDK == "cu112" ];then | |||
| COPY_LIB_LIST="\ | |||
| ${CUDA_LIB_DIR}/libnvrtc.so.11.2:\ | |||
| ${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||
| ${CUDA_LIB_DIR}/libcublas.so.11:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \ | |||
| -gencode arch=compute_61,code=sm_61 \ | |||
| arch=compute_70,code=sm_70 \ | |||
| arch=compute_75,code=sm_75 \ | |||
| arch=compute_80,code=sm_80 \ | |||
| arch=compute_86,code=sm_86 \ | |||
| arch=compute_86,code=compute_86" | |||
| OUT_DIR="cu112" | |||
| REQUIR_CUDA_VERSION="11020" | |||
| REQUIR_CUDNN_VERSION="8.0.5" | |||
| REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
| else | |||
| echo "no support sdk ${CUDA_SDK}, please set by '-sdk cu111'" | |||
| exit -1 | |||
| fi | |||
| BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||
| if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||
| then | |||
| BUILD_WHL_CPU_ONLY="OFF" | |||
| fi | |||
| echo ${BASEDIR} | |||
| pushd ${BASEDIR}/third_party >/dev/null | |||
| ./prepare.sh | |||
| popd >/dev/null | |||
| cd ${CWD} | |||
| mkdir -p ${OUTPUTDIR} | |||
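| # For CUDA builds, verify that the host provides CUDA/cuDNN/TensorRT trees matching the chosen SDK before entering the container. | |||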
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
| if [[ -z ${CUDA_ROOT_DIR} ]]; then | |||
| echo "Environment variable CUDA_ROOT_DIR not set." | |||
| exit -1 | |||
| fi | |||
| if [[ -z ${CUDNN_ROOT_DIR} ]]; then | |||
| echo "Environment variable CUDNN_ROOT_DIR not set." | |||
| exit -1 | |||
| fi | |||
| if [[ -z ${TENSORRT_ROOT_DIR} ]]; then | |||
| echo "Environment variable TENSORRT_ROOT_DIR not set." | |||
| exit -1 | |||
| fi | |||
| ## YOU SHOULD MODIFY THE CUDA VERSIONS BELOW WHEN UPGRADING | |||
| CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/} | |||
| CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/} | |||
| TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/} | |||
| CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h | |||
| if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||
| CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn_version.h | |||
| else | |||
| CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h | |||
| fi | |||
| TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h | |||
| if [ ! -e $CUDA_VERSION_PATH ] ; then | |||
| echo file $CUDA_VERSION_PATH does not exist | |||
| echo please check the environment: it must use CUDA-$REQUIR_CUDA_VERSION | |||
| exit -1 | |||
| fi | |||
| if [ ! -e $CUDNN_VERSION_PATH ] ; then | |||
| echo file $CUDNN_VERSION_PATH does not exist | |||
| echo please check the environment: it must use CUDNN-V$REQUIR_CUDNN_VERSION | |||
| exit -1 | |||
| fi | |||
| if [ ! -e $TENSORRT_VERSION_PATH ] ; then | |||
| echo file $TENSORRT_VERSION_PATH does not exist | |||
| echo please check the environment: it must use TensorRT-$REQUIR_TENSORRT_VERSION | |||
| exit -1 | |||
| fi | |||
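| # Read the installed toolkit versions from the SDK headers so they can be checked against the required versions. | |||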
| CUDA_VERSION_CONTEXT=$(head -300 ${CUDA_VERSION_PATH}) | |||
| CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH}) | |||
| TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH}) | |||
| if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||
| CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define CUDA_VERSION * +([0-9]+)") | |||
| else | |||
| CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)") | |||
| fi | |||
| CUDA_VERSION=${CUDA_API_VERSION:0-5} | |||
| echo CUDA_VERSION:$CUDA_VERSION | |||
| CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)") | |||
| CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)") | |||
| CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)") | |||
| CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1} | |||
| echo CUDNN_VERSION:$CUDNN_VERSION | |||
| TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)") | |||
| TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)") | |||
| TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)") | |||
| TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)") | |||
| TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1} | |||
| echo TENSORRT_VERSION:$TENSORRT_VERSION | |||
| if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then | |||
| echo please check the environment: it must use CUDA version $REQUIR_CUDA_VERSION | |||
| exit -1 | |||
| fi | |||
| if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then | |||
| echo please check the environment: it must use CUDNN-V$REQUIR_CUDNN_VERSION | |||
| exit -1 | |||
| fi | |||
| if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then | |||
| echo please check the environment: it must use TENSORRT-$REQUIR_TENSORRT_VERSION | |||
| exit -1 | |||
| fi | |||
| fi | |||
| if [[ -z ${BUILD_GCC8} ]];then | |||
| BUILD_GCC8=OFF | |||
| fi | |||
| if [ "$BUILD_GCC8" == "ON" ];then | |||
| run_cmd="scl enable devtoolset-8 /home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||
| else | |||
| run_cmd="/home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||
| fi | |||
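| # Run the actual build inside the manylinux2014 image, mounting the SDKs, the source tree and the output directory. | |||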
| docker run --rm -it $TMPFS_ARGS \ | |||
| -e UID=${USERID} \ | |||
| -e LOCAL_VERSION=${LOCAL_VERSION} \ | |||
| -e BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} \ | |||
| -e ALL_PYTHON="${ALL_PYTHON}" \ | |||
| -e EXTRA_CMAKE_FLAG="$EXTRA_CMAKE_FLAG" \ | |||
| -e COPY_LIB_LIST="$COPY_LIB_LIST" \ | |||
| -e OUT_DIR="$OUT_DIR" \ | |||
| -v ${CUDA_ROOT_DIR}:/usr/local/cuda \ | |||
| -v ${CUDNN_ROOT_DIR}:/opt/cudnn \ | |||
| -v ${TENSORRT_ROOT_DIR}:/opt/tensorrt \ | |||
| -v ${BASEDIR}:/home/code \ | |||
| -v ${OUTPUTDIR}:/home/output:rw \ | |||
| env_manylinux2014:latest /bin/bash -c "$run_cmd" | |||
| @@ -0,0 +1,136 @@ | |||
| #!/bin/bash -ex | |||
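| # handle_strip: strip debug info from a binary while recording a gnu_debuglink entry (the temporary .dbg file is removed afterwards). | |||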
| function handle_strip() { | |||
| echo "now handle strip $1" | |||
| objcopy --only-keep-debug $1 $1.dbg | |||
| strip -s $1 | |||
| objcopy --add-gnu-debuglink=$1.dbg $1 | |||
| rm $1.dbg | |||
| } | |||
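| # full_copy_so: copy a shared library into the target directory, optionally appending an entry to its rpath. | |||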
| function full_copy_so(){ | |||
| lib_path=$1 | |||
| dst_dir=$2 | |||
| append_rpath=$3 | |||
| lib_name=$(basename $lib_path) | |||
| cp $lib_path $dst_dir/$lib_name | |||
| if [ "$append_rpath" != "" ];then | |||
| ori_rpath=$(patchelf --print-rpath $dst_dir/$lib_name) | |||
| if [ "$ori_rpath" != "" ];then | |||
| patchelf --set-rpath "$ori_rpath:$append_rpath" $dst_dir/$lib_name | |||
| else | |||
| patchelf --set-rpath "$append_rpath" $dst_dir/$lib_name | |||
| fi | |||
| fi | |||
| } | |||
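| # patch_elf_depend_lib: manually bundle dependent shared libraries and rewrite rpaths; only used when USE_AUDITWHEEL is OFF. | |||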
| function patch_elf_depend_lib() { | |||
| echo "handle common depend lib" | |||
| LIBS_DIR=${BUILD_DIR}/staging/megengine/core/lib | |||
| mkdir -p ${LIBS_DIR} | |||
| cp /usr/lib64/libatomic.so.1 ${LIBS_DIR} | |||
| patchelf --remove-rpath ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||
| patchelf --force-rpath --set-rpath '$ORIGIN/lib' ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||
| cp ${BUILD_DIR}/src/libmegengine_export.so ${LIBS_DIR} | |||
| patchelf --remove-rpath ${LIBS_DIR}/libmegengine_export.so | |||
| patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so | |||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
| echo "handle cuda lib" | |||
| cp ${BUILD_DIR}/dnn/cuda-stub/libcuda_stub.so ${LIBS_DIR} | |||
| cp /usr/local/cuda/lib64/libnvToolsExt.so.1 ${LIBS_DIR} | |||
| IFS=: read -a lib_name_array <<<"$COPY_LIB_LIST" | |||
| append_rpath='$ORIGIN/.' | |||
| for lib_name in ${lib_name_array[@]};do | |||
| full_copy_so $lib_name ${LIBS_DIR} $append_rpath | |||
| done | |||
| fi | |||
| } | |||
| ALL_PYTHON=${ALL_PYTHON} | |||
| if [[ -z ${ALL_PYTHON} ]] | |||
| then | |||
| ALL_PYTHON="35m 36m 37m 38" | |||
| fi | |||
| BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||
| if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||
| then | |||
| BUILD_WHL_CPU_ONLY="OFF" | |||
| fi | |||
| SRC_DIR=$(readlink -f "`dirname $0`/../../../") | |||
| BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
| BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||
| fi | |||
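| # NEW_LIB_PATH is handed to 'auditwheel repair -L' as the in-wheel directory for relocated libraries. | |||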
| NEW_LIB_PATH=core/lib | |||
| for ver in ${ALL_PYTHON} | |||
| do | |||
| USE_AUDITWHEEL="ON" | |||
| python_ver=${ver:0:2} | |||
| MAJOR=${python_ver:0:1} | |||
| MINOR=${ver:1} | |||
| PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/ | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} ${EXTRA_CMAKE_FLAG}" | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo" | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${PYTHON_DIR}" | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3" | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_LIBRARY=${PYTHON_DIR}lib/" | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}" | |||
| export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DMGE_WITH_ATLAS=ON" | |||
| if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
| ${SRC_DIR}/scripts/cmake-build/host_build.sh -c -t -r | |||
| else | |||
| ${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r | |||
| fi | |||
| cd ${BUILD_DIR} | |||
| rm -rf staging | |||
| mkdir -p staging | |||
| cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ | |||
| handle_strip ${BUILD_DIR}/src/libmegengine_export.so | |||
| cd ${BUILD_DIR}/staging/megengine/core | |||
| handle_strip _imperative_rt.so | |||
| mkdir -p lib/ucx | |||
| if [ ${USE_AUDITWHEEL} = "OFF" ]; then | |||
| patch_elf_depend_lib | |||
| fi | |||
| cd ${BUILD_DIR}/staging/ | |||
| ${PYTHON_DIR}/bin/python setup.py bdist_wheel | |||
| cd /home/output | |||
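| # Either let auditwheel repair the wheel, or simply retag it as manylinux2014 and move it to the output wheelhouse. | |||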
| if [ ${USE_AUDITWHEEL} = "ON" ]; then | |||
| LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl | |||
| else | |||
| mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR} | |||
| cd ${BUILD_DIR}/staging/dist/ | |||
| org_whl_name=`ls Meg*${ver}*.whl` | |||
| compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'` | |||
| echo "org whl name: ${org_whl_name}" | |||
| echo "comapt whl name: ${compat_whl_name}" | |||
| mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR}/${compat_whl_name} | |||
| cd /home/output | |||
| fi | |||
| chown -R ${UID}.${UID} . | |||
| # compatibility with rootless docker environments: make the output removable on the host side | |||
| chmod -R 777 . | |||
| echo "python $ver done" | |||
| done | |||
| @@ -0,0 +1,70 @@ | |||
| #!/bin/bash -e | |||
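| # Prepare the manylinux2014 image: install pip/numpy for every Python, then build swig, llvm, clang and patchelf from source. | |||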
| GET_PIP_URL='https://bootstrap.pypa.io/get-pip.py' | |||
| SWIG_URL='https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz?use_mirror=autoselect' | |||
| LLVM_URL='https://github.com/llvm-mirror/llvm/archive/release_60.tar.gz' | |||
| CLANG_URL='https://github.com/llvm-mirror/clang/archive/release_60.tar.gz' | |||
| yum install -y pcre-devel devtoolset-9-libatomic-devel.x86_64 | |||
| yum install -y devtoolset-8 devtoolset-8-libatomic-devel.x86_64 | |||
| for ver in 35m 36m 37m 38 | |||
| do | |||
| python_ver=${ver:0:2} | |||
| curl ${GET_PIP_URL} | /opt/python/cp${python_ver}-cp${ver}/bin/python - \ | |||
| --no-cache-dir --only-binary :all: | |||
| /opt/python/cp${python_ver}-cp${ver}/bin/pip install \ | |||
| --no-cache-dir --only-binary :all: numpy==1.18.1 setuptools==46.1.3 | |||
| done | |||
| pushd /home >/dev/null | |||
| echo "Install swig" | |||
| curl -sSL ${SWIG_URL} | tar xz | |||
| pushd swig-3.0.12 >/dev/null | |||
| mkdir build | |||
| pushd build >/dev/null | |||
| ../configure | |||
| make -j$(nproc) | |||
| make install | |||
| popd >/dev/null | |||
| popd >/dev/null | |||
| rm -rf swig-3.0.12 | |||
| echo "Install llvm" | |||
| curl -sSL ${LLVM_URL} | tar xz | |||
| pushd llvm-release_60 >/dev/null | |||
| mkdir build | |||
| pushd build >/dev/null | |||
| cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||
| -DCMAKE_BUILD_TYPE=Release | |||
| make -j$(nproc) | |||
| make install | |||
| popd >/dev/null | |||
| popd >/dev/null | |||
| rm -rf llvm-release_60 | |||
| echo "Install clang" | |||
| curl -sSL ${CLANG_URL} | tar xz | |||
| pushd clang-release_60 >/dev/null | |||
| mkdir build | |||
| pushd build >/dev/null | |||
| cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||
| -DCMAKE_BUILD_TYPE=Release | |||
| make -j$(nproc) | |||
| make install | |||
| popd >/dev/null | |||
| popd >/dev/null | |||
| rm -rf clang-release_60 | |||
| popd >/dev/null | |||
| pushd /tmp >/dev/null | |||
| curl -sSL https://github.com/NixOS/patchelf/archive/0.12.tar.gz | tar xz | |||
| pushd /tmp/patchelf-0.12 >/dev/null | |||
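| # The sed below patches a constant in patchelf 0.12 before building (presumably a workaround needed by this build). | |||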
| sed -i '331s/32/64/' ./src/patchelf.cc | |||
| ./bootstrap.sh && ./configure && make install-strip | |||
| popd | |||
| rm -rf /tmp/patchelf-0.12 | |||
| popd | |||
| yum clean all | |||
| @@ -0,0 +1,65 @@ | |||
| #!/bin/bash | |||
| set -e | |||
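| # Configure yum and pip mirrors for the given platform; platforms other than brainpp are left untouched. | |||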
| function set_tuna_yum_mirror() { | |||
| cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak | |||
| local repo=/etc/yum.repos.d/CentOS-Base.repo | |||
| local plugin=/etc/yum/pluginconf.d/fastestmirror.conf | |||
| sed -i "s/mirrorlist=/#mirrorlist=/g" $repo | |||
| sed -i "s/#baseurl/baseurl/g" $repo | |||
| sed -i "s/mirror.centos.org/mirrors.tuna.tsinghua.edu.cn/g" $repo | |||
| sed -i "s/http/https/g" $repo | |||
| sed -i "s/enabled=1/enabled=0/g" $plugin | |||
| yum clean all | |||
| # Builds on brainpp are unable to pull epel repo metadata, so disable it | |||
| # https://unix.stackexchange.com/questions/148144/unable-to-pull-epel-repository-metadata | |||
| yum --disablerepo="epel" update nss | |||
| yum makecache | |||
| } | |||
| function set_epel() { | |||
| mv /etc/yum.repos.d/epel.repo /etc/yum.repos.d/epel.repo.backup | |||
| mv /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel-testing.repo.backup | |||
| curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo | |||
| } | |||
| function set_yum_mirror() { | |||
| mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.backup | |||
| curl -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo | |||
| yum makecache | |||
| } | |||
| function set_pip_mirror() { | |||
| cat > /etc/pip.conf <<EOF | |||
| [global] | |||
| timeout = 180 | |||
| index-url = https://mirrors.aliyun.com/pypi/simple | |||
| extra-index-url = | |||
| http://mirrors.i.brainpp.cn/pypi/simple/ | |||
| http://pypi.i.brainpp.cn/brain/dev/+simple | |||
| https://pypi.tuna.tsinghua.edu.cn/simple | |||
| trusted-host = | |||
| mirrors.i.brainpp.cn | |||
| pypi.i.brainpp.cn | |||
| pypi.tuna.tsinghua.edu.cn | |||
| mirrors.aliyun.com | |||
| EOF | |||
| } | |||
| function main() { | |||
| local platform=$1 | |||
| case $platform in | |||
| brainpp) | |||
| set_epel | |||
| set_yum_mirror | |||
| set_pip_mirror | |||
| ;; | |||
| *) | |||
| echo "No setup required" | |||
| ;; | |||
| esac | |||
| } | |||
| main "$@" | |||
| @@ -81,8 +81,9 @@ void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp, | |||
| init(opr, inp.shape()); | |||
| auto inp_ptr = inp.ptr<float>(); | |||
| auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace.raw_ptr()), | |||
| dev_rm_mask = reinterpret_cast<uint64_t*>( | |||
| void* workspace_ptr = workspace.raw_ptr(); | |||
| auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr), | |||
| dev_rm_mask = (uint64_t*)( | |||
| workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); | |||
| auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()), | |||
| out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>()); | |||
| @@ -27,6 +27,9 @@ | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/gopt/misc.h" | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
| using namespace mgb; | |||
| using namespace gopt; | |||
| using namespace cg; | |||
| @@ -1749,6 +1752,7 @@ void mgb::tensorrt::transform_dest_vars_inplace( | |||
| optimizer.apply_inplace(dest_vars); | |||
| } | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -20,7 +20,8 @@ | |||
| #include "megbrain/utils/debug.h" | |||
| #if MGB_ENABLE_TENSOR_RT | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
| #include "megbrain/tensorrt/tensorrt_opr.h" | |||
| #include "make_trt_net.h" | |||
| @@ -111,7 +112,8 @@ intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() { | |||
| host_b = range_gen({1, 8, 1, 1}); | |||
| { | |||
| float* ptr = reinterpret_cast<float*>(host_w->raw_ptr()); | |||
| void* w_ptr = host_w->raw_ptr(); | |||
| float* ptr = reinterpret_cast<float*>(w_ptr); | |||
| ptr[0] = -127*1.1f; | |||
| ptr[1] = 127*1.1f; | |||
| } | |||
| @@ -362,6 +364,7 @@ intl::ConcatConvTensorRTNetwork::create_trt_network(bool has_batch_dim) { | |||
| return std::make_pair(builder, network); | |||
| } | |||
| #pragma GCC diagnostic pop | |||
| #endif // MGB_ENABLE_TENSOR_RT | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||