GitOrigin-RevId: b0ad639921
tags/v1.3.1
@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
 option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
 option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
 option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
-option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
+option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
+option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
 option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
 option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
+if(MSVC OR WIN32)
+    message(STATUS "windows force cudnn static link")
+    set(MGE_WITH_CUDNN_SHARED OFF)
+endif()
 if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
     set(MGE_WITH_ANY_CUDA_STUB ON)
 else()
@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
         endif()
     endif()
     if(MSVC OR WIN32)
-        list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
+        list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
     else()
-        list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
+        list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
+    endif()
+    if(MSVC OR WIN32)
+        list(APPEND MGE_CUDA_LIBS cublas.lib)
+    else()
+        if(MGE_WITH_CUBLAS_SHARED)
+            list(APPEND MGE_CUDA_LIBS cublas)
+        else()
+            list(APPEND MGE_CUDA_LIBS cublas_static)
+        endif()
     endif()
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
         if(MSVC OR WIN32)
             list(APPEND MGE_CUDA_LIBS cublasLt.lib)
         else()
-            list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            if(MGE_WITH_CUBLAS_SHARED)
+                list(APPEND MGE_CUDA_LIBS cublasLt)
+            else()
+                list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            endif()
         endif()
     endif()
     if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
@@ -54,6 +54,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
 #if CUDA_VERSION >= 10010
     megdnn_assert(cublasLtGetVersion() >= 10010,
                   "cuda library version is too low to run cublasLt");
 #endif
+#if CUDNN_VERSION >= 8000
+    megdnn_log_warn(R"(
+Cudnn8 will jit ptx code with cache. You can set
+CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow).
+For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
+#endif
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
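The warning added above points at the CUDA driver's JIT cache. For illustration only, a minimal Python sketch of acting on that advice from a training script (the env-var names and values are the ones quoted in the warning; configuring them from Python rather than the shell is an assumption, and it only takes effect if done before the CUDA driver initializes in this process):

    import os

    # Must run before the CUDA driver is initialized, i.e. before importing
    # megengine (or any other CUDA-using library) in this process.
    os.environ["CUDA_CACHE_MAXSIZE"] = "2147483647"      # max JIT cache size, per the warning
    os.environ["CUDA_CACHE_PATH"] = "/data/.cuda_cache"  # keep the cache on persistent storage

    import megengine  # imported only after the cache is configured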
@@ -199,4 +199,4 @@ def test_dp_correctness():
     model_name = "mnist_model_with_test.mge"
     model_path = os.path.join(os.path.dirname(__file__), model_name)
     set_execution_strategy("HEURISTIC_REPRODUCIBLE")
-    run_test(model_path, False, False, max_err=1e-5)
+    run_test(model_path, False, False, max_err=5e-5)
@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
 from megengine.utils.network import Network as Net
 
 
-def check_pygraph_dump(trace_func, inp_data, expect_results):
+def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
     orig_model = io.BytesIO()
     inp_size = len(inp_data)
     out_size = len(expect_results)
@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
     results = graph.run(inp_dict=inp_dict)
     for ind, tensor in enumerate(expect_results):
-        np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
+        if max_err:
+            np.testing.assert_almost_equal(
+                tensor.numpy(), results[output_names[ind]], max_err
+            )
+        else:
+            np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
         assert tensor.dtype == results[output_names[ind]].dtype
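A note on the tolerance introduced above: the third positional argument of np.testing.assert_almost_equal is decimal, a count of decimal places, not an absolute error bound, so the max_err=5 passed in the next hunk asserts abs(desired - actual) < 1.5e-5. A self-contained illustration:

    import numpy as np

    a = np.array([1.000001])
    b = np.array([1.000011])  # differs from a by 1e-5

    # Passes: agreement to 5 decimal places means |a - b| < 1.5e-5.
    np.testing.assert_almost_equal(a, b, 5)

    # Raises: 6 decimal places would require |a - b| < 1.5e-6.
    try:
        np.testing.assert_almost_equal(a, b, 6)
    except AssertionError:
        print("differs at 6 decimal places")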
@@ -178,7 +183,8 @@ def test_convtranspose():
     data = Tensor(np.random.random((1, 32, 32, 32)))
     result = fwd(data)
-    check_pygraph_dump(fwd, [data], [result])
+    # cu111 has 1e-7 diff
+    check_pygraph_dump(fwd, [data], [result], 5)
 
 
 @pytest.mark.skip(reason="pytest aborted")
@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
 if [ $SDK_NAME == "cu101" ];then
     CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
     BUILD_GCC8="ON"
     REQUIR_CUDA_VERSION="10010"
     REQUIR_CUDNN_VERSION="7.6.3"
@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
 -gencode arch=compute_61,code=sm_61 \
 arch=compute_70,code=sm_70 \
 arch=compute_75,code=sm_75 \
@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
 -gencode arch=compute_61,code=sm_61 \
 arch=compute_70,code=sm_70 \
 arch=compute_75,code=sm_75 \
@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async cuda init failed: %s", exc.what());
@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async rocm init failed: %s", exc.what());
@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }
 
-//! close for cu111 ci, reopen it when bug fixed
-#if CUDA_VERSION < 11000
 TEST(TestEnableTensorCore, Nchw4Nchw) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
         MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
     }
 }
-#endif
 
 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);