GitOrigin-RevId: b0ad639921
tags/v1.3.1
| @@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON) | |||||
| option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) | option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) | ||||
| option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON) | option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON) | ||||
| option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF) | option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF) | ||||
| option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF) | |||||
| option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON) | |||||
| option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF) | |||||
| option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) | option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) | ||||
| option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) | option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) | ||||
| option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF) | option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF) | ||||
| @@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF) | |||||
| option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF) | option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF) | ||||
| if(MSVC OR WIN32) | |||||
| message(STATUS "windows force cudnn static link") | |||||
| set(MGE_WITH_CUDNN_SHARED OFF) | |||||
| endif() | |||||
| if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) | if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) | ||||
| set(MGE_WITH_ANY_CUDA_STUB ON) | set(MGE_WITH_ANY_CUDA_STUB ON) | ||||
| else() | else() | ||||
| @@ -472,15 +478,28 @@ if(MGE_WITH_CUDA) | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) | |||||
| list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static) | |||||
| endif() | |||||
| if(MSVC OR WIN32) | |||||
| list(APPEND MGE_CUDA_LIBS cublas.lib) | |||||
| else() | else() | ||||
| list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) | |||||
| if(MGE_WITH_CUBLAS_SHARED) | |||||
| list(APPEND MGE_CUDA_LIBS cublas) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS cublas_static) | |||||
| endif() | |||||
| endif() | endif() | ||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | ||||
| if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
| list(APPEND MGE_CUDA_LIBS cublasLt.lib) | list(APPEND MGE_CUDA_LIBS cublasLt.lib) | ||||
| else() | else() | ||||
| list(APPEND MGE_CUDA_LIBS cublasLt_static) | |||||
| if(MGE_WITH_CUBLAS_SHARED) | |||||
| list(APPEND MGE_CUDA_LIBS cublasLt) | |||||
| else() | |||||
| list(APPEND MGE_CUDA_LIBS cublasLt_static) | |||||
| endif() | |||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) | if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) | ||||
| @@ -54,6 +54,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle): | |||||
| #if CUDA_VERSION >= 10010 | #if CUDA_VERSION >= 10010 | ||||
| megdnn_assert(cublasLtGetVersion() >= 10010, | megdnn_assert(cublasLtGetVersion() >= 10010, | ||||
| "cuda library version is too low to run cublasLt"); | "cuda library version is too low to run cublasLt"); | ||||
| #endif | |||||
| #if CUDNN_VERSION >= 8000 | |||||
| megdnn_log_warn(R"( | |||||
| Cudnn8 will jit ptx code with cache. You can set | |||||
| CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow). | |||||
| For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)"); | |||||
| #endif | #endif | ||||
| cudnn_check(cudnnCreate(&m_cudnn_handle)); | cudnn_check(cudnnCreate(&m_cudnn_handle)); | ||||
| cublas_check(cublasCreate(&m_cublas_handle)); | cublas_check(cublasCreate(&m_cublas_handle)); | ||||
| @@ -199,4 +199,4 @@ def test_dp_correctness(): | |||||
| model_name = "mnist_model_with_test.mge" | model_name = "mnist_model_with_test.mge" | ||||
| model_path = os.path.join(os.path.dirname(__file__), model_name) | model_path = os.path.join(os.path.dirname(__file__), model_name) | ||||
| set_execution_strategy("HEURISTIC_REPRODUCIBLE") | set_execution_strategy("HEURISTIC_REPRODUCIBLE") | ||||
| run_test(model_path, False, False, max_err=1e-5) | |||||
| run_test(model_path, False, False, max_err=5e-5) | |||||
| @@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference | |||||
| from megengine.utils.network import Network as Net | from megengine.utils.network import Network as Net | ||||
| def check_pygraph_dump(trace_func, inp_data, expect_results): | |||||
| def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None): | |||||
| orig_model = io.BytesIO() | orig_model = io.BytesIO() | ||||
| inp_size = len(inp_data) | inp_size = len(inp_data) | ||||
| out_size = len(expect_results) | out_size = len(expect_results) | ||||
| @@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results): | |||||
| results = graph.run(inp_dict=inp_dict) | results = graph.run(inp_dict=inp_dict) | ||||
| for ind, tensor in enumerate(expect_results): | for ind, tensor in enumerate(expect_results): | ||||
| np.testing.assert_equal(tensor.numpy(), results[output_names[ind]]) | |||||
| if max_err: | |||||
| np.testing.assert_almost_equal( | |||||
| tensor.numpy(), results[output_names[ind]], max_err | |||||
| ) | |||||
| else: | |||||
| np.testing.assert_equal(tensor.numpy(), results[output_names[ind]]) | |||||
| assert tensor.dtype == results[output_names[ind]].dtype | assert tensor.dtype == results[output_names[ind]].dtype | ||||
| @@ -178,7 +183,8 @@ def test_convtranspose(): | |||||
| data = Tensor(np.random.random((1, 32, 32, 32))) | data = Tensor(np.random.random((1, 32, 32, 32))) | ||||
| result = fwd(data) | result = fwd(data) | ||||
| check_pygraph_dump(fwd, [data], [result]) | |||||
| # cu111 has 1e-7 diff | |||||
| check_pygraph_dump(fwd, [data], [result], 5) | |||||
| @pytest.mark.skip(reason="pytest aborted") | @pytest.mark.skip(reason="pytest aborted") | ||||
| @@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}" | |||||
| if [ $SDK_NAME == "cu101" ];then | if [ $SDK_NAME == "cu101" ];then | ||||
| CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | ||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF" | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||||
| BUILD_GCC8="ON" | BUILD_GCC8="ON" | ||||
| REQUIR_CUDA_VERSION="10010" | REQUIR_CUDA_VERSION="10010" | ||||
| REQUIR_CUDNN_VERSION="7.6.3" | REQUIR_CUDNN_VERSION="7.6.3" | ||||
| @@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | ||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ||||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | ${CUDNN_LIB_DIR}/libcudnn.so.8" | ||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\ | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \ | |||||
| -gencode arch=compute_61,code=sm_61 \ | -gencode arch=compute_61,code=sm_61 \ | ||||
| arch=compute_70,code=sm_70 \ | arch=compute_70,code=sm_70 \ | ||||
| arch=compute_75,code=sm_75 \ | arch=compute_75,code=sm_75 \ | ||||
| @@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then | |||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | ||||
| ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ||||
| ${CUDNN_LIB_DIR}/libcudnn.so.8" | ${CUDNN_LIB_DIR}/libcudnn.so.8" | ||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \ | |||||
| EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \ | |||||
| -gencode arch=compute_61,code=sm_61 \ | -gencode arch=compute_61,code=sm_61 \ | ||||
| arch=compute_70,code=sm_70 \ | arch=compute_70,code=sm_70 \ | ||||
| arch=compute_75,code=sm_75 \ | arch=compute_75,code=sm_75 \ | ||||
| @@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node, | |||||
| mgb_assert( | mgb_assert( | ||||
| m_property.mem_alignment == | m_property.mem_alignment == | ||||
| MegDNNHandle::get(*this).handle()->alignment_requirement()); | MegDNNHandle::get(*this).handle()->alignment_requirement()); | ||||
| auto err = atexit(&CompNode::finalize); | |||||
| mgb_assert(!err, "failed to register CompNode::finalize at exit"); | |||||
| } | } | ||||
| MGB_CATCH(std::exception & exc, { | MGB_CATCH(std::exception & exc, { | ||||
| mgb_log_error("async cuda init failed: %s", exc.what()); | mgb_log_error("async cuda init failed: %s", exc.what()); | ||||
| @@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node, | |||||
| mgb_assert( | mgb_assert( | ||||
| m_property.mem_alignment == | m_property.mem_alignment == | ||||
| MegDNNHandle::get(*this).handle()->alignment_requirement()); | MegDNNHandle::get(*this).handle()->alignment_requirement()); | ||||
| auto err = atexit(&CompNode::finalize); | |||||
| mgb_assert(!err, "failed to register CompNode::finalize at exit"); | |||||
| } | } | ||||
| MGB_CATCH(std::exception & exc, { | MGB_CATCH(std::exception & exc, { | ||||
| mgb_log_error("async rocm init failed: %s", exc.what()); | mgb_log_error("async rocm init failed: %s", exc.what()); | ||||
| @@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) { | |||||
| MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | ||||
| } | } | ||||
| //! close for cu111 ci, reopen it when bug fixed | |||||
| #if CUDA_VERSION < 11000 | |||||
| TEST(TestEnableTensorCore, Nchw4Nchw) { | TEST(TestEnableTensorCore, Nchw4Nchw) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||
| auto cn = CompNode::load("gpu0"); | auto cn = CompNode::load("gpu0"); | ||||
| @@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) { | |||||
| MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt); | ||||
| } | } | ||||
| } | } | ||||
| #endif | |||||
| TEST(TestEnableTensorCore, ConvBiasWithZ) { | TEST(TestEnableTensorCore, ConvBiasWithZ) { | ||||
| REQUIRE_GPU(1); | REQUIRE_GPU(1); | ||||