disable tensor op matmul kernels when input and output tensors are in f32 data type to avoid potential accuracy loss
GitOrigin-RevId: 36859cba5a
tags/v1.6.0
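
For reference, the patch enforces this policy inside the cuBLASLt heuristic query via the CUBLASLT_MATMUL_PREF_MATH_MODE_MASK preference, so cublasLtMatmulAlgoGetHeuristic never proposes tensor-op algorithms for pure-f32 problems. A minimal sketch of the same idea on a plain cuBLAS handle is shown below; it is an illustration only, not part of this patch, and the function name, handle, and m/n/k arguments are made up for the example.

// Illustration only (not part of this patch): force default (non tensor-op)
// math for a pure-f32 GEMM so no reduced-precision HMMA kernel is selected.
#include <cuda.h>
#include <cublas_v2.h>

void sgemm_f32_no_tensor_ops(cublasHandle_t handle, int m, int n, int k,
                             const float* A, const float* B, float* C) {
#if CUDA_VERSION < 11000
    // On CUDA 10.x, CUBLAS_TENSOR_OP_MATH may allow f32 GEMMs to run on
    // fp16 tensor cores; CUBLAS_DEFAULT_MATH keeps full f32 arithmetic.
    cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
#endif
    const float alpha = 1.f, beta = 0.f;
    // Column-major C = A * B, following cuBLAS conventions.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, m, B, k,
                &beta, C, m);
}
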
@@ -313,6 +313,19 @@ bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args,
     cublas_check(cublasLtMatmulPreferenceSetAttribute(
             algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit,
             sizeof(algo_ws_limit)));
+#if CUDA_VERSION < 11000
+    bool is_f32_config = args.layout_a.dtype == dtype::Float32() &&
+                         args.layout_b.dtype == dtype::Float32() &&
+                         args.layout_c.dtype == dtype::Float32();
+    if (is_f32_config) {
+        // disable HMMA tensor op matmul when inputs and output are all f32
+        // tensors, to avoid the potential accuracy loss
+        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
+        cublas_check(cublasLtMatmulPreferenceSetAttribute(
+                algo_pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
+                sizeof(math_mode)));
+    }
+#endif
     status = cublasLtMatmulAlgoGetHeuristic(
             cublasLt_handle, matmul_desc,
             dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
@@ -215,6 +215,7 @@ std::vector<BenchArgs> get_feat_model_args() {
     return args;
 }
 
+#if CUDA_VERSION >= 10020
 std::vector<BenchArgs> get_f16_feat_model_args() {
     std::vector<BenchArgs> args;
     args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -222,6 +223,7 @@ std::vector<BenchArgs> get_f16_feat_model_args() {
     args.emplace_back(BenchArgs{128, 5184, 5184});
     return args;
 }
+#endif
 
 void benchmark_matrix_mul(
         Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
@@ -473,7 +473,34 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
             .execs({A, B, {}});
     }
 }
+
+TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
+    require_compute_capability(7, 5);
+    size_t m = 128, n = 1024, k = 18432;
+    Checker<MatrixMul> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
+    using Param = MatrixMul::Param;
+    Param param;
+    DType stype = dtype::Float32();
+    DType dtype = dtype::Float32();
+    TensorShape A, B;
+    param.transposeA = param.transposeB = 0;
+    if (param.transposeA)
+        A = TensorShape{k, m};
+    else
+        A = TensorShape{m, k};
+    if (param.transposeB)
+        B = TensorShape{n, k};
+    else
+        B = TensorShape{k, n};
+    checker.set_param(param)
+            .set_dtype(0, stype)
+            .set_dtype(1, stype)
+            .set_dtype(2, dtype)
+            .execs({A, B, {}});
+}
 } // namespace test
 } // namespace megdnn
 
 // vim: syntax=cpp.doxygen