@@ -44,6 +44,9 @@ std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
     B.dtype = src_layout.dtype;
     C = {{dst_layout.shape[0], dst_layout.shape[1], B.shape[2]},
          dst_layout.dtype};
+    C.stride[2] = 1;
+    C.stride[1] = dst_layout.stride[1];
+    C.stride[0] = dst_layout.stride[0];
     MatrixMulForward::Param param;
     if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) {
@@ -89,6 +92,8 @@ bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available(
         return false;
     auto config = prepare_sub_opr(args);
+    //! The dst of batched matmul should be contiguous
+    if (!config.first[2].is_contiguous()) return false;
     auto&& fm = args.filter_meta;
     return fm.format == Param::Format::NCHW &&
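Together with the stride copy in the previous hunk, this guard appears to restrict AlgoBatchedMatmul to cases where the matmul's C layout (and hence the conv dst it aliases) is compact. As a rough, self-contained illustration of what "contiguous" means for a strided layout (ignoring the size-1-dim special cases that the real is_contiguous() handles), consider:

    #include <cstddef>
    #include <vector>

    // Sketch: a layout is compact when each dim's stride equals the product of
    // the sizes of all faster-varying dims, innermost stride being 1.
    bool is_compact(const std::vector<size_t>& shape,
                    const std::vector<std::ptrdiff_t>& stride) {
        std::ptrdiff_t expect = 1;
        for (size_t i = shape.size(); i-- > 0;) {
            if (stride[i] != expect)
                return false;
            expect *= static_cast<std::ptrdiff_t>(shape[i]);
        }
        return true;
    }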
@@ -109,7 +109,8 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec(
     auto sub_args = args;
     sub_args.dst_tensor = &conv_dst_tensor;
     sub_args.dst_layout = &conv_dst_tensor.layout;
-    TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor}, tbias{*args.bias_tensor};
+    TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor},
+            tbias{*args.bias_tensor};
     SmallVector<size_t> flt_shape(0);
     std::vector<ptrdiff_t> flt_stride(0);
     size_t idx = 0;
@@ -17,6 +17,8 @@
 using namespace megdnn;
 using namespace test;
 
+constexpr size_t matrix_mul::TestArg::UNSET_STRIDE_VAL;
+
 std::vector<matrix_mul::TestArg> matrix_mul::get_matmul_args_no_mask() {
     std::vector<TestArg> args;
@@ -57,7 +59,9 @@ matrix_mul::get_batched_matmul_args_cublaslt() {
             // so please uncomment it if the bug is fixed
             for (size_t k : {32, 64}) {
-                args.emplace_back(m, n, k, 0, 0, 0, 0, 2);
+                args.emplace_back(m, n, k, 0, TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL, 2);
             }
         }
     }
@@ -70,7 +74,9 @@ matrix_mul::get_batched_matmul_args_int8x8x32() {
     for (size_t m : {1, 2, 3, 4, 5, 8, 64}) {
         for (size_t n : {1, 2, 3, 4, 5, 8, 64}) {
             for (size_t k : {1, 2, 3, 4, 5, 8, 64}) {
-                args.emplace_back(m, n, k, 0, 0, 0, 0, 2);
+                args.emplace_back(m, n, k, 0, TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL,
+                                  TestArg::UNSET_STRIDE_VAL, 2);
            }
        }
    }
@@ -136,6 +142,30 @@ std::vector<matrix_mul::TestArg> matrix_mul::get_batched_matmul_args() {
     return args;
 }
 
+std::vector<matrix_mul::TestArg>
+matrix_mul::get_batched_matmul_broadcast_args() {
+    std::vector<TestArg> args;
+    for (size_t mask = 0; mask < 4; ++mask) {
+        std::vector<TestArg> args_temp =
+                matrix_mul::get_batched_matmul_broadcast_args_mask(mask);
+        for (auto arg : args_temp)
+            args.emplace_back(arg);
+    }
+    return args;
+}
+
+std::vector<matrix_mul::TestArg>
+matrix_mul::get_batched_matmul_broadcast_args_mask(uint8_t mask) {
+    std::vector<TestArg> args;
+    std::vector<TestArg> args_temp =
+            matrix_mul::get_batched_matmul_args_mask(mask);
+    for (auto arg : args_temp) {
+        args.emplace_back(arg);
+        args.back().A_batch_stride = 0;
+    }
+    return args;
+}
+
 template <typename Opr>
 void matrix_mul::check_matrix_mul(DType A_dtype, DType B_dtype, DType C_dtype,
                                   Handle* handle,
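These generators reuse the existing batched-matmul cases and only pin A_batch_stride to 0, so every batch of A aliases the same m x k matrix. A hedged sketch of the layout that corresponds to, using the TensorLayout(shape, stride, dtype) constructor already used in the CUDA tests below (header paths are assumptions):

    #include "megdnn/basic_types.h"  // TensorLayout, TensorShape (assumed path)
    #include "megdnn/dtype.h"        // dtype::Float32 (assumed path)

    using namespace megdnn;

    // Sketch: layout of a batch-broadcast A for b batches of an (m x k) matrix.
    // A batch stride of 0 makes every batch read the very same matrix.
    TensorLayout make_broadcast_A(size_t b, size_t m, size_t k) {
        return TensorLayout{{b, m, k},
                            {0, ptrdiff_t(k), 1},  // {batch, row, col} strides
                            dtype::Float32()};
    }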
@@ -170,9 +200,9 @@ void matrix_mul::check_matrix_mul(DType A_dtype, DType B_dtype, DType C_dtype,
         checker.set_rng(0, rng.get()).set_rng(1, rng.get());
     }
 
-    //! return expect if stride == 0, stride otherwise
+    //! return expect if stride == -1, stride otherwise
     auto stride_val = [](size_t stride, size_t expect) -> size_t {
-        if (stride == 0) {
+        if (stride == TestArg::UNSET_STRIDE_VAL) {
             return expect;
         } else {
             return stride;
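With the sentinel in place, a stride of 0 is no longer swallowed as "use the default"; it survives and can encode a broadcast dimension. The snippet below restates the lambda's contract as a hypothetical standalone helper (not part of the test suite) with example values:

    #include <cassert>
    #include <cstddef>

    // UNSET_STRIDE_VAL falls back to the compact default, while any other
    // value -- including 0, which now encodes a broadcast -- is kept as-is.
    constexpr size_t kUnsetStride = static_cast<size_t>(-1);

    size_t stride_or_default(size_t stride, size_t expect) {
        return stride == kUnsetStride ? expect : stride;
    }

    void stride_examples() {
        assert(stride_or_default(kUnsetStride, 64) == 64);  // compact default
        assert(stride_or_default(0, 64) == 0);               // broadcast stride kept
        assert(stride_or_default(128, 64) == 128);           // padded stride kept
    }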
@@ -24,15 +24,19 @@ namespace matrix_mul {
 // mask & 1 denotes transposeA; mask & 2 denotes transposeB
 struct TestArg {
+    constexpr static size_t UNSET_STRIDE_VAL = static_cast<size_t>(-1);
     size_t m, n, k, mask;
     size_t A_stride, B_stride, C_stride, b;
     size_t A_batch_stride, B_batch_stride, C_batch_stride;
-    // stride = 0 means the default stride, the dim is contiguous, i.e. the
-    // stride value which makes tensor compact.
+    // stride = UNSET_STRIDE_VAL means the default stride: the dim is
+    // contiguous, i.e. the stride value which makes the tensor compact.
-    TestArg(size_t m, size_t n, size_t k, size_t mask, size_t A_stride = 0,
-            size_t B_stride = 0, size_t C_stride = 0, size_t b = 1,
-            size_t A_batch_stride = 0, size_t B_batch_stride = 0,
-            size_t C_batch_stride = 0)
+    TestArg(size_t m, size_t n, size_t k, size_t mask,
+            size_t A_stride = UNSET_STRIDE_VAL,
+            size_t B_stride = UNSET_STRIDE_VAL,
+            size_t C_stride = UNSET_STRIDE_VAL, size_t b = 1,
+            size_t A_batch_stride = UNSET_STRIDE_VAL,
+            size_t B_batch_stride = UNSET_STRIDE_VAL,
+            size_t C_batch_stride = UNSET_STRIDE_VAL)
             : m{m},
               n{n},
               k{k},
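For illustration, a broadcast test case would leave the plain strides at the UNSET_STRIDE_VAL default and pin only the batch stride of the broadcast operand to 0. A hypothetical example (sizes made up, assuming the usual `using namespace megdnn; using namespace test;` of these test files):

    // 2 batches of an 8x8x8 matmul, no transpose, with A broadcast across the
    // batch (A_batch_stride == 0); every other stride keeps the compact default.
    matrix_mul::TestArg broadcast_arg{
            /*m=*/8, /*n=*/8, /*k=*/8, /*mask=*/0,
            matrix_mul::TestArg::UNSET_STRIDE_VAL,  // A_stride
            matrix_mul::TestArg::UNSET_STRIDE_VAL,  // B_stride
            matrix_mul::TestArg::UNSET_STRIDE_VAL,  // C_stride
            /*b=*/2,
            /*A_batch_stride=*/0};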
@@ -51,6 +55,8 @@ std::vector<TestArg> get_matmul_args_mask(uint8_t mask);
 std::vector<TestArg> get_matmul_args();
 std::vector<TestArg> get_batched_matmul_args_mask(uint8_t mask);
 std::vector<TestArg> get_batched_matmul_args();
+std::vector<TestArg> get_batched_matmul_broadcast_args();
+std::vector<TestArg> get_batched_matmul_broadcast_args_mask(uint8_t mask);
 std::vector<TestArg> get_matmul_mk_packed_args(size_t nbase);
 std::vector<TestArg> get_batched_matmul_args_cublaslt();
 std::vector<TestArg> get_batched_matmul_args_int8x8x32();
@@ -8,6 +8,7 @@
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
+#include <vector>
 #include "test/cuda/fixture.h"
 
 #include "test/common/checker.h"
@@ -62,6 +63,30 @@ TEST_F(CUDA, BATCHED_MATRIX_MUL_LT_F32_PART4) {
 #undef F32_TEST_PART
 
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART1) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(0));
+}
+
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART2) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(1));
+}
+
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART3) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(2));
+}
+
+TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART4) {
+    matrix_mul::check_batched_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS",
+            1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(3));
+}
+
 TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BRUTE_FORCE_PART1) {
     matrix_mul::check_batched_matrix_mul(
             dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(),
@@ -75,8 +75,8 @@ TEST_F(CUDA, DILATED_CONVOLUTION_BACKWARD_DATA)
             "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING));
     printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n");
 #else
-    checker.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardData>("MATMUL"));
+    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
+            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
 #endif
     NormalRNG default_rng;
     for (auto &&arg: args) {
@@ -139,8 +139,8 @@ TEST_F(CUDA, DILATED_CONVOLUTION_BACKWARD_FILTER)
             "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1" CUDNN_VERSION_STRING));
     printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n");
 #else
-    checker.set_before_exec_callback(
-            AlgoChecker<ConvolutionBackwardFilter>("MATMUL"));
+    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
+            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
 #endif
     NormalRNG default_rng;
     bool first_run = true;
@@ -51,7 +51,8 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
     if (cuda::current_device_prop().major < 7 ||
         (cuda::current_device_prop().major == 7 &&
          cuda::current_device_prop().minor < 5)) {
-        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device doesn't support\n");
+        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
+               "doesn't support\n");
         return;
     }
     Checker<MatrixMul> checker(handle_cuda(), false);
@@ -257,19 +258,19 @@ TEST_F(CUDA, MATRIX_MUL) {
         BS = TensorShape{k, n};
         CS = TensorShape{m, n};
         TensorLayout AL, BL, CL;
-        if (arg.A_stride == 0) {
+        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             AL = TensorLayout(AS, dtype::Float32());
         } else {
             AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                               dtype::Float32());
         }
-        if (arg.B_stride == 0) {
+        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             BL = TensorLayout(BS, dtype::Float32());
         } else {
             BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                               dtype::Float32());
         }
-        if (arg.C_stride == 0) {
+        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             CL = TensorLayout(CS, dtype::Float32());
         } else {
             CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
@@ -285,8 +286,9 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
     NormalRNG normal_rng;
     Checker<MatrixMul> checker(handle_cuda());
     checker.set_rng(0, &normal_rng)
-           .set_rng(1, &normal_rng)
-           .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
+            .set_rng(1, &normal_rng)
+            .set_before_exec_callback(
+                    AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
     using Param = MatrixMul::Param;
     size_t m = 32, n = 32, k = 32;
     // test Int8 matmul
@@ -350,19 +352,19 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
         BS = TensorShape{k, n};
         CS = TensorShape{m, n};
         TensorLayout AL, BL, CL;
-        if (arg.A_stride == 0) {
+        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             AL = TensorLayout(AS, dtype::Float32());
         } else {
             AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                               dtype::Float32());
         }
-        if (arg.B_stride == 0) {
+        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             BL = TensorLayout(BS, dtype::Float32());
         } else {
             BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                               dtype::Float32());
         }
-        if (arg.C_stride == 0) {
+        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
             CL = TensorLayout(CS, dtype::Float32());
         } else {
             CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},