| @@ -245,9 +245,8 @@ std::pair<int, int> get_tensor_alignment( | |||||
| int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n * | int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n * | ||||
| algo_param.threadblock_k / | algo_param.threadblock_k / | ||||
| (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k); | (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k); | ||||
| int threadblock_loads = filter.dtype.size( | |||||
| algo_param.threadblock_m * algo_param.threadblock_n * | |||||
| algo_param.threadblock_k); | |||||
| int threadblock_loads = | |||||
| filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k); | |||||
| int load_per_thread = threadblock_loads / threads; | int load_per_thread = threadblock_loads / threads; | ||||
| if (load_per_thread >= 16) | if (load_per_thread >= 16) | ||||
| alignment_filter = 16; | alignment_filter = 16; | ||||
| @@ -30,6 +30,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( | |||||
| using Format = Param::Format; | using Format = Param::Format; | ||||
| using Sparse = Param::Sparse; | using Sparse = Param::Sparse; | ||||
| using Mode = Param::Mode; | using Mode = Param::Mode; | ||||
| using NonlineMode = Param::NonlineMode; | |||||
| auto&& param = args.opr->param(); | auto&& param = args.opr->param(); | ||||
| auto&& fm = args.filter_meta; | auto&& fm = args.filter_meta; | ||||
| RETURN_IF_FALSE( | RETURN_IF_FALSE( | ||||
| @@ -37,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float16 && | args.src_layout->dtype.enumv() == DTypeEnum::Float16 && | ||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float16 && | args.filter_layout->dtype.enumv() == DTypeEnum::Float16 && | ||||
| args.dst_layout->dtype.enumv() == DTypeEnum::Float16); | args.dst_layout->dtype.enumv() == DTypeEnum::Float16); | ||||
| RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID); | |||||
| RETURN_IF_FALSE( | RETURN_IF_FALSE( | ||||
| args.bias_layout->ndim <= 0 || | args.bias_layout->ndim <= 0 || | ||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 && | (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 && | ||||
| @@ -23,12 +23,14 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( | |||||
| #define RETURN_IF_FALSE(stmt_) \ | #define RETURN_IF_FALSE(stmt_) \ | ||||
| if (!(stmt_)) \ | if (!(stmt_)) \ | ||||
| return false; | return false; | ||||
| RETURN_IF_FALSE(is_compute_capability_required(6, 1)); | |||||
| RETURN_IF_FALSE( | RETURN_IF_FALSE( | ||||
| args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); | args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); | ||||
| using Param = param::ConvBias; | using Param = param::ConvBias; | ||||
| using Format = Param::Format; | using Format = Param::Format; | ||||
| using Sparse = Param::Sparse; | using Sparse = Param::Sparse; | ||||
| using Mode = Param::Mode; | using Mode = Param::Mode; | ||||
| using NonlineMode = Param::NonlineMode; | |||||
| auto&& param = args.opr->param(); | auto&& param = args.opr->param(); | ||||
| auto&& fm = args.filter_meta; | auto&& fm = args.filter_meta; | ||||
| RETURN_IF_FALSE( | RETURN_IF_FALSE( | ||||
| @@ -36,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float32 && | args.src_layout->dtype.enumv() == DTypeEnum::Float32 && | ||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float32 && | args.filter_layout->dtype.enumv() == DTypeEnum::Float32 && | ||||
| args.dst_layout->dtype.enumv() == DTypeEnum::Float32); | args.dst_layout->dtype.enumv() == DTypeEnum::Float32); | ||||
| RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID); | |||||
| RETURN_IF_FALSE( | RETURN_IF_FALSE( | ||||
| args.bias_layout->ndim <= 0 || | args.bias_layout->ndim <= 0 || | ||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && | (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && | ||||
| @@ -63,6 +63,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai | |||||
| #define RETURN_IF_FALSE(stmt_) \ | #define RETURN_IF_FALSE(stmt_) \ | ||||
| if (!(stmt_)) \ | if (!(stmt_)) \ | ||||
| return false; | return false; | ||||
| RETURN_IF_FALSE(is_compute_capability_required(6, 1)); | |||||
| RETURN_IF_FALSE( | RETURN_IF_FALSE( | ||||
| args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous()); | args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous()); | ||||
| using Param = param::Convolution; | using Param = param::Convolution; | ||||
| @@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| (sh == 2 && sw == 2) | (sh == 2 && sw == 2) | ||||
| ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING | ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING | ||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | : cutlass::conv::SpecialOptimizeDesc::NONE; | ||||
| int alignment_filter = 4; | |||||
| constexpr int warp_size = 32; | |||||
| int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n * | |||||
| m_algo_param.threadblock_k / | |||||
| (m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k); | |||||
| int threadblock_loads = args.filter_layout->dtype.size( | |||||
| m_algo_param.threadblock_m * m_algo_param.threadblock_k); | |||||
| int load_per_thread = threadblock_loads / threads; | |||||
| if (load_per_thread >= 16) | |||||
| alignment_filter = 16; | |||||
| else if (load_per_thread >= 8) | |||||
| alignment_filter = 8; | |||||
| megdnn_assert(load_per_thread >= 4); | |||||
| ConvolutionKey key{ | ConvolutionKey key{ | ||||
| cutlass::conv::Operator::kDgrad, | cutlass::conv::Operator::kDgrad, | ||||
| NumericTypeID::kS8, | NumericTypeID::kS8, | ||||
| @@ -54,7 +67,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| 4, | 4, | ||||
| 4, | |||||
| alignment_filter, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "test/common/workspace_wrapper.h" | #include "test/common/workspace_wrapper.h" | ||||
| #include "test/cuda/benchmark.h" | #include "test/cuda/benchmark.h" | ||||
| #include "test/cuda/fixture.h" | #include "test/cuda/fixture.h" | ||||
| #include "test/cuda/utils.h" | |||||
| #include <cuda_profiler_api.h> | #include <cuda_profiler_api.h> | ||||
| #include <cuda_runtime_api.h> | #include <cuda_runtime_api.h> | ||||
| @@ -510,6 +511,7 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | ||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \ | TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \ | ||||
| require_compute_capability(6, 1); \ | |||||
| check_chanwise<ConvolutionForward>( \ | check_chanwise<ConvolutionForward>( \ | ||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | dtype::Float32(), dtype::Float32(), handle_cuda(), \ | ||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | ||||
| @@ -522,6 +524,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | ||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) { \ | TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) { \ | ||||
| require_compute_capability(6, 1); \ | |||||
| check_chanwise<ConvolutionBackwardData>( \ | check_chanwise<ConvolutionBackwardData>( \ | ||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | dtype::Float32(), dtype::Float32(), handle_cuda(), \ | ||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | ||||
| @@ -544,6 +547,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| // check both ioc16 and io16xc32 | // check both ioc16 and io16xc32 | ||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | ||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \ | TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \ | ||||
| require_compute_capability(7, 0); \ | |||||
| check_chanwise<ConvolutionForward>( \ | check_chanwise<ConvolutionForward>( \ | ||||
| dtype::Float16(), dtype::Float16(), handle_cuda(), \ | dtype::Float16(), dtype::Float16(), handle_cuda(), \ | ||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | ||||
| @@ -560,6 +564,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | ||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \ | TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \ | ||||
| require_compute_capability(7, 0); \ | |||||
| check_chanwise<ConvolutionBackwardData>( \ | check_chanwise<ConvolutionBackwardData>( \ | ||||
| dtype::Float16(), dtype::Float16(), handle_cuda(), \ | dtype::Float16(), dtype::Float16(), handle_cuda(), \ | ||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | ||||
| @@ -1407,7 +1412,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) { | |||||
| bencher.proxy()->target_execution_policy.algo.reset(); | bencher.proxy()->target_execution_policy.algo.reset(); | ||||
| param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | ||||
| bencher.set_param(param); | bencher.set_param(param); | ||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS; | |||||
| printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | ||||
| "float16: %.2fms %.2fGB/s " | "float16: %.2fms %.2fGB/s " | ||||
| @@ -1033,7 +1033,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) { | |||||
| ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | ||||
| "CUDA:GROUP_CONV", {}) | "CUDA:GROUP_CONV", {}) | ||||
| .c_str(), | .c_str(), | ||||
| {{"CUDNN", {}}}})); | |||||
| {{"DEFAULT:CUDNN", {}}}})); | |||||
| ConvBias::Param param; | ConvBias::Param param; | ||||
| param.sparse = ConvBias::Param::Sparse::GROUP; | param.sparse = ConvBias::Param::Sparse::GROUP; | ||||
| param.nonlineMode = mode; | param.nonlineMode = mode; | ||||