GitOrigin-RevId: 878b7de9de
tags/v1.3.0
@@ -286,7 +286,8 @@ void megdnn::cuda::cutlass_wrapper::
         uint32_t /* nonlinear_mode */, float /* alpha */,
         float /* beta */, float /* gamma */, float /* scale */,
         const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, int /* stages */,
+        cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -296,15 +297,15 @@ void megdnn::cuda::cutlass_wrapper::
         int* workspace, const convolution::ConvParam& param,
         uint32_t nonlinear_mode, float alpha, float beta, float gamma,
         float scale, const GemmCoord& threadblock_shape,
-        const GemmCoord& warp_shape, cudaStream_t stream) {
+        const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
                                         threadblock_k_, warp_m_, warp_n_, \
-                                        warp_k_, stage_, aligned_) \
+                                        warp_k_, stage_, aligned_) \
     if (threadblock_shape.m() == threadblock_m_ && \
         threadblock_shape.n() == threadblock_n_ && \
         threadblock_shape.k() == threadblock_k_ && \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
-        warp_shape.k() == warp_k_) { \
+        warp_shape.k() == warp_k_ && stages == stage_) { \
         using ThreadBlockShape = \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
                                          threadblock_k_>; \
@@ -397,7 +398,8 @@ void megdnn::cuda::cutlass_wrapper::
                 uint32_t nonlinear_mode, float alpha, float beta, \
                 float gamma, float scale, \
                 const GemmCoord& threadblock_shape, \
-                const GemmCoord& warp_shape, cudaStream_t stream);
+                const GemmCoord& warp_shape, int stages, \
+                cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
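
The hunks above thread a runtime `stages` argument through the NCHW4 wrapper: a (threadblock, warp, stages) triple chosen at runtime is matched against a fixed list of compile-time kernel instantiations, and `stages` now takes part in the match, so two kernels with identical tile shapes but different pipeline depths no longer collide on the same branch. A minimal, self-contained sketch of this dispatch idiom, with hypothetical names:

#include <cstdio>

struct Shape {
    int m, n, k;
};

template <int Tm, int Tn, int Tk, int Wm, int Wn, int Wk, int Stages>
void run_conv_kernel() {
    // A real implementation would alias cutlass::gemm::GemmShape<...> here
    // and launch the matching CUTLASS convolution kernel.
    std::printf("launch %dx%dx%d / %dx%dx%d, stages=%d\n", Tm, Tn, Tk, Wm, Wn,
                Wk, Stages);
}

#define DISPATCH(Tm, Tn, Tk, Wm, Wn, Wk, Stages_)                   \
    if (tb.m == Tm && tb.n == Tn && tb.k == Tk && warp.m == Wm &&   \
        warp.n == Wn && warp.k == Wk && stages == Stages_) {        \
        return run_conv_kernel<Tm, Tn, Tk, Wm, Wn, Wk, Stages_>();  \
    }

void dispatch(Shape tb, Shape warp, int stages) {
    DISPATCH(128, 128, 32, 64, 32, 32, 2)
    DISPATCH(16, 128, 16, 16, 128, 16, 1)
    std::printf("unsupported configuration\n");
}
#undef DISPATCH

int main() {
    dispatch({16, 128, 16}, {16, 128, 16}, 1);  // hits the stage-1 kernel
    dispatch({16, 128, 16}, {16, 128, 16}, 2);  // no such kernel: falls through
}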
@@ -414,7 +416,8 @@ void megdnn::cuda::cutlass_wrapper::
         uint32_t /* nonlinear_mode */, float /* alpha */,
         float /* beta */, float /* gamma */, float /* scale */,
         const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, int /* stages */,
+        cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -424,15 +427,15 @@ void megdnn::cuda::cutlass_wrapper::
         int* workspace, const convolution::ConvParam& param,
         uint32_t nonlinear_mode, float alpha, float beta, float gamma,
         float scale, const GemmCoord& threadblock_shape,
-        const GemmCoord& warp_shape, cudaStream_t stream) {
+        const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
                                         threadblock_k_, warp_m_, warp_n_, \
-                                        warp_k_, aligned_) \
+                                        warp_k_, stages_, aligned_) \
     if (threadblock_shape.m() == threadblock_m_ && \
         threadblock_shape.n() == threadblock_n_ && \
         threadblock_shape.k() == threadblock_k_ && \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
-        warp_shape.k() == warp_k_) { \
+        warp_shape.k() == warp_k_ && stages == stages_) { \
         using ThreadBlockShape = \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
                                          threadblock_k_>; \
@@ -449,7 +452,7 @@ void megdnn::cuda::cutlass_wrapper::
                 cutlass::convolution::threadblock:: \
                         ConvolutionNCxHWxThreadblockSwizzle< \
                                 cutlass::convolution::ConvType::kConvolution>, \
-                2, 4, aligned_, NeedLoadFromConstMem, \
+                stages_, 4, aligned_, NeedLoadFromConstMem, \
                 cutlass::arch::OpMultiplyAdd>; \
         typename Convolution::ConvolutionParameter conv_param{ \
                 param.n, param.ci, param.co, param.hi, param.wi, \
@@ -460,16 +463,17 @@ void megdnn::cuda::cutlass_wrapper::
                 epilogue, stream); \
     }
 #define DISPATCH_KERNEL \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \
     megdnn_assert(false, \
                   "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                   "(%dx%dx%d)", \
@@ -525,7 +529,8 @@ void megdnn::cuda::cutlass_wrapper::
                 uint32_t nonlinear_mode, float alpha, float beta, \
                 float gamma, float scale, \
                 const GemmCoord& threadblock_shape, \
-                const GemmCoord& warp_shape, cudaStream_t stream);
+                const GemmCoord& warp_shape, int stages, \
+                cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
@@ -542,7 +547,8 @@ void megdnn::cuda::cutlass_wrapper::
         uint32_t /* nonlinear_mode */, float /* alpha */,
         float /* beta */, float /* gamma */, float /* scale */,
         const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, int /* stages */,
+        cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -552,15 +558,15 @@ void megdnn::cuda::cutlass_wrapper::
         int* workspace, const convolution::ConvParam& param,
         uint32_t nonlinear_mode, float alpha, float beta, float gamma,
         float scale, const GemmCoord& threadblock_shape,
-        const GemmCoord& warp_shape, cudaStream_t stream) {
+        const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
                                         threadblock_k_, warp_m_, warp_n_, \
-                                        warp_k_, aligned_) \
+                                        warp_k_, stages_, aligned_) \
     if (threadblock_shape.m() == threadblock_m_ && \
         threadblock_shape.n() == threadblock_n_ && \
         threadblock_shape.k() == threadblock_k_ && \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
-        warp_shape.k() == warp_k_) { \
+        warp_shape.k() == warp_k_ && stages == stages_) { \
         using ThreadBlockShape = \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
                                          threadblock_k_>; \
@@ -577,7 +583,7 @@ void megdnn::cuda::cutlass_wrapper::
                 cutlass::convolution::threadblock:: \
                         ConvolutionNCxHWxThreadblockSwizzle< \
                                 cutlass::convolution::ConvType::kConvolution>, \
-                2, 4, aligned_, NeedLoadFromConstMem>; \
+                stages_, 4, aligned_, NeedLoadFromConstMem>; \
         typename Convolution::ConvolutionParameter conv_param{ \
                 param.n, param.ci, param.co, param.hi, param.wi, \
                 param.fh, param.fw, param.ho, param.wo, param.sh, \
@@ -587,15 +593,15 @@ void megdnn::cuda::cutlass_wrapper::
                 epilogue, stream); \
     }
 #define DISPATCH_KERNEL \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
     megdnn_assert(false, \
                   "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                   "(%dx%dx%d)", \
@@ -651,7 +657,8 @@ void megdnn::cuda::cutlass_wrapper::
                 uint32_t nonlinear_mode, float alpha, float beta, \
                 float gamma, float scale, \
                 const GemmCoord& threadblock_shape, \
-                const GemmCoord& warp_shape, cudaStream_t stream);
+                const GemmCoord& warp_shape, int stages, \
+                cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
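
INST(true); INST(false); is the explicit-instantiation idiom that makes these templates usable from other translation units: the bodies live in this .cu file, so every specialization callers link against must be stamped out here, and every signature change (such as the new `stages` parameter) has to be mirrored inside the INST macro, which is what the three INST hunks above do. A compilable sketch of the idiom, with hypothetical names:

#include <cstdio>

template <bool NeedLoadFromConstMem>
void conv_wrapper(int stages) {
    std::printf("filter via const mem: %d, stages: %d\n",
                int(NeedLoadFromConstMem), stages);
}

// Explicitly instantiate the only two specializations callers may link to.
#define INST(need_load_from_const_mem) \
    template void conv_wrapper<need_load_from_const_mem>(int stages);
INST(true);
INST(false);
#undef INST

int main() {
    conv_wrapper<true>(2);
    conv_wrapper<false>(1);
}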
@@ -56,7 +56,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 template <bool NeedLoadFromConstMem>
 void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
@@ -65,7 +65,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 template <bool NeedLoadFromConstMem>
 void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
@@ -74,7 +74,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 } // namespace cutlass_wrapper
 } // namespace cuda
@@ -32,8 +32,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
     if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
                                                 param.format))
         return false;
-    if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW &&
-        param.format != Format::NCHW4_NCHW32)
+    if (param.format == Format::NCHW4_NCHW32) {
+        if (m_algo_param.threadblock_m % 32 != 0)
+            return false;
+    } else if (param.format != Format::NCHW4_NCHW &&
+               param.format != Format::NCHW4)
         return false;
     size_t n = args.src_layout->operator[](0),
            ci = args.src_layout->operator[](1) * 4,
@@ -187,7 +190,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         } else if (param.format == Format::NCHW4_NCHW) {
             cutlass_wrapper::
                     do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<false>(
@@ -205,7 +208,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         } else {
             megdnn_assert(param.format == Format::NCHW4_NCHW32);
             cutlass_wrapper::
@@ -225,7 +228,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         }
     } else {
        if (param.format == Format::NCHW4) {
@@ -242,7 +245,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         } else if (param.format == Format::NCHW4_NCHW) {
             cutlass_wrapper::
                     do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<true>(
@@ -260,7 +263,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         } else {
             megdnn_assert(param.format == Format::NCHW4_NCHW32);
@@ -281,7 +284,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         }
     }
     after_kernel_launch();
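
The is_available() hunk above replaces a flat format whitelist with a per-format tile constraint: an NCHW4_NCHW32 destination stores channels in blocks of 32, so a threadblock whose M tile is not a multiple of 32 would write partial channel blocks, and in particular the new 16x128x16 stage-1 tile (threadblock_m == 16) is kept off that path. A standalone sketch of the predicate (the mapping of the GEMM M dimension to output channels is an assumption of this illustration):

#include <cassert>

enum class Format { NCHW4, NCHW4_NCHW, NCHW4_NCHW32 };

bool tile_supports_format(Format format, int threadblock_m) {
    if (format == Format::NCHW4_NCHW32) {
        // dst channels come in blocks of 32; a tile covering only part of a
        // block would straddle block boundaries in the epilogue stores
        return threadblock_m % 32 == 0;
    }
    return format == Format::NCHW4 || format == Format::NCHW4_NCHW;
}

int main() {
    assert(tile_supports_format(Format::NCHW4_NCHW32, 64));
    assert(!tile_supports_format(Format::NCHW4_NCHW32, 16));  // 16x128x16 tile
    assert(tile_supports_format(Format::NCHW4, 16));
}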
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
     int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
     int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-    LayoutSrc, int32_t, LayoutSrc, int32_t,
+    LayoutDst, int32_t, LayoutDst, int32_t,
     cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
     ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
     cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
         cutlass::convolution::ConvType::kConvolution>,
-    1, 4, 8, true>;
+    1, 4, 8, true,
+    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-    const int8_t* d_src,
-    const int8_t* d_filter,
-    const int32_t* d_bias,
-    const int8_t* d_z,
-    int8_t* d_dst,
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
     int* workspace,
     typename Convolution::ConvolutionParameter const& conv_param,
     typename Convolution::EpilogueOutputOp::Params const& epilogue,
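
Besides the LayoutDst and MathOperator updates, the instantiation above now derives its parameter types from the kernel's own nested typedefs (Convolution::ElementSrc and friends) instead of hard-coding int8_t/int32_t pointers. One textual form then stays valid for every element configuration the generator emits; the new NCHW-output kernels further down have float destinations, which the old hard-coded signature could not express. A compilable sketch of the mechanism, with hypothetical names:

#include <cstdint>

// One instantiation form; parameter types are pulled from the kernel type.
template <typename Conv>
void conv_wrapper(const typename Conv::ElementSrc* src,
                  typename Conv::ElementDst* dst) {
    // a real wrapper would configure and launch Conv here
    (void)src;
    (void)dst;
}

struct ConvNCHW4 {  // int8 destination, like the NCHW4 kernels above
    using ElementSrc = int8_t;
    using ElementDst = int8_t;
};
struct ConvNCHW {  // float destination, like the NCHW-output kernels below
    using ElementSrc = int8_t;
    using ElementDst = float;
};

// The same textual form instantiates both element configurations.
template void conv_wrapper<ConvNCHW4>(const int8_t*, int8_t*);
template void conv_wrapper<ConvNCHW>(const int8_t*, float*);

int main() {
    int8_t src[1] = {0};
    float dst[1];
    conv_wrapper<ConvNCHW>(src, dst);
}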
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
     int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
     int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-    LayoutSrc, int32_t, LayoutSrc, int32_t,
+    LayoutDst, int32_t, LayoutDst, int32_t,
     cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
     ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
     cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
         cutlass::convolution::ConvType::kConvolution>,
-    1, 4, 8, true>;
+    1, 4, 8, true,
+    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-    const int8_t* d_src,
-    const int8_t* d_filter,
-    const int32_t* d_bias,
-    const int8_t* d_z,
-    int8_t* d_dst,
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
     int* workspace,
     typename Convolution::ConvolutionParameter const& conv_param,
     typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
     int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
     int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-    LayoutSrc, int32_t, LayoutSrc, int32_t,
+    LayoutDst, int32_t, LayoutDst, int32_t,
     cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
     ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
     cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
         cutlass::convolution::ConvType::kConvolution>,
-    1, 4, 8, true>;
+    1, 4, 8, true,
+    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-    const int8_t* d_src,
-    const int8_t* d_filter,
-    const int32_t* d_bias,
-    const int8_t* d_z,
-    int8_t* d_dst,
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
     int* workspace,
     typename Convolution::ConvolutionParameter const& conv_param,
     typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
     int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
     int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-    LayoutSrc, int32_t, LayoutSrc, int32_t,
+    LayoutDst, int32_t, LayoutDst, int32_t,
     cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
     ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
     cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
         cutlass::convolution::ConvType::kConvolution>,
-    1, 4, 8, false>;
+    1, 4, 8, false,
+    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-    const int8_t* d_src,
-    const int8_t* d_filter,
-    const int32_t* d_bias,
-    const int8_t* d_z,
-    int8_t* d_dst,
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
     int* workspace,
     typename Convolution::ConvolutionParameter const& conv_param,
     typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
     int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
     int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-    LayoutSrc, int32_t, LayoutSrc, int32_t,
+    LayoutDst, int32_t, LayoutDst, int32_t,
     cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
     ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
     cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
         cutlass::convolution::ConvType::kConvolution>,
-    1, 4, 8, false>;
+    1, 4, 8, false,
+    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-    const int8_t* d_src,
-    const int8_t* d_filter,
-    const int32_t* d_bias,
-    const int8_t* d_z,
-    int8_t* d_dst,
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
     int* workspace,
     typename Convolution::ConvolutionParameter const& conv_param,
     typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
     int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
     int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-    LayoutSrc, int32_t, LayoutSrc, int32_t,
+    LayoutDst, int32_t, LayoutDst, int32_t,
     cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
     ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
     cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
         cutlass::convolution::ConvType::kConvolution>,
-    1, 4, 8, false>;
+    1, 4, 8, false,
+    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-    const int8_t* d_src,
-    const int8_t* d_filter,
-    const int32_t* d_bias,
-    const int8_t* d_z,
-    int8_t* d_dst,
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
     int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
     typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+    float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+    int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+    LayoutDst, float, LayoutDst, int32_t,
+    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+        cutlass::convolution::ConvType::kConvolution>,
+    1, 4, 8, true,
+    cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
+    int* workspace,
+    typename Convolution::ConvolutionParameter const& conv_param,
+    typename Convolution::EpilogueOutputOp::Params const& epilogue,
+    cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
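
This first new file instantiates a stage-1 kernel: the leading 1 in "1, 4, 8, true" sits in the stages_ slot of the dispatch macros above and matches the new (16, 128, 16, 16, 128, 16, 1, 8) dispatch entry. In CUTLASS the stage count is the depth of the shared-memory pipeline. A toy model of the difference (illustrative only; real kernels overlap the stage-2 copy with compute, and a plausible motive for a single stage here is that one buffer halves the shared-memory footprint of this wide tile):

#include <cstddef>
#include <cstdio>
#include <vector>

int sum_tiles(const std::vector<int>& tiles, int stages) {
    int acc = 0;
    if (stages == 1) {
        int buf;
        for (int t : tiles) {
            buf = t;     // "load the tile into shared memory"
            acc += buf;  // "compute on the tile"; no overlap possible
        }
    } else {  // stages == 2: double buffering
        int buf[2];
        buf[0] = tiles[0];  // prologue: prefetch the first tile
        for (std::size_t i = 0; i < tiles.size(); ++i) {
            if (i + 1 < tiles.size())
                buf[(i + 1) & 1] = tiles[i + 1];  // prefetch the next tile
            acc += buf[i & 1];                    // compute on the current one
        }
    }
    return acc;
}

int main() {
    std::vector<int> tiles{1, 2, 3, 4};
    std::printf("%d %d\n", sum_tiles(tiles, 1), sum_tiles(tiles, 2));
}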
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+    float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+    int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+    LayoutDst, float, LayoutDst, int32_t,
+    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+        cutlass::convolution::ConvType::kConvolution>,
+    1, 4, 8, true,
+    cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
+    int* workspace,
+    typename Convolution::ConvolutionParameter const& conv_param,
+    typename Convolution::EpilogueOutputOp::Params const& epilogue,
+    cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+    float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+    int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+    LayoutDst, float, LayoutDst, int32_t,
+    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+        cutlass::convolution::ConvType::kConvolution>,
+    1, 4, 8, true,
+    cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
+    int* workspace,
+    typename Convolution::ConvolutionParameter const& conv_param,
+    typename Convolution::EpilogueOutputOp::Params const& epilogue,
+    cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+    float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+    int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+    LayoutDst, float, LayoutDst, int32_t,
+    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+        cutlass::convolution::ConvType::kConvolution>,
+    1, 4, 8, false,
+    cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
+    int* workspace,
+    typename Convolution::ConvolutionParameter const& conv_param,
+    typename Convolution::EpilogueOutputOp::Params const& epilogue,
+    cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+    float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+    int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+    LayoutDst, float, LayoutDst, int32_t,
+    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+        cutlass::convolution::ConvType::kConvolution>,
+    1, 4, 8, false,
+    cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
+    int* workspace,
+    typename Convolution::ConvolutionParameter const& conv_param,
+    typename Convolution::EpilogueOutputOp::Params const& epilogue,
+    cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+    float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+    int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+    LayoutDst, float, LayoutDst, int32_t,
+    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+        cutlass::convolution::ConvType::kConvolution>,
+    1, 4, 8, false,
+    cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+    const typename Convolution::ElementSrc* d_src,
+    const typename Convolution::ElementFilter* d_filter,
+    const typename Convolution::ElementBias* d_bias,
+    const typename Convolution::ElementDst* d_z,
+    typename Convolution::ElementDst* d_dst,
+    int* workspace,
+    typename Convolution::ConvolutionParameter const& conv_param,
+    typename Convolution::EpilogueOutputOp::Params const& epilogue,
+    cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -3895,6 +3895,9 @@ TEST(TestGoptInference, FoldingConvDimshuffle) {
                           .apply({{y}})
                           .endpoint_vars(),
                   y_fuse);
+    gopt::modify_opr_algo_strategy_inplace(
+            {y_fuse},
+            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
     graph->compile({{y_fuse, {}}})
             ->to_json()
             ->writeto_fpath(output_file(
@@ -3976,6 +3979,9 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
                           .apply({{y}})
                           .endpoint_vars(),
                   y_fuse);
+    gopt::modify_opr_algo_strategy_inplace(
+            {y_fuse},
+            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
     graph->compile({{y_fuse, {}}})
             ->to_json()
             ->writeto_fpath(output_file(
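
The two test hunks pin the execution policy of the optimized endpoints to PROFILE before compiling, so algorithm choice for the fused convolutions is measured rather than taken from the static heuristic, presumably to keep selection well defined now that the stage-1 kernels enlarge the candidate set. A toy model of the distinction (illustrative only; the names below are hypothetical, not the MegEngine API):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Algo {
    const char* name;
    double heuristic_score;  // what a static cost model predicts
    double measured_ms;      // what profiling actually observes
};

const Algo* choose(const std::vector<Algo>& candidates, bool profile) {
    if (profile)  // PROFILE strategy: trust measurements
        return &*std::min_element(
                candidates.begin(), candidates.end(),
                [](const Algo& a, const Algo& b) {
                    return a.measured_ms < b.measured_ms;
                });
    // heuristic strategy: trust the cost model
    return &*std::max_element(
            candidates.begin(), candidates.end(),
            [](const Algo& a, const Algo& b) {
                return a.heuristic_score < b.heuristic_score;
            });
}

int main() {
    std::vector<Algo> candidates{{"128x128x32_2stage", 0.9, 0.31},
                                 {"16x128x16_1stage", 0.4, 0.22}};
    std::printf("heuristic: %s\nprofiled:  %s\n",
                choose(candidates, false)->name,
                choose(candidates, true)->name);
}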