feat(dnn/cuda): add deconv int8 and fix cutlass conv wrapper base on modify cutlass 2.4

GitOrigin-RevId: 49e0565e8a
5 years ago · 5d350fc843
--- a/.gitattributes
+++ b/.gitattributes
@@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
 dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary
 dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
 dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
 dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
 tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
 *.caffemodel filter=lfs diff=lfs merge=lfs -text
 imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
--- a/dnn/src/common/convolution.cpp
+++ b/dnn/src/common/convolution.cpp
@@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc(
        size_t src_ndim, const TensorLayout& filter, const Param& param,
        typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) {
    megdnn_assert(param.format == Param::Format::NCHW ||
                  param.format == Param::Format::NHWC );
                  param.format == Param::Format::NHWC);
    auto img_ndim = src_ndim - 2;
    size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
    if (param.sparse == Param::Sparse::DENSE) {
@@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx(
                          img_ndim, filter.ndim);
            megdnn_assert((filter[filter.ndim - 1] == pack_size &&
                           filter[filter.ndim - 2] == pack_size) ||
                           (filter[filter.ndim - 1] == 2 * pack_size &&
                            filter[filter.ndim - 2] == 2 * pack_size),
                                  (filter[filter.ndim - 1] == 2 * pack_size &&
                                   filter[filter.ndim - 2] == 2 * pack_size),
                          "last 2 dim of filter must be %zu, but got %zu, %zu",
                          pack_size, filter[filter.ndim - 2],
                          filter[filter.ndim - 1]);
@@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
        }
        if (param().format == Param::Format::NCHW44 ||
            param().format == Param::Format::NCHW44_DOT) {
            //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul
            //! support nchw44 filter change to 88 for int8 winogradf23_88 using
            //! MK8 mamtul
            megdnn_assert((src.ndim == 4 && filter.ndim == 5 &&
                           filter[filter.ndim - 1] == 4) ||
                                  (src.ndim == 5 &&
@@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
                  "currently only convolution on 2D image is supported");
    auto cflt = make_canonized_filter_meta(src.ndim, filter);
    if (param().format == Param::Format::NCHW ||
        param().format == Param::Format::NHWC ) {
        param().format == Param::Format::NHWC) {
        size_t src_or_dst_c_pos = 0;
        size_t src_or_dst_spatial_start = 0;
        if (param().format == Param::Format::NCHW) {
@@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
        dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1],
                                  cflt.stride[1], cflt.padding[1]);
        dst[4] = 32;
    } else if (param().format == Param::Format::NCHW88 ) {
    } else if (param().format == Param::Format::NCHW88) {
        megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8),
                      "invalid src ndim for NCHW88, expected=5 or 4, got=%zu",
                      src.ndim);
@@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff,
    }
    megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32
 #if !MEGDNN_DISABLE_FLOAT16
                          || filter.enumv() == DTypeEnum::Float16
                          || filter.enumv() == DTypeEnum::BFloat16
                          || filter.enumv() == DTypeEnum::Float16 ||
                          filter.enumv() == DTypeEnum::BFloat16
 #endif
                          ,
                  ,
                  "ComputeMode::FLOAT32 is only available for Float16/BFloat16 "
                  "input / output.");
 }
@@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter,
                    diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i],
                    cflt.stride[i], cflt.padding[i]);
        }
    } else if (param().format == Param::Format::NCHW4) {
        megdnn_assert(diff.ndim == 5,
                      "valid diff ndim for NCHW4, expected=5, got=%zu",
                      diff.ndim);
        megdnn_assert(cflt.group == 1, "%s", errmsg().c_str());
        megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s",
                      errmsg().c_str());
        grad.ndim = diff.ndim;
        grad[0] = diff[0];
        auto ic = cflt.icpg * cflt.group;
        megdnn_assert(ic % 4 == 0);
        grad[1] = ic / 4;
        grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0],
                         cflt.padding[0]);
        grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1],
                         cflt.padding[1]);
        megdnn_assert(diff[4] == 4);
        grad[4] = 4;
    } else {
        megdnn_assert(param().format == Param::Format::NHWCD4);
        megdnn_assert(diff.ndim == 5,
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
@@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper::
                                         threadblock_k_>;                      \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
        using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;           \
        using Convolution = cutlass::convolution::device::Convolution<         \
        using Convolution = cutlass::conv::device::Convolution<                \
                int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t,             \
                cutlass::layout::TensorCxRSKx<32>, ElementOutput,              \
                cutlass::layout::TensorNCxHWx<32>, int32_t,                    \
                cutlass::layout::TensorNCxHWx<32>, int32_t,                    \
                cutlass::convolution::ConvType::kConvolution,                  \
                cutlass::conv::ConvType::kConvolution,                         \
                cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,           \
                ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,     \
                cutlass::convolution::threadblock::                            \
                        ConvolutionNCxHWxThreadblockSwizzle<                   \
                                cutlass::convolution::ConvType::kConvolution>, \
                cutlass::conv::threadblock::                                   \
                        ConvolutionFpropNCxHWxThreadblockSwizzle,              \
                2, 16, 16, NeedLoadFromConstMem>;                              \
        typename Convolution::ConvolutionParameter conv_param{                 \
                param.n,  param.ci, param.co, param.hi, param.wi,              \
                param.fh, param.fw, param.ho, param.wo, param.sh,              \
                param.sw, param.ph, param.pw, 1,        1};                    \
        typename Convolution::ConvolutionParameter conv_param(                 \
                param.n, param.hi, param.wi, param.ci, param.co, param.fh,     \
                param.fw, param.ho, param.wo, param.ph, param.pw, param.sh,    \
                param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation);       \
        return cutlass_convolution_wrapper<Convolution>(                       \
                d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,    \
                epilogue, stream);                                             \
@@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper::
                                         threadblock_k_>;                      \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
        using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;           \
        using Convolution = cutlass::convolution::device::Convolution<         \
        using Convolution = cutlass::conv::device::Convolution<                \
                int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t,             \
                cutlass::layout::TensorCxRSKx<32>, ElementOutput,              \
                cutlass::layout::TensorNCxHWx<4>, int32_t,                     \
                cutlass::layout::TensorNCxHWx<4>, int32_t,                     \
                cutlass::convolution::ConvType::kConvolution,                  \
                cutlass::conv::ConvType::kConvolution,                         \
                cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,           \
                ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,     \
                cutlass::convolution::threadblock::                            \
                        ConvolutionNCxHWxThreadblockSwizzle<                   \
                                cutlass::convolution::ConvType::kConvolution>, \
                cutlass::conv::threadblock::                                   \
                        ConvolutionFpropNCxHWxThreadblockSwizzle,              \
                2, 16, 16, NeedLoadFromConstMem>;                              \
        typename Convolution::ConvolutionParameter conv_param{                 \
                param.n,  param.ci, param.co, param.hi, param.wi,              \
                param.fh, param.fw, param.ho, param.wo, param.sh,              \
                param.sw, param.ph, param.pw, 1,        1};                    \
        typename Convolution::ConvolutionParameter conv_param(                 \
                param.n, param.hi, param.wi, param.ci, param.co, param.fh,     \
                param.fw, param.ho, param.wo, param.ph, param.pw, param.sh,    \
                param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation);       \
        return cutlass_convolution_wrapper<Convolution>(                       \
                d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,    \
                epilogue, stream);                                             \
@@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper::
                                         threadblock_k_>;                      \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;            \
        using Convolution = cutlass::convolution::device::Convolution<         \
        using Convolution = cutlass::conv::device::Convolution<                \
                int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t,              \
                cutlass::layout::TensorCxRSKx<4>, ElementOutput,               \
                cutlass::layout::TensorNCxHWx<4>, int32_t,                     \
                cutlass::layout::TensorNCxHWx<4>, int32_t,                     \
                cutlass::convolution::ConvType::kConvolution,                  \
                cutlass::conv::ConvType::kConvolution,                         \
                cutlass::arch::OpClassSimt, cutlass::arch::Sm61,               \
                ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,     \
                cutlass::convolution::threadblock::                            \
                        ConvolutionNCxHWxThreadblockSwizzle<                   \
                                cutlass::convolution::ConvType::kConvolution>, \
                cutlass::conv::threadblock::                                   \
                        ConvolutionFpropNCxHWxThreadblockSwizzle,              \
                stage_, 4, aligned_, NeedLoadFromConstMem>;                    \
        typename Convolution::ConvolutionParameter conv_param{                 \
                param.n,  param.ci, param.co, param.hi, param.wi,              \
                param.fh, param.fw, param.ho, param.wo, param.sh,              \
                param.sw, param.ph, param.pw, 1,        1};                    \
        typename Convolution::ConvolutionParameter conv_param(                 \
                param.n, param.hi, param.wi, param.ci, param.co, param.fh,     \
                param.fw, param.ho, param.wo, param.ph, param.pw, param.sh,    \
                param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation);       \
        return cutlass_convolution_wrapper<Convolution>(                       \
                d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,    \
                epilogue, stream);                                             \
@@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper::
                                         threadblock_k_>;                      \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;            \
        using Convolution = cutlass::convolution::device::Convolution<         \
        using Convolution = cutlass::conv::device::Convolution<                \
                int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t,              \
                cutlass::layout::TensorCxRSKx<4>, ElementOutput,               \
                cutlass::layout::TensorNCHW, float,                            \
                cutlass::layout::TensorNCHW, int32_t,                          \
                cutlass::convolution::ConvType::kConvolution,                  \
                cutlass::conv::ConvType::kConvolution,                         \
                cutlass::arch::OpClassSimt, cutlass::arch::Sm61,               \
                ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,     \
                cutlass::convolution::threadblock::                            \
                        ConvolutionNCxHWxThreadblockSwizzle<                   \
                                cutlass::convolution::ConvType::kConvolution>, \
                cutlass::conv::threadblock::                                   \
                        ConvolutionFpropNCxHWxThreadblockSwizzle,              \
                stages_, 4, aligned_, NeedLoadFromConstMem,                    \
                cutlass::arch::OpMultiplyAdd>;                                 \
        typename Convolution::ConvolutionParameter conv_param{                 \
                param.n,  param.ci, param.co, param.hi, param.wi,              \
                param.fh, param.fw, param.ho, param.wo, param.sh,              \
                param.sw, param.ph, param.pw, 1,        1};                    \
        typename Convolution::ConvolutionParameter conv_param(                 \
                param.n, param.hi, param.wi, param.ci, param.co, param.fh,     \
                param.fw, param.ho, param.wo, param.ph, param.pw, param.sh,    \
                param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation);       \
        return cutlass_convolution_wrapper<Convolution>(                       \
                d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,    \
                epilogue, stream);                                             \
@@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper::
                                         threadblock_k_>;                      \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;            \
        using Convolution = cutlass::convolution::device::Convolution<         \
        using Convolution = cutlass::conv::device::Convolution<                \
                int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t,              \
                cutlass::layout::TensorCxRSKx<4>, ElementOutput,               \
                cutlass::layout::TensorNCxHWx<32>, int32_t,                    \
                cutlass::layout::TensorNCxHWx<32>, int32_t,                    \
                cutlass::convolution::ConvType::kConvolution,                  \
                cutlass::conv::ConvType::kConvolution,                         \
                cutlass::arch::OpClassSimt, cutlass::arch::Sm61,               \
                ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,     \
                cutlass::convolution::threadblock::                            \
                        ConvolutionNCxHWxThreadblockSwizzle<                   \
                                cutlass::convolution::ConvType::kConvolution>, \
                cutlass::conv::threadblock::                                   \
                        ConvolutionFpropNCxHWxThreadblockSwizzle,              \
                stages_, 4, aligned_, NeedLoadFromConstMem>;                   \
        typename Convolution::ConvolutionParameter conv_param{                 \
                param.n,  param.ci, param.co, param.hi, param.wi,              \
                param.fh, param.fw, param.ho, param.wo, param.sh,              \
                param.sw, param.ph, param.pw, 1,        1};                    \
        typename Convolution::ConvolutionParameter conv_param(                 \
                param.n, param.hi, param.wi, param.ci, param.co, param.fh,     \
                param.fw, param.ho, param.wo, param.ph, param.pw, param.sh,    \
                param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation);       \
        return cutlass_convolution_wrapper<Convolution>(                       \
                d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,    \
                epilogue, stream);                                             \
    }
 #define DISPATCH_KERNEL                                                      \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16);           \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16);            \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16);            \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16);            \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16);            \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16);             \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16);             \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16);             \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16);             \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16);        \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16);         \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16);         \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16);         \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16);         \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16);          \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16);          \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16);          \
    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16);          \
    megdnn_assert(false,                                                     \
                  "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                  "(%dx%dx%d)",                                              \
--- a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
+++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
@@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper(
        cudaStream_t stream) {
    typename Convolution::TensorRefSrc tensor_src{
            const_cast<typename Convolution::ElementSrc*>(d_src),
            Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(),
                                            conv_param.wi(), conv_param.ci()})};
            Convolution::LayoutSrc::packed(
                    {conv_param.N, conv_param.H, conv_param.W, conv_param.C})};
    typename Convolution::TensorRefFilter tensor_filter{
            const_cast<typename Convolution::ElementFilter*>(d_filter),
            Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(),
                                               conv_param.fw(),
                                               conv_param.ci()})};
            Convolution::LayoutFilter::packed(
                    {conv_param.K, conv_param.R, conv_param.S, conv_param.C})};
    typename Convolution::TensorRefBias tensor_bias{
            const_cast<typename Convolution::ElementBias*>(d_bias),
            Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})};
            Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})};
    typename Convolution::TensorRefDst tensor_z{
            const_cast<typename Convolution::ElementDst*>(d_z),
            Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
                                            conv_param.wo(), conv_param.co()})};
            Convolution::LayoutDst::packed(
                    {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};
    typename Convolution::TensorRefDst tensor_dst{
            d_dst,
            Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
                                            conv_param.wo(), conv_param.co()})};
    typename Convolution::Arguments arguments{
            conv_param,  tensor_src, tensor_filter,
            tensor_bias, tensor_z,   tensor_dst.non_const_ref(),
            epilogue};
            Convolution::LayoutDst::packed(
                    {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};
    typename Convolution::Arguments arguments{conv_param,
                                              tensor_src.non_const_ref(),
                                              tensor_filter.non_const_ref(),
                                              tensor_bias.non_const_ref(),
                                              tensor_z.non_const_ref(),
                                              tensor_dst.non_const_ref(),
                                              epilogue};
    Convolution conv_op;
    cutlass_check(conv_op.initialize(arguments, workspace));
    cutlass_check(conv_op(stream));
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    1, 4, 8, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    1, 4, 8, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    1, 4, 8, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 4, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 4, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 4, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    1, 4, 8, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    1, 4, 8, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    1, 4, 8, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 4, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 4, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 4, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, true, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                    int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
 using Convolution = cutlass::conv::device::Convolution<
    int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, 
    LayoutDst, int32_t, LayoutDst, int32_t, 
    cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, 
    ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
    cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
    cutlass::convolution::ConvType::kConvolution>, 
    cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 
    2, 4, 16, false, 
    cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(