GitOrigin-RevId: 49e0565e8a
tags/v1.3.0
| @@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | |||
| dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | |||
| dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | |||
| dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | |||
| dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | |||
| tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | |||
| *.caffemodel filter=lfs diff=lfs merge=lfs -text | |||
| imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | |||
| @@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc( | |||
| size_t src_ndim, const TensorLayout& filter, const Param& param, | |||
| typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | |||
| megdnn_assert(param.format == Param::Format::NCHW || | |||
| param.format == Param::Format::NHWC ); | |||
| param.format == Param::Format::NHWC); | |||
| auto img_ndim = src_ndim - 2; | |||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
| if (param.sparse == Param::Sparse::DENSE) { | |||
| @@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx( | |||
| img_ndim, filter.ndim); | |||
| megdnn_assert((filter[filter.ndim - 1] == pack_size && | |||
| filter[filter.ndim - 2] == pack_size) || | |||
| (filter[filter.ndim - 1] == 2 * pack_size && | |||
| filter[filter.ndim - 2] == 2 * pack_size), | |||
| (filter[filter.ndim - 1] == 2 * pack_size && | |||
| filter[filter.ndim - 2] == 2 * pack_size), | |||
| "last 2 dim of filter must be %zu, but got %zu, %zu", | |||
| pack_size, filter[filter.ndim - 2], | |||
| filter[filter.ndim - 1]); | |||
| @@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| } | |||
| if (param().format == Param::Format::NCHW44 || | |||
| param().format == Param::Format::NCHW44_DOT) { | |||
| //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul | |||
| //! support changing the nchw44 filter to 88 for int8 winogradf23_88 | |||
| //! using MK8 matmul | |||
| megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | |||
| filter[filter.ndim - 1] == 4) || | |||
| (src.ndim == 5 && | |||
| @@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| "currently only convolution on 2D image is supported"); | |||
| auto cflt = make_canonized_filter_meta(src.ndim, filter); | |||
| if (param().format == Param::Format::NCHW || | |||
| param().format == Param::Format::NHWC ) { | |||
| param().format == Param::Format::NHWC) { | |||
| size_t src_or_dst_c_pos = 0; | |||
| size_t src_or_dst_spatial_start = 0; | |||
| if (param().format == Param::Format::NCHW) { | |||
| @@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
| dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | |||
| cflt.stride[1], cflt.padding[1]); | |||
| dst[4] = 32; | |||
| } else if (param().format == Param::Format::NCHW88 ) { | |||
| } else if (param().format == Param::Format::NCHW88) { | |||
| megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | |||
| "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | |||
| src.ndim); | |||
| @@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, | |||
| } | |||
| megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| || filter.enumv() == DTypeEnum::Float16 | |||
| || filter.enumv() == DTypeEnum::BFloat16 | |||
| || filter.enumv() == DTypeEnum::Float16 || | |||
| filter.enumv() == DTypeEnum::BFloat16 | |||
| #endif | |||
| , | |||
| , | |||
| "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | |||
| "input / output."); | |||
| } | |||
| @@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, | |||
| diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], | |||
| cflt.stride[i], cflt.padding[i]); | |||
| } | |||
| } else if (param().format == Param::Format::NCHW4) { | |||
| megdnn_assert(diff.ndim == 5, | |||
| "valid diff ndim for NCHW4, expected=5, got=%zu", | |||
| diff.ndim); | |||
| megdnn_assert(cflt.group == 1, "%s", errmsg().c_str()); | |||
| megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s", | |||
| errmsg().c_str()); | |||
| grad.ndim = diff.ndim; | |||
| grad[0] = diff[0]; | |||
| auto ic = cflt.icpg * cflt.group; | |||
| megdnn_assert(ic % 4 == 0); | |||
| grad[1] = ic / 4; | |||
| grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0], | |||
| cflt.padding[0]); | |||
| grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], | |||
| cflt.padding[1]); | |||
| megdnn_assert(diff[4] == 4); | |||
| grad[4] = 4; | |||
| } else { | |||
| megdnn_assert(param().format == Param::Format::NHWCD4); | |||
| megdnn_assert(diff.ndim == 5, | |||
| @@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| threadblock_k_>; \ | |||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | |||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||
| using Convolution = cutlass::conv::device::Convolution< \ | |||
| int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | |||
| cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | |||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
| cutlass::convolution::ConvType::kConvolution, \ | |||
| cutlass::conv::ConvType::kConvolution, \ | |||
| cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
| cutlass::convolution::threadblock:: \ | |||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||
| cutlass::convolution::ConvType::kConvolution>, \ | |||
| cutlass::conv::threadblock:: \ | |||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
| 2, 16, 16, NeedLoadFromConstMem>; \ | |||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||
| typename Convolution::ConvolutionParameter conv_param( \ | |||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
| return cutlass_convolution_wrapper<Convolution>( \ | |||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
| epilogue, stream); \ | |||
| @@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| threadblock_k_>; \ | |||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | |||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||
| using Convolution = cutlass::conv::device::Convolution< \ | |||
| int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | |||
| cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | |||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
| cutlass::convolution::ConvType::kConvolution, \ | |||
| cutlass::conv::ConvType::kConvolution, \ | |||
| cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
| cutlass::convolution::threadblock:: \ | |||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||
| cutlass::convolution::ConvType::kConvolution>, \ | |||
| cutlass::conv::threadblock:: \ | |||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
| 2, 16, 16, NeedLoadFromConstMem>; \ | |||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||
| typename Convolution::ConvolutionParameter conv_param( \ | |||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
| return cutlass_convolution_wrapper<Convolution>( \ | |||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
| epilogue, stream); \ | |||
| @@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| threadblock_k_>; \ | |||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | |||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||
| using Convolution = cutlass::conv::device::Convolution< \ | |||
| int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | |||
| cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | |||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
| cutlass::convolution::ConvType::kConvolution, \ | |||
| cutlass::conv::ConvType::kConvolution, \ | |||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
| cutlass::convolution::threadblock:: \ | |||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||
| cutlass::convolution::ConvType::kConvolution>, \ | |||
| cutlass::conv::threadblock:: \ | |||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
| stage_, 4, aligned_, NeedLoadFromConstMem>; \ | |||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||
| typename Convolution::ConvolutionParameter conv_param( \ | |||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
| return cutlass_convolution_wrapper<Convolution>( \ | |||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
| epilogue, stream); \ | |||
| @@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| threadblock_k_>; \ | |||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | |||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||
| using Convolution = cutlass::conv::device::Convolution< \ | |||
| int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | |||
| cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | |||
| cutlass::layout::TensorNCHW, float, \ | |||
| cutlass::layout::TensorNCHW, int32_t, \ | |||
| cutlass::convolution::ConvType::kConvolution, \ | |||
| cutlass::conv::ConvType::kConvolution, \ | |||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
| cutlass::convolution::threadblock:: \ | |||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||
| cutlass::convolution::ConvType::kConvolution>, \ | |||
| cutlass::conv::threadblock:: \ | |||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
| stages_, 4, aligned_, NeedLoadFromConstMem, \ | |||
| cutlass::arch::OpMultiplyAdd>; \ | |||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||
| typename Convolution::ConvolutionParameter conv_param( \ | |||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
| return cutlass_convolution_wrapper<Convolution>( \ | |||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
| epilogue, stream); \ | |||
| @@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| threadblock_k_>; \ | |||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | |||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||
| using Convolution = cutlass::conv::device::Convolution< \ | |||
| int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | |||
| cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | |||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
| cutlass::convolution::ConvType::kConvolution, \ | |||
| cutlass::conv::ConvType::kConvolution, \ | |||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
| cutlass::convolution::threadblock:: \ | |||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||
| cutlass::convolution::ConvType::kConvolution>, \ | |||
| cutlass::conv::threadblock:: \ | |||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
| stages_, 4, aligned_, NeedLoadFromConstMem>; \ | |||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||
| typename Convolution::ConvolutionParameter conv_param( \ | |||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
| return cutlass_convolution_wrapper<Convolution>( \ | |||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
| epilogue, stream); \ | |||
| } | |||
| #define DISPATCH_KERNEL \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||
| megdnn_assert(false, \ | |||
| "unsupported threadblock shape (%dx%dx%d) and warp shape " \ | |||
| "(%dx%dx%d)", \ | |||
| @@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( | |||
| cudaStream_t stream) { | |||
| typename Convolution::TensorRefSrc tensor_src{ | |||
| const_cast<typename Convolution::ElementSrc*>(d_src), | |||
| Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), | |||
| conv_param.wi(), conv_param.ci()})}; | |||
| Convolution::LayoutSrc::packed( | |||
| {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; | |||
| typename Convolution::TensorRefFilter tensor_filter{ | |||
| const_cast<typename Convolution::ElementFilter*>(d_filter), | |||
| Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), | |||
| conv_param.fw(), | |||
| conv_param.ci()})}; | |||
| Convolution::LayoutFilter::packed( | |||
| {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; | |||
| typename Convolution::TensorRefBias tensor_bias{ | |||
| const_cast<typename Convolution::ElementBias*>(d_bias), | |||
| Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; | |||
| Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; | |||
| typename Convolution::TensorRefDst tensor_z{ | |||
| const_cast<typename Convolution::ElementDst*>(d_z), | |||
| Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||
| conv_param.wo(), conv_param.co()})}; | |||
| Convolution::LayoutDst::packed( | |||
| {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||
| typename Convolution::TensorRefDst tensor_dst{ | |||
| d_dst, | |||
| Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||
| conv_param.wo(), conv_param.co()})}; | |||
| typename Convolution::Arguments arguments{ | |||
| conv_param, tensor_src, tensor_filter, | |||
| tensor_bias, tensor_z, tensor_dst.non_const_ref(), | |||
| epilogue}; | |||
| Convolution::LayoutDst::packed( | |||
| {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||
| typename Convolution::Arguments arguments{conv_param, | |||
| tensor_src.non_const_ref(), | |||
| tensor_filter.non_const_ref(), | |||
| tensor_bias.non_const_ref(), | |||
| tensor_z.non_const_ref(), | |||
| tensor_dst.non_const_ref(), | |||
| epilogue}; | |||
| Convolution conv_op; | |||
| cutlass_check(conv_op.initialize(arguments, workspace)); | |||
| cutlass_check(conv_op(stream)); | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::convolution::device::Convolution< | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
| cutlass::convolution::ConvType::kConvolution>, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||