GitOrigin-RevId: 49e0565e8a
tags/v1.3.0
| @@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | |||||
| dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | ||||
| dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | ||||
| dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | ||||
| dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | |||||
| tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | ||||
| *.caffemodel filter=lfs diff=lfs merge=lfs -text | *.caffemodel filter=lfs diff=lfs merge=lfs -text | ||||
| imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | ||||
| @@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc( | |||||
| size_t src_ndim, const TensorLayout& filter, const Param& param, | size_t src_ndim, const TensorLayout& filter, const Param& param, | ||||
| typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | ||||
| megdnn_assert(param.format == Param::Format::NCHW || | megdnn_assert(param.format == Param::Format::NCHW || | ||||
| param.format == Param::Format::NHWC ); | |||||
| param.format == Param::Format::NHWC); | |||||
| auto img_ndim = src_ndim - 2; | auto img_ndim = src_ndim - 2; | ||||
| size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | ||||
| if (param.sparse == Param::Sparse::DENSE) { | if (param.sparse == Param::Sparse::DENSE) { | ||||
| @@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx( | |||||
| img_ndim, filter.ndim); | img_ndim, filter.ndim); | ||||
| megdnn_assert((filter[filter.ndim - 1] == pack_size && | megdnn_assert((filter[filter.ndim - 1] == pack_size && | ||||
| filter[filter.ndim - 2] == pack_size) || | filter[filter.ndim - 2] == pack_size) || | ||||
| (filter[filter.ndim - 1] == 2 * pack_size && | |||||
| filter[filter.ndim - 2] == 2 * pack_size), | |||||
| (filter[filter.ndim - 1] == 2 * pack_size && | |||||
| filter[filter.ndim - 2] == 2 * pack_size), | |||||
| "last 2 dim of filter must be %zu, but got %zu, %zu", | "last 2 dim of filter must be %zu, but got %zu, %zu", | ||||
| pack_size, filter[filter.ndim - 2], | pack_size, filter[filter.ndim - 2], | ||||
| filter[filter.ndim - 1]); | filter[filter.ndim - 1]); | ||||
| @@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
| } | } | ||||
| if (param().format == Param::Format::NCHW44 || | if (param().format == Param::Format::NCHW44 || | ||||
| param().format == Param::Format::NCHW44_DOT) { | param().format == Param::Format::NCHW44_DOT) { | ||||
| //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 matmul | |||||
| //! support nchw44 filter change to 88 for int8 winogradf23_88 using | |||||
| //! MK8 matmul | |||||
| megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | ||||
| filter[filter.ndim - 1] == 4) || | filter[filter.ndim - 1] == 4) || | ||||
| (src.ndim == 5 && | (src.ndim == 5 && | ||||
| @@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
| "currently only convolution on 2D image is supported"); | "currently only convolution on 2D image is supported"); | ||||
| auto cflt = make_canonized_filter_meta(src.ndim, filter); | auto cflt = make_canonized_filter_meta(src.ndim, filter); | ||||
| if (param().format == Param::Format::NCHW || | if (param().format == Param::Format::NCHW || | ||||
| param().format == Param::Format::NHWC ) { | |||||
| param().format == Param::Format::NHWC) { | |||||
| size_t src_or_dst_c_pos = 0; | size_t src_or_dst_c_pos = 0; | ||||
| size_t src_or_dst_spatial_start = 0; | size_t src_or_dst_spatial_start = 0; | ||||
| if (param().format == Param::Format::NCHW) { | if (param().format == Param::Format::NCHW) { | ||||
| @@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
| dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | ||||
| cflt.stride[1], cflt.padding[1]); | cflt.stride[1], cflt.padding[1]); | ||||
| dst[4] = 32; | dst[4] = 32; | ||||
| } else if (param().format == Param::Format::NCHW88 ) { | |||||
| } else if (param().format == Param::Format::NCHW88) { | |||||
| megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | ||||
| "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | ||||
| src.ndim); | src.ndim); | ||||
| @@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, | |||||
| } | } | ||||
| megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | ||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| || filter.enumv() == DTypeEnum::Float16 | |||||
| || filter.enumv() == DTypeEnum::BFloat16 | |||||
| || filter.enumv() == DTypeEnum::Float16 || | |||||
| filter.enumv() == DTypeEnum::BFloat16 | |||||
| #endif | #endif | ||||
| , | |||||
| , | |||||
| "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | ||||
| "input / output."); | "input / output."); | ||||
| } | } | ||||
| @@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, | |||||
| diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], | diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], | ||||
| cflt.stride[i], cflt.padding[i]); | cflt.stride[i], cflt.padding[i]); | ||||
| } | } | ||||
| } else if (param().format == Param::Format::NCHW4) { | |||||
| megdnn_assert(diff.ndim == 5, | |||||
| "valid diff ndim for NCHW4, expected=5, got=%zu", | |||||
| diff.ndim); | |||||
| megdnn_assert(cflt.group == 1, "%s", errmsg().c_str()); | |||||
| megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s", | |||||
| errmsg().c_str()); | |||||
| grad.ndim = diff.ndim; | |||||
| grad[0] = diff[0]; | |||||
| auto ic = cflt.icpg * cflt.group; | |||||
| megdnn_assert(ic % 4 == 0); | |||||
| grad[1] = ic / 4; | |||||
| grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0], | |||||
| cflt.padding[0]); | |||||
| grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], | |||||
| cflt.padding[1]); | |||||
| megdnn_assert(diff[4] == 4); | |||||
| grad[4] = 4; | |||||
| } else { | } else { | ||||
| megdnn_assert(param().format == Param::Format::NHWCD4); | megdnn_assert(param().format == Param::Format::NHWCD4); | ||||
| megdnn_assert(diff.ndim == 5, | megdnn_assert(diff.ndim == 5, | ||||
| @@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
| threadblock_k_>; \ | threadblock_k_>; \ | ||||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | ||||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||||
| using Convolution = cutlass::conv::device::Convolution< \ | |||||
| int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | ||||
| cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | ||||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
| cutlass::convolution::ConvType::kConvolution, \ | |||||
| cutlass::conv::ConvType::kConvolution, \ | |||||
| cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | ||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
| cutlass::convolution::threadblock:: \ | |||||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
| cutlass::convolution::ConvType::kConvolution>, \ | |||||
| cutlass::conv::threadblock:: \ | |||||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
| 2, 16, 16, NeedLoadFromConstMem>; \ | 2, 16, 16, NeedLoadFromConstMem>; \ | ||||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||||
| typename Convolution::ConvolutionParameter conv_param( \ | |||||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
| return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
| epilogue, stream); \ | epilogue, stream); \ | ||||
| @@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
| threadblock_k_>; \ | threadblock_k_>; \ | ||||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | ||||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||||
| using Convolution = cutlass::conv::device::Convolution< \ | |||||
| int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | ||||
| cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | ||||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
| cutlass::convolution::ConvType::kConvolution, \ | |||||
| cutlass::conv::ConvType::kConvolution, \ | |||||
| cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | ||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
| cutlass::convolution::threadblock:: \ | |||||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
| cutlass::convolution::ConvType::kConvolution>, \ | |||||
| cutlass::conv::threadblock:: \ | |||||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
| 2, 16, 16, NeedLoadFromConstMem>; \ | 2, 16, 16, NeedLoadFromConstMem>; \ | ||||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||||
| typename Convolution::ConvolutionParameter conv_param( \ | |||||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
| return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
| epilogue, stream); \ | epilogue, stream); \ | ||||
| @@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
| threadblock_k_>; \ | threadblock_k_>; \ | ||||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | ||||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||||
| using Convolution = cutlass::conv::device::Convolution< \ | |||||
| int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | ||||
| cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | ||||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
| cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
| cutlass::convolution::ConvType::kConvolution, \ | |||||
| cutlass::conv::ConvType::kConvolution, \ | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | ||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
| cutlass::convolution::threadblock:: \ | |||||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
| cutlass::convolution::ConvType::kConvolution>, \ | |||||
| cutlass::conv::threadblock:: \ | |||||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
| stage_, 4, aligned_, NeedLoadFromConstMem>; \ | stage_, 4, aligned_, NeedLoadFromConstMem>; \ | ||||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||||
| typename Convolution::ConvolutionParameter conv_param( \ | |||||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
| return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
| epilogue, stream); \ | epilogue, stream); \ | ||||
| @@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
| threadblock_k_>; \ | threadblock_k_>; \ | ||||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | ||||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||||
| using Convolution = cutlass::conv::device::Convolution< \ | |||||
| int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | ||||
| cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | ||||
| cutlass::layout::TensorNCHW, float, \ | cutlass::layout::TensorNCHW, float, \ | ||||
| cutlass::layout::TensorNCHW, int32_t, \ | cutlass::layout::TensorNCHW, int32_t, \ | ||||
| cutlass::convolution::ConvType::kConvolution, \ | |||||
| cutlass::conv::ConvType::kConvolution, \ | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | ||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
| cutlass::convolution::threadblock:: \ | |||||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
| cutlass::convolution::ConvType::kConvolution>, \ | |||||
| cutlass::conv::threadblock:: \ | |||||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
| stages_, 4, aligned_, NeedLoadFromConstMem, \ | stages_, 4, aligned_, NeedLoadFromConstMem, \ | ||||
| cutlass::arch::OpMultiplyAdd>; \ | cutlass::arch::OpMultiplyAdd>; \ | ||||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||||
| typename Convolution::ConvolutionParameter conv_param( \ | |||||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
| return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
| epilogue, stream); \ | epilogue, stream); \ | ||||
| @@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
| threadblock_k_>; \ | threadblock_k_>; \ | ||||
| using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | ||||
| using Convolution = cutlass::convolution::device::Convolution< \ | |||||
| using Convolution = cutlass::conv::device::Convolution< \ | |||||
| int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | ||||
| cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | ||||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
| cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
| cutlass::convolution::ConvType::kConvolution, \ | |||||
| cutlass::conv::ConvType::kConvolution, \ | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | ||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
| cutlass::convolution::threadblock:: \ | |||||
| ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
| cutlass::convolution::ConvType::kConvolution>, \ | |||||
| cutlass::conv::threadblock:: \ | |||||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
| stages_, 4, aligned_, NeedLoadFromConstMem>; \ | stages_, 4, aligned_, NeedLoadFromConstMem>; \ | ||||
| typename Convolution::ConvolutionParameter conv_param{ \ | |||||
| param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
| param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
| param.sw, param.ph, param.pw, 1, 1}; \ | |||||
| typename Convolution::ConvolutionParameter conv_param( \ | |||||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
| param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
| return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
| d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
| epilogue, stream); \ | epilogue, stream); \ | ||||
| } | } | ||||
| #define DISPATCH_KERNEL \ | #define DISPATCH_KERNEL \ | ||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||||
| DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||||
| megdnn_assert(false, \ | megdnn_assert(false, \ | ||||
| "unsupported threadblock shape (%dx%dx%d) and warp shape " \ | "unsupported threadblock shape (%dx%dx%d) and warp shape " \ | ||||
| "(%dx%dx%d)", \ | "(%dx%dx%d)", \ | ||||
| @@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( | |||||
| cudaStream_t stream) { | cudaStream_t stream) { | ||||
| typename Convolution::TensorRefSrc tensor_src{ | typename Convolution::TensorRefSrc tensor_src{ | ||||
| const_cast<typename Convolution::ElementSrc*>(d_src), | const_cast<typename Convolution::ElementSrc*>(d_src), | ||||
| Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), | |||||
| conv_param.wi(), conv_param.ci()})}; | |||||
| Convolution::LayoutSrc::packed( | |||||
| {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; | |||||
| typename Convolution::TensorRefFilter tensor_filter{ | typename Convolution::TensorRefFilter tensor_filter{ | ||||
| const_cast<typename Convolution::ElementFilter*>(d_filter), | const_cast<typename Convolution::ElementFilter*>(d_filter), | ||||
| Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), | |||||
| conv_param.fw(), | |||||
| conv_param.ci()})}; | |||||
| Convolution::LayoutFilter::packed( | |||||
| {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; | |||||
| typename Convolution::TensorRefBias tensor_bias{ | typename Convolution::TensorRefBias tensor_bias{ | ||||
| const_cast<typename Convolution::ElementBias*>(d_bias), | const_cast<typename Convolution::ElementBias*>(d_bias), | ||||
| Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; | |||||
| Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; | |||||
| typename Convolution::TensorRefDst tensor_z{ | typename Convolution::TensorRefDst tensor_z{ | ||||
| const_cast<typename Convolution::ElementDst*>(d_z), | const_cast<typename Convolution::ElementDst*>(d_z), | ||||
| Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||||
| conv_param.wo(), conv_param.co()})}; | |||||
| Convolution::LayoutDst::packed( | |||||
| {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||||
| typename Convolution::TensorRefDst tensor_dst{ | typename Convolution::TensorRefDst tensor_dst{ | ||||
| d_dst, | d_dst, | ||||
| Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||||
| conv_param.wo(), conv_param.co()})}; | |||||
| typename Convolution::Arguments arguments{ | |||||
| conv_param, tensor_src, tensor_filter, | |||||
| tensor_bias, tensor_z, tensor_dst.non_const_ref(), | |||||
| epilogue}; | |||||
| Convolution::LayoutDst::packed( | |||||
| {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||||
| typename Convolution::Arguments arguments{conv_param, | |||||
| tensor_src.non_const_ref(), | |||||
| tensor_filter.non_const_ref(), | |||||
| tensor_bias.non_const_ref(), | |||||
| tensor_z.non_const_ref(), | |||||
| tensor_dst.non_const_ref(), | |||||
| epilogue}; | |||||
| Convolution conv_op; | Convolution conv_op; | ||||
| cutlass_check(conv_op.initialize(arguments, workspace)); | cutlass_check(conv_op.initialize(arguments, workspace)); | ||||
| cutlass_check(conv_op(stream)); | cutlass_check(conv_op(stream)); | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 1, 4, 8, true, | 1, 4, 8, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 1, 4, 8, true, | 1, 4, 8, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 1, 4, 8, true, | 1, 4, 8, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 4, true, | 2, 4, 4, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 4, true, | 2, 4, 4, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 4, true, | 2, 4, 4, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 1, 4, 8, false, | 1, 4, 8, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 1, 4, 8, false, | 1, 4, 8, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 1, 4, 8, false, | 1, 4, 8, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 4, false, | 2, 4, 4, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 4, false, | 2, 4, 4, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 4, false, | 2, 4, 4, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, true, | 2, 4, 16, true, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
| @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
| int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
| using Convolution = cutlass::convolution::device::Convolution< | |||||
| using Convolution = cutlass::conv::device::Convolution< | |||||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
| LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
| cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
| cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
| cutlass::convolution::ConvType::kConvolution>, | |||||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
| 2, 4, 16, false, | 2, 4, 16, false, | ||||
| cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||