GitOrigin-RevId: 7882f9c68c
tags/v1.5.0
| @@ -37,14 +37,13 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL) | |||
| ../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py | |||
| ./$^ --type cuda $@ | |||
| ../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py cutlass_generator/generator.py | |||
| ../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator | |||
| ./gen_cuda_conv_bias_kern_impls.py --type dp4a $@ | |||
| ./gen_cutlass_conv_bias_kern_impls.py --type dp4a $@ | |||
| python3 ./cutlass_generator/generator.py --operations all --type simt $@ | |||
| ../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py | |||
| ../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator | |||
| ./gen_cuda_conv_bias_kern_impls.py --type imma $@ | |||
| ./gen_cutlass_conv_bias_kern_impls.py --type imma $@ | |||
| python3 ./cutlass_generator/generator.py --operations conv2d --type tensorop8816 $@ | |||
| ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | |||
| ./$^ --type dp4a $@ | |||
| @@ -807,9 +807,9 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst, | |||
| int* workspace, const convolution::ConvParam& param, | |||
| uint32_t nonlinear_mode, float alpha, float beta, float gamma, | |||
| float delta, float theta, float scale, uint8_t src_zero_point, | |||
| const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, | |||
| cudaStream_t stream) { | |||
| float delta, float theta, float /* scale */, | |||
| uint8_t src_zero_point, const GemmCoord& threadblock_shape, | |||
| const GemmCoord& warp_shape, cudaStream_t stream) { | |||
| #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ | |||
| threadblock_k_, warp_m_, warp_n_, \ | |||
| warp_k_) \ | |||
| @@ -878,15 +878,6 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| 0, delta, theta}; | |||
| DISPATCH_KERNEL; | |||
| } | |||
| case NonlineMode::H_SWISH: { | |||
| using EpilogueOp = cutlass::epilogue::thread:: | |||
| BiasAddLinearCombinationHSwishClamp< | |||
| ElementOutput, 16, ElementAccumulator, ElementBias, | |||
| ElementCompute>; | |||
| typename EpilogueOp::Params epilogue{alpha, beta, gamma, | |||
| scale, delta, theta}; | |||
| DISPATCH_KERNEL; | |||
| } | |||
| default: | |||
| megdnn_assert(false, | |||
| "unsupported nonlinear mode for conv bias operator"); | |||
| @@ -960,8 +951,7 @@ void megdnn::cuda::cutlass_wrapper:: | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
| cutlass::conv::threadblock:: \ | |||
| ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
| stages_, 4, aligned_, true, \ | |||
| cutlass::arch::OpMultiplyAddSaturate>; \ | |||
| stages_, 4, aligned_, true, cutlass::arch::OpMultiplyAdd>; \ | |||
| typename Convolution::ConvolutionParameter conv_param( \ | |||
| param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
| param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
| @@ -1,65 +0,0 @@ | |||
| /** | |||
| * \file | |||
| * dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "cutlass/convolution/device/convolution.h" | |||
| #include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh" | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| using namespace cutlass_wrapper; | |||
| /* Host-side launcher: wraps the raw pointers in Convolution tensor refs with packed layouts, builds Convolution::Arguments, then initializes and runs the operator on `stream`. */ template <typename Convolution> | |||
| void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( | |||
| const typename Convolution::ElementSrc* d_src, /* source tensor data */ | |||
| const typename Convolution::ElementFilter* d_filter, /* filter tensor data */ | |||
| const typename Convolution::ElementBias* d_bias, /* bias tensor data */ | |||
| const typename Convolution::ElementDst* d_z, /* extra input using the destination element type and layout */ | |||
| typename Convolution::ElementDst* d_dst, int* workspace, /* d_dst: output; workspace: forwarded to conv_op.initialize() */ | |||
| typename Convolution::ConvolutionParameter const& conv_param, /* supplies the N/H/W/C/K/R/S/P/Q extents read below */ | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, /* forwarded unchanged into Arguments */ | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param) { /* stream: passed to the operator call; extra_param: last Arguments field */ | |||
| typename Convolution::TensorRefSrc tensor_src{ /* source ref over extents {N, H, W, C} */ | |||
| const_cast<typename Convolution::ElementSrc*>(d_src), /* constness is cast away to build the ref */ | |||
| Convolution::LayoutSrc::packed( | |||
| {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; | |||
| typename Convolution::TensorRefFilter tensor_filter{ /* filter ref over extents {K, R, S, C} */ | |||
| const_cast<typename Convolution::ElementFilter*>(d_filter), | |||
| Convolution::LayoutFilter::packed( | |||
| {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; | |||
| typename Convolution::TensorRefBias tensor_bias{ /* bias ref over extents {1, 1, 1, K} */ | |||
| const_cast<typename Convolution::ElementBias*>(d_bias), | |||
| Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; | |||
| typename Convolution::TensorRefDst tensor_z{ /* z ref: same extents {N, P, Q, K} and layout type as the destination */ | |||
| const_cast<typename Convolution::ElementDst*>(d_z), | |||
| Convolution::LayoutDst::packed( | |||
| {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||
| typename Convolution::TensorRefDst tensor_dst{ /* destination ref over extents {N, P, Q, K}; d_dst is already non-const */ | |||
| d_dst, | |||
| Convolution::LayoutDst::packed( | |||
| {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||
| typename Convolution::Arguments arguments{conv_param, /* positional aggregate: field order must match Convolution::Arguments */ | |||
| tensor_src.non_const_ref(), | |||
| tensor_filter.non_const_ref(), | |||
| tensor_bias.non_const_ref(), | |||
| tensor_z.non_const_ref(), | |||
| tensor_dst.non_const_ref(), | |||
| epilogue, | |||
| {}, /* NOTE(review): value-initialized field; its meaning is defined by Convolution::Arguments, not visible here */ | |||
| {}, /* NOTE(review): second value-initialized field; confirm against Convolution::Arguments */ | |||
| extra_param}; | |||
| Convolution conv_op; | |||
| cutlass_check(conv_op.initialize(arguments, workspace)); /* status of initialize() is passed to cutlass_check */ | |||
| cutlass_check(conv_op(stream)); /* runs the operator on `stream`; returned status is passed to cutlass_check */ | |||
| after_kernel_launch(); /* project helper defined elsewhere; presumably a post-launch error check - TODO confirm */ | |||
| } | |||
| // vim: syntax=cuda.doxygen | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int4_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<64>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<64>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<64>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, 16, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 32, 32, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with int4b_t source/filter/destination elements, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 128x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationHSwishClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with int4b_t source/filter/destination elements, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 256x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationHSwishClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with int4b_t source/filter/destination elements, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 128x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with int4b_t source/filter/destination elements, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 256x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with int4b_t source/filter/destination elements, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 128x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationReluClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with int4b_t source/filter/destination elements, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 256x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationReluClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with uint4b_t source/destination elements and an int4b_t filter, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 128x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with uint4b_t source/destination elements and an int4b_t filter, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 256x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with uint4b_t source/destination elements and an int4b_t filter, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 128x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationReluClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -0,0 +1,55 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // Everything down to the closing #endif is compiled only when MEGDNN_TEGRA_X1 is zero/undefined. The pragmas below suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop". | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator. Explicitly instantiates cutlass_convolution_wrapper for a Convolution with uint4b_t source/destination elements and an int4b_t filter, TensorNCxHWx<64> activation layouts and a TensorCxRSKx<64> filter layout, OpClassTensorOp on Sm75, GemmShapes 256x128x128 (threadblock) / 64x64x128 (warp) / 8x8x32 (instruction), and the BiasAddLinearCombinationReluClamp epilogue. | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1 +0,0 @@ | |||
| ../implicit_gemm_conv_bias_cutlass_wrapper.cuinl | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py. Explicitly instantiates cutlass_convolution_wrapper for an int8_t convolution with TensorNCxHWx<4> activation layouts and a TensorCxRSKx<4> filter layout, OpClassSimt on Sm61, GemmShapes 128x128x32 (threadblock) / 64x32x32 (warp) / 1x1x4 (instruction), and the BiasAddLinearCombinationHSwishClamp epilogue. Compiled only when MEGDNN_TEGRA_X1 is zero/undefined. | |||
| // Suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop" below. | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py. Explicitly instantiates cutlass_convolution_wrapper for an int8_t convolution with TensorNCxHWx<4> activation layouts and a TensorCxRSKx<4> filter layout, OpClassSimt on Sm61, GemmShapes 128x128x32 (threadblock) / 64x32x32 (warp) / 1x1x4 (instruction), and the BiasAddLinearCombinationClamp epilogue. Compiled only when MEGDNN_TEGRA_X1 is zero/undefined. | |||
| // Suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop" below. | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py. Explicitly instantiates cutlass_convolution_wrapper for an int8_t convolution with TensorNCxHWx<4> activation layouts and a TensorCxRSKx<4> filter layout, OpClassSimt on Sm61, GemmShapes 128x128x32 (threadblock) / 64x32x32 (warp) / 1x1x4 (instruction), and the BiasAddLinearCombinationReluClamp epilogue. Compiled only when MEGDNN_TEGRA_X1 is zero/undefined. | |||
| // Suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop" below. | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py. Explicitly instantiates cutlass_convolution_wrapper for an int8_t convolution with TensorNCxHWx<4> activation layouts and a TensorCxRSKx<4> filter layout, OpClassSimt on Sm61, GemmShapes 128x32x32 (threadblock) / 64x32x32 (warp) / 1x1x4 (instruction), and the BiasAddLinearCombinationHSwishClamp epilogue. Compiled only when MEGDNN_TEGRA_X1 is zero/undefined. | |||
| // Suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop" below. | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py. Explicitly instantiates cutlass_convolution_wrapper for an int8_t convolution with TensorNCxHWx<4> activation layouts and a TensorCxRSKx<4> filter layout, OpClassSimt on Sm61, GemmShapes 128x32x32 (threadblock) / 64x32x32 (warp) / 1x1x4 (instruction), and the BiasAddLinearCombinationClamp epilogue. Compiled only when MEGDNN_TEGRA_X1 is zero/undefined. | |||
| // Suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop" below. | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py. Explicitly instantiates cutlass_convolution_wrapper for an int8_t convolution with TensorNCxHWx<4> activation layouts and a TensorCxRSKx<4> filter layout, OpClassSimt on Sm61, GemmShapes 128x32x32 (threadblock) / 64x32x32 (warp) / 1x1x4 (instruction), and the BiasAddLinearCombinationReluClamp epilogue. Compiled only when MEGDNN_TEGRA_X1 is zero/undefined. | |||
| // Suppress GCC -Wunused-parameter and -Wstrict-aliasing (warnings attributed to CUTLASS) until the matching "diagnostic pop" below. | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, 4, 8, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 4, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, false, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<4>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,36 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // generated by gen_cuda_conv_bias_int8_kern_impls.py | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" | |||
| using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; | |||
| using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; | |||
| using LayoutDst = cutlass::layout::TensorNCxHWx<32>; | |||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; | |||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
| using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| int8_t, 4, int32_t, int32_t, float>; | |||
| using Convolution = cutlass::conv::device::Convolution< | |||
| int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
| LayoutDst, int32_t, LayoutDst, int32_t, | |||
| cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, 4, 16, true, | |||
| cutlass::arch::OpMultiplyAddSaturate>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||