| @@ -3,6 +3,7 @@ | |||||
| dnn/src/cuda/conv_bias/int8/kimpl/* binary | dnn/src/cuda/conv_bias/int8/kimpl/* binary | ||||
| dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | ||||
| dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | ||||
| dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | |||||
| dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | ||||
| tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | ||||
| *.caffemodel filter=lfs diff=lfs merge=lfs -text | *.caffemodel filter=lfs diff=lfs merge=lfs -text | ||||
| @@ -8,9 +8,10 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \ | |||||
| ../src/naive/elemwise/kimpl \ | ../src/naive/elemwise/kimpl \ | ||||
| ../src/cuda/elemwise_multi_type/kimpl | ../src/cuda/elemwise_multi_type/kimpl | ||||
| CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl | |||||
| CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl | |||||
| CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl | |||||
| all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} | |||||
| all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL) | |||||
| ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py | ../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py | ||||
| ./$^ $@ | ./$^ $@ | ||||
| @@ -47,4 +48,7 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} | |||||
| ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | ../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py | ||||
| ./$^ --type dp4a $@ | ./$^ --type dp4a $@ | ||||
| ../src/cuda/matrix_mul/fp32_simt/kimpl: gen_cutlass_matmul_kern_impls.py | |||||
| ./$^ $@ | |||||
| .PHONY: all | .PHONY: all | ||||
| @@ -33,12 +33,37 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| all_algos.push_back(&bfloat16); | all_algos.push_back(&bfloat16); | ||||
| #endif | #endif | ||||
| fill_cutlass_algos(); | |||||
| for (auto&& algo : simt_float32) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| for (auto&& algo : all_algos) { | for (auto&& algo : all_algos) { | ||||
| m_all_algos_map.emplace(algo->info().desc, algo); | m_all_algos_map.emplace(algo->info().desc, algo); | ||||
| } | } | ||||
| } | } | ||||
| void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { | |||||
| using AlgoParam = AlgoFloat32SIMT::AlgoParam; | |||||
| simt_float32.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8}); | |||||
| simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8}); | |||||
| } | |||||
| MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; | MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack; | ||||
| MEGDNN_DEF_GET_ALGO_FROM_DESC(MatrixMulForwardImpl) | MEGDNN_DEF_GET_ALGO_FROM_DESC(MatrixMulForwardImpl) | ||||
| @@ -41,7 +41,8 @@ public: | |||||
| CUDA_WMMA_UINT4X4X32, | CUDA_WMMA_UINT4X4X32, | ||||
| CUDA_CUBLASLT, | CUDA_CUBLASLT, | ||||
| CUDA_NAIVE, | CUDA_NAIVE, | ||||
| CUDA_BFLOAT16 | |||||
| CUDA_BFLOAT16, | |||||
| CUDA_FLOAT32_SIMT, | |||||
| }; | }; | ||||
| using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
| @@ -165,6 +166,38 @@ private: | |||||
| }; | }; | ||||
| #endif | #endif | ||||
//! matrix mul algorithm backed by cutlass SIMT f32 kernels (CUDA >= 9.2);
//! one instance per (threadblock, warp) tile-shape configuration
class MatrixMulForwardImpl::AlgoFloat32SIMT final : public AlgoBase {
public:
    //! tile shapes of one statically instantiated cutlass kernel; must match
    //! a shape dispatched in cutlass_matrix_mul_wrapper.cu
    struct AlgoParam {
        int threadblock_m, threadblock_n, threadblock_k;
        int warp_m, warp_n, warp_k;
        //! e.g. "128X128X8_32X64X8"; used to build the algorithm name
        std::string to_string() {
            return ssprintf("%dX%dX%d_%dX%dX%d", threadblock_m, threadblock_n,
                            threadblock_k, warp_m, warp_n, warp_k);
        }
    };
    AlgoFloat32SIMT(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s",
                              m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    void exec(const ExecArgs& args) const override;
    //! deterministic: same inputs always produce the same output
    bool is_reproducible() const override { return true; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT)
    //! serialized AlgoParam (POD bytes), distinguishes tile configurations
    //! in the algorithm desc
    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }

private:
    AlgoParam m_algo_param;
    std::string m_name;
};
| class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { | class MatrixMulForwardImpl::AlgoPack : NonCopyableObj { | ||||
| private: | private: | ||||
| AlgoBase::Mapper m_all_algos_map; | AlgoBase::Mapper m_all_algos_map; | ||||
| @@ -182,9 +215,11 @@ public: | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
| #endif | #endif | ||||
| std::vector<AlgoFloat32SIMT> simt_float32; | |||||
| std::vector<AlgoBase*> all_algos; | std::vector<AlgoBase*> all_algos; | ||||
| const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | ||||
| void fill_cutlass_algos(); | |||||
| }; | }; | ||||
| } // namespace cuda | } // namespace cuda | ||||
| @@ -0,0 +1,73 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/cuda/handle.h" | |||||
| #include "src/cuda/matrix_mul/algos.h" | |||||
| #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass_wrapper; | |||||
| bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #if CUDA_VERSION >= 9200 | |||||
| return args.opr->param().format == param::MatrixMul::Format::DEFAULT && | |||||
| args.layout_a.dtype == dtype::Float32() && | |||||
| args.layout_b.dtype == dtype::Float32() && | |||||
| args.layout_c.dtype == dtype::Float32(); | |||||
| #else | |||||
| return false; | |||||
| #endif | |||||
| } | |||||
| size_t MatrixMulForwardImpl::AlgoFloat32SIMT::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0], | |||||
| ldc = args.layout_c.stride[0]; | |||||
| auto&& param = args.opr->param(); | |||||
| int m = args.layout_c.shape[0], n = args.layout_c.shape[1], | |||||
| k = args.layout_a.shape[param.transposeA ? 0 : 1]; | |||||
| GemmCoord problem_size{m, n, k}; | |||||
| return cutlass_matrix_mul_float32_simt_get_workspace_size( | |||||
| param.transposeA, lda, param.transposeB, ldb, ldc, problem_size, | |||||
| 1.f, 0.f, | |||||
| GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k}, | |||||
| GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, | |||||
| m_algo_param.warp_k}); | |||||
| } | |||||
| void MatrixMulForwardImpl::AlgoFloat32SIMT::exec(const ExecArgs& args) const { | |||||
| size_t lda = args.tensor_a.layout.stride[0], | |||||
| ldb = args.tensor_b.layout.stride[0], | |||||
| ldc = args.tensor_c.layout.stride[0]; | |||||
| auto&& param = args.opr->param(); | |||||
| int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1], | |||||
| k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1]; | |||||
| GemmCoord problem_size{m, n, k}; | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| int* workspace = reinterpret_cast<int*>(args.workspace.raw_ptr); | |||||
| return cutlass_matrix_mul_float32_simt( | |||||
| args.tensor_a.ptr<dt_float32>(), param.transposeA, lda, | |||||
| args.tensor_b.ptr<dt_float32>(), param.transposeB, ldb, | |||||
| args.tensor_c.ptr<dt_float32>(), ldc, workspace, problem_size, 1.f, | |||||
| 0.f, | |||||
| GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k}, | |||||
| GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, | |||||
| m_algo_param.warp_k}, | |||||
| stream); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,200 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #include "cuda.h" | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || \ | |||||
| (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| #include "cutlass/gemm/device/gemm.h" | |||||
| #endif | |||||
| #include "src/common/opr_param_defs_enumv.cuh" | |||||
| #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
| #pragma GCC diagnostic pop | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass_wrapper; | |||||
| /* ================= cutlass kernel wrapper for f32 matrix mul ================ | |||||
| */ | |||||
| #define DISPATCH(cb) \ | |||||
| cb(64, 256, 8, 32, 64, 8); \ | |||||
| cb(256, 64, 8, 64, 32, 8); \ | |||||
| cb(32, 256, 8, 16, 64, 8); \ | |||||
| cb(256, 32, 8, 64, 16, 8); \ | |||||
| cb(128, 128, 8, 32, 64, 8); \ | |||||
| cb(128, 64, 8, 64, 32, 8); \ | |||||
| cb(64, 128, 8, 32, 64, 8); \ | |||||
| cb(128, 32, 8, 64, 32, 8); \ | |||||
| cb(32, 128, 8, 32, 64, 8); \ | |||||
| cb(64, 64, 8, 32, 64, 8); \ | |||||
| cb(32, 64, 8, 32, 64, 8); \ | |||||
| cb(64, 32, 8, 64, 32, 8); \ | |||||
| cb(32, 32, 8, 32, 32, 8); \ | |||||
| cb(8, 32, 8, 8, 32, 8); \ | |||||
| cb(16, 32, 8, 16, 32, 8); \ | |||||
| cb(16, 64, 8, 16, 64, 8); \ | |||||
| cb(16, 128, 8, 16, 64, 8); \ | |||||
| megdnn_assert(false, \ | |||||
| "unsupported threadblock shape (%dx%dx%d) and warp shape " \ | |||||
| "(%dx%dx%d)", \ | |||||
| threadblock_shape.m(), threadblock_shape.n(), \ | |||||
| threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ | |||||
| warp_shape.k()); | |||||
/*
 * Stub vs. real implementation is selected by nvcc version: cutlass needs
 * CUDA >= 9.2, matching the cutlass include guard at the top of this file
 * and the CUDA_VERSION >= 9200 check in AlgoFloat32SIMT::is_available().
 * BUGFIX: the stub condition previously used `__CUDACC_VER_MINOR__ <= 2`,
 * which compiled the empty no-op stub on CUDA 9.2 itself — exactly the
 * oldest toolkit on which the algorithm reports itself available — so the
 * matmul silently produced no output there. It must be `< 2`.
 */
#if __CUDACC_VER_MAJOR__ < 9 || \
        (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ < 2)
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
        const float* /* d_A */, bool /* transpose_A */, size_t /* lda */,
        const float* /* d_B */, bool /* transpose_B */, size_t /* ldb */,
        float* /* d_C */, size_t /* ldc */, int* /* workspace */,
        GemmCoord const& /* problem_size */, float /* alpha */,
        float /* beta */, const GemmCoord& /* threadblock_shape */,
        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
#else
//! dispatch the runtime (threadblock, warp) tile shapes to the matching
//! statically instantiated cutlass kernel and launch it on `stream`;
//! asserts if no instantiated shape matches
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
        const float* d_A, bool transpose_A, size_t lda, const float* d_B,
        bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace,
        GemmCoord const& problem_size, float alpha, float beta,
        const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
        cudaStream_t stream) {
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_,  \
           warp_k_)                                                           \
    if (threadblock_shape.m() == threadblock_m_ &&                            \
        threadblock_shape.n() == threadblock_n_ &&                            \
        threadblock_shape.k() == threadblock_k_ &&                            \
        warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&             \
        warp_shape.k() == warp_k_) {                                          \
        using ThreadBlockShape =                                              \
                cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,      \
                                         threadblock_k_>;                     \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_,          \
                                                   warp_k_>;                  \
        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;           \
        using Gemm = cutlass::gemm::device::Gemm<                             \
                float, LayoutA, float, LayoutB, float,                        \
                cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \
                cutlass::arch::Sm50, ThreadBlockShape, WarpShape,             \
                InstructionShape, EpilogueOp,                                 \
                cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \
                2>;                                                           \
        return cutlass_matrix_mul_wrapper<Gemm>(d_A, lda, d_B, ldb, d_C, ldc, \
                                                workspace, problem_size,      \
                                                epilogue, stream);            \
    }
    static constexpr int kEpilogueElementsPerAccess = 1;
    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
            float, kEpilogueElementsPerAccess, float, float>;
    typename EpilogueOp::Params epilogue{alpha, beta};
    // four (transpose_A, transpose_B) layout combinations, each resolved
    // to compile-time LayoutA/LayoutB types consumed inside DISPATCH(cb)
    if (!transpose_A && !transpose_B) {
        using LayoutA = cutlass::layout::RowMajor;
        using LayoutB = cutlass::layout::RowMajor;
        DISPATCH(cb)
    } else if (!transpose_A && transpose_B) {
        using LayoutA = cutlass::layout::RowMajor;
        using LayoutB = cutlass::layout::ColumnMajor;
        DISPATCH(cb)
    } else if (transpose_A && !transpose_B) {
        using LayoutA = cutlass::layout::ColumnMajor;
        using LayoutB = cutlass::layout::RowMajor;
        DISPATCH(cb)
    } else {
        megdnn_assert(transpose_A && transpose_B);
        using LayoutA = cutlass::layout::ColumnMajor;
        using LayoutB = cutlass::layout::ColumnMajor;
        DISPATCH(cb)
    }
#undef cb
}
#endif
/*
 * BUGFIX: same guard defect as the launcher — the stub condition used
 * `__CUDACC_VER_MINOR__ <= 2`, which returned a zero workspace on CUDA 9.2
 * even though the real kernels are compiled in at 9.2 (see the cutlass
 * include guard at the top of this file). It must be `< 2` so that the
 * stub only covers toolkits older than 9.2.
 */
#if __CUDACC_VER_MAJOR__ < 9 || \
        (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ < 2)
size_t megdnn::cuda::cutlass_wrapper::
        cutlass_matrix_mul_float32_simt_get_workspace_size(
                bool /* transpose_A */, size_t /* lda */,
                bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */,
                GemmCoord const& /* problem_size */, float /* alpha */,
                float /* beta */, const GemmCoord& /* threadblock_shape */,
                const GemmCoord& /* warp_shape */) {
    return 0;
}
#else
//! workspace bytes required by the cutlass kernel selected by the same
//! (threadblock_shape, warp_shape) dispatch as the launcher above; the
//! tensor refs carry null pointers because only strides/shapes matter here
size_t megdnn::cuda::cutlass_wrapper::
        cutlass_matrix_mul_float32_simt_get_workspace_size(
                bool transpose_A, size_t lda, bool transpose_B, size_t ldb,
                size_t ldc, GemmCoord const& problem_size, float alpha,
                float beta, const GemmCoord& threadblock_shape,
                const GemmCoord& warp_shape) {
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_,  \
           warp_k_)                                                           \
    if (threadblock_shape.m() == threadblock_m_ &&                            \
        threadblock_shape.n() == threadblock_n_ &&                            \
        threadblock_shape.k() == threadblock_k_ &&                            \
        warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&             \
        warp_shape.k() == warp_k_) {                                          \
        using ThreadBlockShape =                                              \
                cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,      \
                                         threadblock_k_>;                     \
        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_,          \
                                                   warp_k_>;                  \
        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;           \
        using Gemm = cutlass::gemm::device::Gemm<                             \
                float, LayoutA, float, LayoutB, float,                        \
                cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \
                cutlass::arch::Sm50, ThreadBlockShape, WarpShape,             \
                InstructionShape, EpilogueOp,                                 \
                cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \
                2>;                                                           \
        typename Gemm::TensorRefA tensor_A{                                   \
                nullptr, Gemm::LayoutA{static_cast<int>(lda)}};               \
        typename Gemm::TensorRefB tensor_B{                                   \
                nullptr, Gemm::LayoutB{static_cast<int>(ldb)}};               \
        typename Gemm::TensorRefC tensor_C{                                   \
                nullptr, Gemm::LayoutC{static_cast<int>(ldc)}};               \
        typename Gemm::TensorRefD tensor_D{                                   \
                nullptr, Gemm::LayoutC{static_cast<int>(ldc)}};               \
        typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B,  \
                                           tensor_C, tensor_D, epilogue,      \
                                           split_k_slices};                   \
        return Gemm::get_workspace_size(arguments);                           \
    }
    static constexpr int kEpilogueElementsPerAccess = 1;
    static constexpr int split_k_slices = 1;
    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
            float, kEpilogueElementsPerAccess, float, float>;
    typename EpilogueOp::Params epilogue{alpha, beta};
    // four (transpose_A, transpose_B) layout combinations, mirroring the
    // launcher so both functions select the identical Gemm instantiation
    if (!transpose_A && !transpose_B) {
        using LayoutA = cutlass::layout::RowMajor;
        using LayoutB = cutlass::layout::RowMajor;
        DISPATCH(cb)
    } else if (!transpose_A && transpose_B) {
        using LayoutA = cutlass::layout::RowMajor;
        using LayoutB = cutlass::layout::ColumnMajor;
        DISPATCH(cb)
    } else if (transpose_A && !transpose_B) {
        using LayoutA = cutlass::layout::ColumnMajor;
        using LayoutB = cutlass::layout::RowMajor;
        DISPATCH(cb)
    } else {
        megdnn_assert(transpose_A && transpose_B);
        using LayoutA = cutlass::layout::ColumnMajor;
        using LayoutB = cutlass::layout::ColumnMajor;
        DISPATCH(cb)
    }
#undef cb
}
#endif
| #undef DISPATCH | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -0,0 +1,47 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "cutlass/gemm/gemm.h" | |||||
| #include "src/cuda/utils.cuh" | |||||
namespace megdnn {
namespace cuda {
namespace cutlass_wrapper {

using GemmCoord = cutlass::gemm::GemmCoord;

//! launch one statically instantiated cutlass device Gemm; explicit
//! instantiations live in the generated fp32_simt/kimpl files
template <typename Gemm>
void cutlass_matrix_mul_wrapper(
        const typename Gemm::ElementA* d_A, size_t lda,
        const typename Gemm::ElementB* d_B, size_t ldb,
        typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
        GemmCoord const& problem_size,
        typename Gemm::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream);

//! runtime dispatcher: picks the instantiated f32 SIMT kernel matching
//! (threadblock_shape, warp_shape) and computes C = alpha*A*B + beta*C;
//! lda/ldb/ldc are leading dimensions, transpose_* select operand layouts
void cutlass_matrix_mul_float32_simt(
        const float* d_A, bool transpose_A, size_t lda, const float* d_B,
        bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace,
        GemmCoord const& problem_size, float alpha, float beta,
        const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
        cudaStream_t stream);

//! workspace bytes needed by the kernel the same dispatch would select
size_t cutlass_matrix_mul_float32_simt_get_workspace_size(
        bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc,
        GemmCoord const& problem_size, float alpha, float beta,
        const GemmCoord& threadblock_shape, const GemmCoord& warp_shape);

}  // namespace cutlass_wrapper
}  // namespace cuda
}  // namespace megdnn
| // vim: syntax=cuda.doxygen | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 256, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::RowMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::RowMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,33 @@ | |||||
| #if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) | |||||
| // generated by gen_cutlass_matrix_mul_kern_impls.py | |||||
| // ignore warning of cutlass | |||||
| #pragma GCC diagnostic push | |||||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||||
| #include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl" | |||||
| using LayoutA = cutlass::layout::ColumnMajor; | |||||
| using LayoutB = cutlass::layout::ColumnMajor; | |||||
| using ThreadBlockShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using WarpShape = cutlass::gemm::GemmShape<8, 32, 8>; | |||||
| using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; | |||||
| using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>; | |||||
| using Gemm = cutlass::gemm::device::Gemm< | |||||
| float, LayoutA, | |||||
| float, LayoutB, | |||||
| float, cutlass::layout::RowMajor, float, | |||||
| cutlass::arch::OpClassSimt, cutlass::arch::Sm50, | |||||
| ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||||
| cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, | |||||
| 2>; | |||||
| template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, | |||||
| int* workspace, | |||||
| cutlass::gemm::GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream); | |||||
| #pragma GCC diagnostic pop | |||||
| #endif | |||||
| @@ -0,0 +1,52 @@ | |||||
| /** | |||||
| * \file | |||||
| * dnn/src/cuda/matrix_mul/matrix_mul_float_simt_cutlass_wrapper.cuinl | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "cutlass/gemm/device/gemm.h" | |||||
| #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass_wrapper; | |||||
| template <typename Gemm> | |||||
| void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( | |||||
| const typename Gemm::ElementA* d_A, size_t lda, | |||||
| const typename Gemm::ElementB* d_B, size_t ldb, | |||||
| typename Gemm::ElementC* d_C, size_t ldc, int* workspace, | |||||
| GemmCoord const& problem_size, | |||||
| typename Gemm::EpilogueOutputOp::Params const& epilogue, | |||||
| cudaStream_t stream) { | |||||
| typename Gemm::TensorRefA tensor_a{ | |||||
| const_cast<typename Gemm::ElementA*>(d_A), | |||||
| typename Gemm::LayoutA{static_cast<int>(lda)}}; | |||||
| typename Gemm::TensorRefB tensor_b{ | |||||
| const_cast<typename Gemm::ElementB*>(d_B), | |||||
| typename Gemm::LayoutB{static_cast<int>(ldb)}}; | |||||
| typename Gemm::TensorRefC tensor_c{ | |||||
| nullptr, typename Gemm::LayoutC{static_cast<int>(ldc)}}; | |||||
| typename Gemm::TensorRefD tensor_d{ | |||||
| d_C, typename Gemm::LayoutC{static_cast<int>(ldc)}}; | |||||
| typename Gemm::Arguments arguments{problem_size, | |||||
| tensor_a, | |||||
| tensor_b, | |||||
| tensor_c, | |||||
| tensor_d.non_const_ref(), | |||||
| epilogue, | |||||
| 1}; | |||||
| Gemm gemm_op; | |||||
| cutlass_check(gemm_op.initialize(arguments, workspace)); | |||||
| cutlass_check(gemm_op(stream)); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -41,6 +41,7 @@ public: | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| class AlgoBFloat16; | class AlgoBFloat16; | ||||
| #endif | #endif | ||||
| class AlgoFloat32SIMT; | |||||
| class AlgoPack; | class AlgoPack; | ||||
| static const AlgoPack& algo_pack() { | static const AlgoPack& algo_pack() { | ||||
| @@ -0,0 +1,337 @@ | |||||
| /** | |||||
| * \file dnn/test/cuda/cutlass_matmul.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include <cuda.h> | |||||
| #include "megdnn/oprs/linalg.h" | |||||
| #include "src/common/utils.h" | |||||
| #include "test/common/checker.h" | |||||
| #include "test/common/matrix_mul.h" | |||||
| #include "test/common/tensor.h" | |||||
| #include "test/common/workspace_wrapper.h" | |||||
| #include "test/cuda/benchmark.h" | |||||
| #include "test/cuda/fixture.h" | |||||
| #include "test/cuda/utils.h" | |||||
| #if CUDA_VERSION >= 9020 | |||||
| namespace megdnn { | |||||
| namespace test { | |||||
| namespace { | |||||
| void test_multibatchsize( | |||||
| Handle* handle_cuda, DType A_dtype, DType B_dtype, DType C_dtype, | |||||
| const char* algo, const std::vector<matrix_mul::TestArg>& args, | |||||
| param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT, | |||||
| const std::function<bool(const matrix_mul::TestArg&)>& filter = {}) { | |||||
| Checker<MatrixMulForward> checker(handle_cuda, false); | |||||
| if (algo) { | |||||
| checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>(algo)); | |||||
| } | |||||
| std::unique_ptr<RNG> rng; | |||||
| if (A_dtype.enumv() == DTypeEnum::Float32) { | |||||
| rng = std::make_unique<UniformFloatRNG>(-1, 1); | |||||
| megdnn_assert(B_dtype.enumv() == DTypeEnum::Float32 && | |||||
| C_dtype.enumv() == DTypeEnum::Float32); | |||||
| } | |||||
| megdnn_assert(rng != nullptr); | |||||
| struct Compare { | |||||
| bool is_same(dt_float32 expected, dt_float32 actual) const { | |||||
| return expected == actual; | |||||
| } | |||||
| }; | |||||
| // copy rhs->lhs, lhs is 8 times of rhs | |||||
| auto copy = [](SyncedTensor<dt_float32, Compare>& lhs, | |||||
| SyncedTensor<dt_float32, Compare>& rhs) { | |||||
| size_t chunk = rhs.layout().span().dist_byte(); | |||||
| size_t tot = lhs.layout().span().dist_byte(); | |||||
| megdnn_assert(tot % chunk == 0); | |||||
| char* pointer_lhs = reinterpret_cast<char*>(lhs.ptr_mutable_host()); | |||||
| const char* pointer_rhs = reinterpret_cast<const char*>(rhs.ptr_host()); | |||||
| for (size_t i = 0; i < tot; i += chunk) { | |||||
| std::memcpy(pointer_lhs + i, pointer_rhs, chunk); | |||||
| } | |||||
| }; | |||||
| using Param = param::MatrixMul; | |||||
| megdnn_assert(format == Param::Format::DEFAULT); | |||||
| for (auto&& arg : args) { | |||||
| megdnn_assert(arg.mask == 0x0); | |||||
| // make m, n, k big enough | |||||
| size_t m = arg.m, n = (arg.n << 3), k = (arg.k << 3); | |||||
| size_t m_prime = (m << 3); | |||||
| if (filter && filter(arg)) | |||||
| continue; | |||||
| TensorShape A{m, k}, B{k, n}, C{m, n}; | |||||
| TensorShape A_prime{m_prime, k}, C_prime{m_prime, n}; | |||||
| SyncedTensor<dt_float32, Compare> A_tensor{handle_cuda, {A, A_dtype}}, | |||||
| B_tensor{handle_cuda, {B, B_dtype}}, | |||||
| C_tensor{handle_cuda, {C, C_dtype}}, | |||||
| A_tensor_prime{handle_cuda, {A_prime, A_dtype}}, | |||||
| C_tensor_prime{handle_cuda, {C_prime, C_dtype}}, | |||||
| C_tensor_batch{handle_cuda, {C_prime, C_dtype}}; | |||||
| rng->gen(A_tensor.tensornd_host()); | |||||
| rng->gen(B_tensor.tensornd_host()); | |||||
| copy(A_tensor_prime, A_tensor); | |||||
| auto opr_reference = handle_cuda->create_operator<MatrixMulForward>(); | |||||
| { | |||||
| opr_reference->execution_policy().algo.reset(); | |||||
| for (auto i : opr_reference->get_all_algorithms_info( | |||||
| A_tensor.layout(), B_tensor.layout(), | |||||
| C_tensor.layout())) { | |||||
| if (std::regex_match( | |||||
| i.name.c_str(), | |||||
| std::regex("(" + std::string(algo) + ")(.*)"))) { | |||||
| opr_reference->execution_policy().algo = i; | |||||
| break; | |||||
| } | |||||
| } | |||||
| megdnn_assert(opr_reference->execution_policy().algo.valid()); | |||||
| size_t ws_size = opr_reference->get_workspace_in_bytes( | |||||
| A_tensor.layout(), B_tensor.layout(), C_tensor.layout()); | |||||
| WorkspaceWrapper ws_reference(handle_cuda, ws_size); | |||||
| opr_reference->exec( | |||||
| A_tensor.tensornd_dev(), B_tensor.tensornd_dev(), | |||||
| C_tensor.tensornd_dev(), ws_reference.workspace()); | |||||
| } | |||||
| copy(C_tensor_prime, C_tensor); | |||||
| checker.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype) | |||||
| .set_epsilon(1e-6) | |||||
| .exect({A_tensor_prime.tensornd_host(), | |||||
| B_tensor.tensornd_host(), | |||||
| {}}, | |||||
| {{}, {}, C_tensor_prime.tensornd_host()}); | |||||
| { | |||||
| opr_reference->execution_policy().algo.reset(); | |||||
| for (auto i : opr_reference->get_all_algorithms_info( | |||||
| A_tensor_prime.layout(), B_tensor.layout(), | |||||
| C_tensor_batch.layout())) { | |||||
| if (std::regex_match( | |||||
| i.name.c_str(), | |||||
| std::regex("(" + std::string(algo) + ")(.*)"))) { | |||||
| opr_reference->execution_policy().algo = i; | |||||
| break; | |||||
| } | |||||
| } | |||||
| megdnn_assert(opr_reference->execution_policy().algo.valid()); | |||||
| size_t ws_size = opr_reference->get_workspace_in_bytes( | |||||
| A_tensor_prime.layout(), B_tensor.layout(), | |||||
| C_tensor_batch.layout()); | |||||
| WorkspaceWrapper ws_reference(handle_cuda, ws_size); | |||||
| opr_reference->exec( | |||||
| A_tensor_prime.tensornd_dev(), B_tensor.tensornd_dev(), | |||||
| C_tensor_batch.tensornd_dev(), ws_reference.workspace()); | |||||
| } | |||||
| C_tensor_batch.check_with(C_tensor_prime); | |||||
| } | |||||
| } | |||||
| #if MEGDNN_WITH_BENCHMARK | |||||
| struct BenchArgs { | |||||
| size_t m, n, k, mask = 0x0; | |||||
| }; | |||||
| std::vector<BenchArgs> get_square_matmul_args() { | |||||
| std::vector<BenchArgs> args; | |||||
| args.emplace_back(BenchArgs{128, 128, 128}); | |||||
| args.emplace_back(BenchArgs{256, 256, 256}); | |||||
| args.emplace_back(BenchArgs{512, 512, 512}); | |||||
| args.emplace_back(BenchArgs{1024, 1024, 1024}); | |||||
| args.emplace_back(BenchArgs{2048, 2048, 2048}); | |||||
| args.emplace_back(BenchArgs{4096, 4096, 4096}); | |||||
| return args; | |||||
| } | |||||
| std::vector<BenchArgs> get_feat_model_args() { | |||||
| std::vector<BenchArgs> args; | |||||
| args.emplace_back(BenchArgs{2, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{2, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{2, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{2, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{1, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{2, 864, 864}); | |||||
| args.emplace_back(BenchArgs{2, 9, 64}); | |||||
| args.emplace_back(BenchArgs{4, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{4, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{4, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{4, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{2, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{4, 864, 864}); | |||||
| args.emplace_back(BenchArgs{4, 9, 64}); | |||||
| args.emplace_back(BenchArgs{8, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{8, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{8, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{8, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{4, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{8, 864, 864}); | |||||
| args.emplace_back(BenchArgs{4, 9, 64}); | |||||
| args.emplace_back(BenchArgs{16, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{16, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{16, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{16, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{8, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{16, 864, 864}); | |||||
| args.emplace_back(BenchArgs{8, 9, 64}); | |||||
| args.emplace_back(BenchArgs{32, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{32, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{32, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{32, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{16, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{32, 864, 864}); | |||||
| args.emplace_back(BenchArgs{32, 9, 64}); | |||||
| args.emplace_back(BenchArgs{64, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{64, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{64, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{64, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{32, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{64, 864, 864}); | |||||
| args.emplace_back(BenchArgs{64, 9, 64}); | |||||
| args.emplace_back(BenchArgs{128, 4096, 4096}); | |||||
| args.emplace_back(BenchArgs{128, 1024, 6912}); | |||||
| args.emplace_back(BenchArgs{128, 3456, 3456}); | |||||
| args.emplace_back(BenchArgs{128, 2304, 2304}); | |||||
| args.emplace_back(BenchArgs{64, 256, 8192}); | |||||
| args.emplace_back(BenchArgs{128, 864, 864}); | |||||
| args.emplace_back(BenchArgs{128, 9, 64}); | |||||
| return args; | |||||
| } | |||||
| void benchmark_matrix_mul( | |||||
| Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype, | |||||
| DType B_dtype, DType C_dtype, const char* algo = nullptr, | |||||
| param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT) { | |||||
| megdnn_assert(A_dtype.enumv() == B_dtype.enumv()); | |||||
| CUBenchmarker<MatrixMulForward> benchmarker(handle); | |||||
| CUBenchmarker<MatrixMulForward> benchmarker_cublas(handle); | |||||
| size_t RUNS = 1000; | |||||
| benchmarker.set_display(false).set_times(RUNS); | |||||
| benchmarker_cublas.set_display(false).set_times(RUNS); | |||||
| benchmarker_cublas.set_before_exec_callback( | |||||
| AlgoChecker<MatrixMulForward>("CUBLAS")); | |||||
| benchmarker.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype); | |||||
| benchmarker_cublas.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype); | |||||
| using Param = MatrixMul::Param; | |||||
| for (auto&& arg : args) { | |||||
| size_t m = arg.m, n = arg.n, k = arg.k; | |||||
| Param param; | |||||
| param.transposeA = arg.mask & 0x1; | |||||
| param.transposeB = arg.mask & 0x2; | |||||
| param.format = format; | |||||
| size_t A0 = m, A1 = k, B0 = k, B1 = n; | |||||
| if (param.transposeA) { | |||||
| std::swap(A0, A1); | |||||
| } | |||||
| if (param.transposeB) { | |||||
| std::swap(B0, B1); | |||||
| } | |||||
| benchmarker.set_param(param); | |||||
| TensorShape A{A0, A1}, B{B0, B1}, C{m, n}; | |||||
| float time_in_ms = 0.f; | |||||
| if (algo) { | |||||
| time_in_ms = | |||||
| algo_benchmark<MatrixMulForward, OprProxy<MatrixMulForward>, | |||||
| CUTimer>(benchmarker, {A, B, C}, algo) / | |||||
| RUNS; | |||||
| } else { | |||||
| time_in_ms = benchmarker.execs({A, B, C}) / RUNS; | |||||
| } | |||||
| benchmarker_cublas.set_param(param); | |||||
| auto time_in_ms_cublas = benchmarker_cublas.execs({A, B, C}) / RUNS; | |||||
| float flo = 2.0 * m * n * k / (1e12); | |||||
| printf("A=%s, B=%s, C=%s, time(algo=%s)=%.2f %.2fTops, " | |||||
| "time(cublas)=%.2f %.2fTops, " | |||||
| "perf(algo=%s)/perf(cublas)=%.2f\n", | |||||
| A.to_string().c_str(), B.to_string().c_str(), | |||||
| C.to_string().c_str(), algo, time_in_ms, | |||||
| (flo / (time_in_ms * 1e-3)), time_in_ms_cublas, | |||||
| (flo / (time_in_ms_cublas * 1e-3)), algo, | |||||
| time_in_ms_cublas / time_in_ms); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| } // namespace | |||||
| TEST_F(CUDA, CUTLASS_GEMM_MULTI_BATCHSIZE) { | |||||
| auto args = matrix_mul::get_matmul_args_no_mask(); | |||||
| test_multibatchsize(handle_cuda(), dtype::Float32(), dtype::Float32(), | |||||
| dtype::Float32(), | |||||
| "CUTLASS_FLOAT32_SIMT_128X128X8_32X64X8", args, | |||||
| param::MatrixMul::Format::DEFAULT); | |||||
| } | |||||
| #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | |||||
| cb(1, 64, 256, 8, 32, 64, 8); \ | |||||
| cb(2, 256, 64, 8, 64, 32, 8); \ | |||||
| cb(3, 32, 256, 8, 16, 64, 8); \ | |||||
| cb(4, 256, 32, 8, 64, 16, 8); \ | |||||
| cb(5, 128, 128, 8, 32, 64, 8); \ | |||||
| cb(6, 128, 64, 8, 64, 32, 8); \ | |||||
| cb(7, 64, 128, 8, 32, 64, 8); \ | |||||
| cb(8, 128, 32, 8, 64, 32, 8); \ | |||||
| cb(9, 32, 128, 8, 32, 64, 8); \ | |||||
| cb(10, 64, 64, 8, 32, 64, 8); \ | |||||
| cb(11, 32, 64, 8, 32, 64, 8); \ | |||||
| cb(12, 64, 32, 8, 64, 32, 8); \ | |||||
| cb(13, 32, 32, 8, 32, 32, 8); \ | |||||
| cb(14, 8, 32, 8, 8, 32, 8); \ | |||||
| cb(15, 16, 32, 8, 16, 32, 8); \ | |||||
| cb(16, 16, 64, 8, 16, 64, 8); \ | |||||
| cb(17, 16, 128, 8, 16, 64, 8); | |||||
| #define cb(name, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CUTLASS_GEMM_##name) { \ | |||||
| matrix_mul::check_matrix_mul<MatrixMulForward>( \ | |||||
| dtype::Float32(), dtype::Float32(), dtype::Float32(), \ | |||||
| handle_cuda(), \ | |||||
| "CUTLASS_FLOAT32_SIMT_" #tbm "X" #tbn "X" #tbk "_" #wm "X" #wn \ | |||||
| "X" #wk); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_KERNEL | |||||
| #if MEGDNN_WITH_BENCHMARK | |||||
| TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) { | |||||
| benchmark_matrix_mul(handle_cuda(), get_square_matmul_args(), | |||||
| dtype::Float32(), dtype::Float32(), dtype::Float32(), | |||||
| "CUTLASS_FLOAT32_SIMT"); | |||||
| } | |||||
| TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) { | |||||
| benchmark_matrix_mul(handle_cuda(), get_feat_model_args(), dtype::Float32(), | |||||
| dtype::Float32(), dtype::Float32(), | |||||
| "CUTLASS_FLOAT32_SIMT"); | |||||
| } | |||||
| #endif | |||||
| } // namespace test | |||||
| } // namespace megdnn | |||||
| #endif | |||||
| // vim: syntax=cpp.doxygen | |||||