@@ -0,0 +1,230 @@
/**
 * \file dnn/src/fallback/conv_bias/conv1x1/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "src/fallback/conv_bias/conv1x1/algos.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h"
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
#include "src/fallback/conv_bias/opr_impl.h"

#include "megdnn/opr_param_defs.h"
#include "src/naive/convolution/helper.h"

#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif

#include "midout.h"
MIDOUT_DECL(megdnn_fallback_conv1x1)

using namespace megdnn;
using namespace fallback;
#if MEGDNN_X86
using namespace x86;
#endif
using namespace conv1x1;

size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic(
        const NCBKernSizeParam& param) const {
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t OC = param.filter_meta.ocpg;
    if (OH * OW >= 56 * 56 || OC >= 64)
        return m_oc_block_size;
    return div_ceil(OC, param.nr_threads);
}
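//! NOTE (review sketch): for small problems the tile shrinks so that each
//! thread gets roughly one OC tile, while large problems keep the block size
//! registered with the algo (24 or 48 in opr_impl.cpp below). A hypothetical
//! small case, OC = 32 with 4 threads:
#if 0
size_t demo_tile = div_ceil(32_z, 4_z);  // -> 8 output channels per tile
#endif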
size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
        ConvBiasImpl*, const NCBKernSizeParam& param) const {
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
    auto matmul_param =
            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);

    auto pack_mode = m_matmul_algo->packmode();
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
                    dispatcher;
            return dispatcher
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
        }
        MIDOUT_END();
    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
                    dispatcher;
            return dispatcher
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
        }
        MIDOUT_END();
    } else {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>
                    dispatcher;
            return dispatcher
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
        }
        MIDOUT_END();
    }
    return 0;
}
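//! NOTE (review sketch): whatever the pack mode, the bundle computed above
//! decomposes into three regions; a rough picture with hypothetical field
//! names (see Conv1x1Kerns::get_bundle in conv1x1_dispatcher.h):
#if 0
struct Conv1x1WorkspaceLayout {
    size_t packa_bytes;   // GROUP * oc_tiles_per_group * packa_per_tile
    size_t packb_bytes;   // BATCH * GROUP * packb_per_group (DEFAULT only)
    size_t thread_bytes;  // nr_threads * (matmul temp + 8-bit dst staging)
    size_t total() const { return packa_bytes + packb_bytes + thread_bytes; }
};
#endif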
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
        ConvBiasImpl* opr, const NCBKernSizeParam& param) const {
    SmallVector<ConvBiasImpl::NCBKern> ret_kern;
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t OC = param.filter_meta.ocpg;
    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
    size_t GROUP = param.filter_meta.group;
    size_t BATCH = param.n;
    size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);

    auto matmul_param =
            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);

    WorkspaceBundle whole_bundle = {nullptr, {}};
    WorkspaceBundle thread_bundle = {nullptr, {}};
    WorkspaceBundle matmul_bundle = {nullptr, {}};

    auto pack_mode = m_matmul_algo->packmode();
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT>
                    dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
        }
        MIDOUT_END();
    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
                    dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
        }
        MIDOUT_END();
    } else {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK>
                    dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = {
                    nullptr,
                    {0, 0, m_matmul_algo->get_workspace(matmul_param)}};
        }
        MIDOUT_END();
    }

    //! get thread bundle
    thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
                                      compt_oc_block_size);

    Conv1x1StrategyBase* conv1x1_strategy =
            Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
                                                  opr->param().format);

    auto kern_packA = [this, whole_bundle, matmul_bundle, param,
                       compt_oc_block_size, conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                compt_oc_block_size, this->m_matmul_algo,
                                param, ncb_param, std::move(ncb_index));
    };
    auto kern_packB = [this, whole_bundle, matmul_bundle, param,
                       conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->packB(whole_bundle, matmul_bundle,
                                this->m_matmul_algo, param, ncb_param,
                                std::move(ncb_index));
    };
    auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle,
                       param, compt_oc_block_size, conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle,
                               compt_oc_block_size, this->m_matmul_algo,
                               param, ncb_param, std::move(ncb_index));
    };

    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT ||
        pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
            ret_kern.push_back({kern_packB, {1}});
        }
    }
    ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
    return ret_kern;
}
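//! NOTE (review sketch): for PackMode::DEFAULT the kerns returned above run
//! in order -- filters are packed once per (group, oc tile), src is packed
//! once, then the compute kern covers the full ndrange. Assuming a
//! hypothetical run(kern, ndrange) helper in the dispatcher runtime:
#if 0
run(kern_packA, {GROUP, oc_blocks_per_group});         // pack filter tiles
run(kern_packB, {1});                                  // pack all src panels
run(kern_compt, {BATCH, GROUP, oc_blocks_per_group});  // GEMM + postprocess
#endif
//! ONLY_PACKA drops kern_packB; NO_PACK keeps only kern_compt.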
bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
                                       const NCBKernSizeParam& param,
                                       AlgoSelectionStrategy) const {
    MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) {
        //! only support nchw format
        if (opr->param().format != param::ConvBias::Format::NCHW)
            return false;

        size_t FH = param.filter_meta.spatial[0],
               FW = param.filter_meta.spatial[1];
        size_t PH = param.filter_meta.padding[0],
               PW = param.filter_meta.padding[1];
        size_t SH = param.filter_meta.stride[0],
               SW = param.filter_meta.stride[1];
        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1)
            return false;

        //! 8x8x16 and 8x8x32 only support bias_mode NO_BIAS and nonlineMode
        //! IDENTITY; return false otherwise, since 8x8x32 and 8x8x16 do not
        //! support PostProcess
        if (param.src_type.enumv() == param.filter_type.enumv() &&
            (param.src_type.enumv() == DTypeEnum::Int8 &&
             (param.dst_type.enumv() == DTypeEnum::Int16 ||
              param.dst_type.enumv() == DTypeEnum::Int32)) &&
            param.bias_mode != megdnn::BiasMode::NO_BIAS &&
            param.nonlineMode != megdnn::NonlineMode::IDENTITY)
            return false;
        if (param.src_type.enumv() == param.filter_type.enumv() &&
            ((param.src_type.enumv() == DTypeEnum::QuantizedS8 ||
              param.src_type.enumv() == DTypeEnum::Quantized8Asymm) &&
             param.dst_type.enumv() == DTypeEnum::QuantizedS32) &&
            param.bias_mode != megdnn::BiasMode::NO_BIAS &&
            param.nonlineMode != megdnn::NonlineMode::IDENTITY)
            return false;

        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(
                param, OH * OW, get_oc_tile_size_heuristic(param));
        bool matmul_usable = m_matmul_algo->usable(matmul_param);

        return matmul_usable &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
    }
    MIDOUT_END();
    return false;
}
@@ -0,0 +1,56 @@
/**
 * \file dnn/src/fallback/conv_bias/conv1x1/algos.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include "megdnn/thin/small_vector.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"

namespace megdnn {
namespace fallback {

class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase {
public:
    AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size)
            : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {}

    bool is_reproducible() const override { return true; }

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ssprintf("CONV1x1:%s:%zu", m_matmul_algo->name(),
                              m_oc_block_size);
        }
        return m_name.c_str();
    }

    bool usable(ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    SmallVector<NCBKern> dispatch_kerns(
            ConvBiasImpl* opr, const NCBKernSizeParam& param) const override;

protected:
    size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const;

private:
    MatrixMulImpl::AlgoBase* m_matmul_algo;
    mutable std::string m_name;
    mutable size_t m_oc_block_size = 0;
};

}  // namespace fallback
}  // namespace megdnn

// vim: syntax=cpp.doxygen
@@ -0,0 +1,99 @@
/**
 * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"

namespace megdnn {
namespace fallback {
namespace conv1x1 {
namespace {

//! get_thread_bundle
WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
                                  size_t matmul_c_size, size_t oc_tile_size) {
    //! in some cases the matmul result needs temporary space to be stored
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
    size_t matmul_dst_bytes_per_thread =
            is_dst_8bit ? oc_tile_size * OH * OW * sizeof(param.bias_type) : 0;
    return WorkspaceBundle{nullptr,
                           {matmul_c_size, matmul_dst_bytes_per_thread}};
}

}  // anonymous namespace
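//! NOTE (review sketch): the staging buffer only exists when the matmul
//! accumulates in a wider type than the conv output. Hypothetical case:
//! qint8 -> qint8, oc_tile_size = 24, OH = OW = 28, int32 accumulation:
#if 0
size_t staging_bytes = 24 * 28 * 28 * sizeof(int32_t);  // 75264 B per thread
#endif
//! for 32-bit outputs the matmul writes directly into the conv dst and the
//! second bundle slot is 0.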
template <MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Kerns {
public:
    //! get_bundle
    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
                               const MatrixMulImpl::KernSizeParam& matmul_param,
                               const MatrixMulImpl::AlgoBase* matmul_algo,
                               size_t oc_tile_size) {
        size_t GROUP = param.filter_meta.group;
        size_t OC = param.filter_meta.ocpg;
        size_t BATCH = param.n;

        //! bundle per thread
        //! matmul_param records a matmul with M = oc_tile_size, K = IC and
        //! N = OH * OW; the packb size does not depend on the oc tile
        auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
        auto thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
                                               oc_tile_size);

        //! size over all threads
        size_t all_threads_bytes =
                thread_bundle.total_size_in_bytes() * param.nr_threads;

        //! packa size = GROUP * packa_size_each_group
        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
        size_t all_packa_bytes =
                packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;

        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA)
            return WorkspaceBundle{nullptr,
                                   {all_packa_bytes, 0, all_threads_bytes}};

        //! packb size = BATCH * GROUP * packb_size_per_group
        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
        size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH;

        return WorkspaceBundle{
                nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}};
    }
};
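//! NOTE (review sketch): a worked instance of the bundle above with
//! hypothetical sizes -- GROUP = 1, BATCH = 2, OC = 48, oc_tile_size = 24,
//! hence two oc tiles per group:
#if 0
WorkspaceBundle bundle_demo(size_t packa_per_tile, size_t packb_per_group,
                            size_t per_thread, size_t nr_threads) {
    return WorkspaceBundle{nullptr,
                           {packa_per_tile * 2 * 1, packb_per_group * 1 * 2,
                            per_thread * nr_threads}};
}
#endif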
template <>
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> {
public:
    //! get_bundle
    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
                               const MatrixMulImpl::KernSizeParam& matmul_param,
                               const MatrixMulImpl::AlgoBase* matmul_algo,
                               size_t oc_tile_size) {
        size_t matmul_size = matmul_algo->get_workspace(matmul_param);
        auto thread_bundle =
                get_thread_bundle(param, matmul_size, oc_tile_size);
        //! size over all threads
        size_t all_threads_bytes =
                thread_bundle.total_size_in_bytes() * param.nr_threads;
        return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}};
    }
};

}  // namespace conv1x1
}  // namespace fallback
}  // namespace megdnn
@@ -0,0 +1,214 @@
/**
 * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include <unordered_map>
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"

#include "midout.h"
MIDOUT_DECL(megdnn_fallback_conv1x1_factory_strategy)

namespace megdnn {
namespace fallback {
namespace conv1x1 {
namespace {

struct StrategyHashParam {
    ConvBiasImpl::NCBKernSizeParam param;
    param::ConvBias::Format format;
    MatrixMulImpl::AlgoBase::PackMode packmode;
};

struct StrategyHashParamHash {
    std::size_t operator()(const StrategyHashParam& sparam) const {
        constexpr size_t base = 1;  //! avoid a hash key of zero
        std::size_t result =
                static_cast<std::size_t>(sparam.param.src_type.enumv()) + base;
        result = result ^
                 ((static_cast<std::size_t>(sparam.param.dst_type.enumv()) +
                   base)
                  << 3);
        result = result ^
                 ((static_cast<std::size_t>(sparam.param.filter_type.enumv()) +
                   base)
                  << 6);
        result = result ^
                 ((static_cast<std::size_t>(sparam.param.bias_type.enumv()) +
                   base)
                  << 9);
        result = result ^
                 ((static_cast<std::size_t>(sparam.format) + base) << 12);
        result = result ^
                 ((static_cast<std::size_t>(sparam.packmode) + base) << 15);
        return result;
    };
};
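//! NOTE (review): shifting each enum by a fixed 3 bits before xor-ing can
//! collide once enum values exceed 7; a boost-style hash_combine is the
//! usual alternative (sketch only, not wired in here):
#if 0
inline void hash_combine(std::size_t& seed, std::size_t v) {
    // 0x9e3779b9 is the 32-bit golden-ratio constant used by boost
    seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
#endif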
struct StrategyHashParamEqual {
    bool operator()(const StrategyHashParam& param1,
                    const StrategyHashParam& param2) const {
        bool flags = true;
        flags = param1.param.src_type == param2.param.src_type && flags;
        flags = param1.param.filter_type == param2.param.filter_type && flags;
        flags = param1.param.bias_type == param2.param.bias_type && flags;
        flags = param1.param.dst_type == param2.param.dst_type && flags;
        flags = param1.format == param2.format && flags;
        flags = param1.packmode == param2.packmode && flags;
        return flags;
    };
};
std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
        const ConvBiasImpl::NCBKernSizeParam& param,
        MatrixMulImpl::AlgoBase::PackMode pack_mode,
        param::ConvBias::Format format) {
    MEGDNN_MARK_USED_VAR(format);
#define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag)     \
    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,                   \
                 midout_iv(_midout_tag)) {                                   \
        if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) {           \
            return std::make_unique<                                         \
                    Conv1x1Strategy<_dt, _dt, _dt, _post_ctype, _post_ctype, \
                                    _postprocess_mode, _packmode>>();        \
        }                                                                    \
    }                                                                        \
    MIDOUT_END()

#define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \
            _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag)       \
    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,                 \
                 midout_iv(_midout_tag)) {                                 \
        if (param.filter_type.enumv() == param.src_type.enumv() &&         \
            param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv &&    \
            param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) {    \
            return std::make_unique<                                       \
                    Conv1x1Strategy<_src_ctype, _bias_ctype, _dst_ctype,   \
                                    DTypeTrait<_i_bias_type>::ctype,       \
                                    DTypeTrait<_i_dst_type>::ctype,        \
                                    _postprocess_mode, _packmode>>();      \
        }                                                                  \
    }                                                                      \
    MIDOUT_END()

    switch (pack_mode) {
        case MatrixMulImpl::AlgoBase::PackMode::DEFAULT:
            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float32,
                dt_float32, PostprocessMode::FLOAT, "Default::FLOAT"_hash);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16, __fp16,
                PostprocessMode::FLOAT, "Default::FLOAT16_FP16"_hash);
#else
#if !MEGDNN_DISABLE_FLOAT16
            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16,
                dt_float16, PostprocessMode::NO_PROCESS,
                "Default::FLOAT16_FLOAT16"_hash);
#endif
#endif
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int32,
                dt_int32, dt_int8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS, "Default::INT8x8x32_INT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int16,
                dt_int16, dt_int8, dt_int16, dt_int16,
                PostprocessMode::NO_PROCESS, "Default::INT8x8x16_INT16"_hash);
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
                dtype::Quantized8Asymm, dtype::QuantizedS32,
                dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS,
                "Default::QUINT8x8x32_QINT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
                dtype::Quantized8Asymm, dtype::QuantizedS32,
                dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8,
                PostprocessMode::QUANTIZED, "Default::QUINT8x8x32_QUINT8"_hash);
#endif
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
                dt_int32, PostprocessMode::NO_PROCESS,
                "Default::QINT8x8x32_QINT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32,
                dt_int8, PostprocessMode::QUANTIZED,
                "Default::QINT8x8x32_QINT8"_hash);
            break;

        case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA:
            cb1(MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA, dt_float32,
                dt_float32, PostprocessMode::FLOAT, "OnlyPackA::FLOAT"_hash);
            break;

        case MatrixMulImpl::AlgoBase::PackMode::NO_PACK:
            cb1(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_float32,
                dt_float32, PostprocessMode::FLOAT, "NoPack::FLOAT"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int16,
                dt_int16, dt_int8, dt_int16, dt_int16,
                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x16_INT16"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int32,
                dt_int32, dt_int8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x32_INT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
                dt_int32, PostprocessMode::NO_PROCESS,
                "NoPack::QINT8x8x32_QINT32"_hash);
            break;

        default:
            megdnn_throw("Invalid Pack Mode");
            break;
    }
#undef cb1
#undef cb2
    megdnn_throw("Invalid Data Type");
    return nullptr;
}
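//! NOTE (review sketch): callers never invoke create_conv1x1_strategy()
//! directly; it is reached through the cached factory, e.g. in
//! dispatch_kerns in algos.cpp:
#if 0
Conv1x1StrategyBase* strategy = Conv1x1Factory::make_conv1x1_strategy(
        param, m_matmul_algo->packmode(), opr->param().format);
#endif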
class StrategyDelegationStorage {
public:
    Conv1x1StrategyBase* get(const ConvBiasImpl::NCBKernSizeParam& param,
                             MatrixMulImpl::AlgoBase::PackMode pack_mode,
                             param::ConvBias::Format format) {
        MEGDNN_LOCK_GUARD(m_mtx);
        StrategyHashParam sparam;
        sparam.param = param;
        sparam.format = format;
        sparam.packmode = pack_mode;
        if (m_map_strategies.find(sparam) == m_map_strategies.end()) {
            auto strategy = create_conv1x1_strategy(param, pack_mode, format);
            m_map_strategies[sparam] = std::move(strategy);
        }
        return m_map_strategies[sparam].get();
    }

private:
    std::mutex m_mtx;
    std::unordered_map<StrategyHashParam, std::unique_ptr<Conv1x1StrategyBase>,
                       StrategyHashParamHash, StrategyHashParamEqual>
            m_map_strategies;
};

}  // anonymous namespace

Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
        const ConvBiasImpl::NCBKernSizeParam& param,
        MatrixMulImpl::AlgoBase::PackMode pack_mode,
        param::ConvBias::Format format) {
    static StrategyDelegationStorage storage;
    return storage.get(param, pack_mode, format);
}

}  // namespace conv1x1
}  // namespace fallback
}  // namespace megdnn
@@ -0,0 +1,310 @@
/**
 * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include "megdnn/opr_param_defs.h"
#include "src/fallback/conv_bias/opr_impl.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif

namespace megdnn {
namespace fallback {
namespace conv1x1 {

#if MEGDNN_X86
using namespace x86;
#endif

namespace {

//! get_matmul_kern_param
MatrixMulImpl::KernSizeParam get_matmul_kern_param(
        const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) {
    size_t M = m;
    size_t N = n;
    size_t K = param.filter_meta.icpg;  //! K = IC
    size_t LDA = K, LDB = N, LDC = N;
    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
    return {param.filter_type,
            param.src_type,
            is_dst_8bit ? param.bias_type : param.dst_type,
            M,
            N,
            K,
            LDA,
            LDB,
            LDC,
            false,
            false,
            param::MatrixMul::ComputeMode::DEFAULT,
            param::MatrixMul::Format::DEFAULT};
}

}  // namespace
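//! NOTE (review sketch): a 1x1 stride-1 convolution over one (batch, group)
//! is exactly C[M,N] = A[M,K] * B[K,N] with M = OC (tile), K = IC and
//! N = OH * OW, which is what the param above encodes; a naive float
//! reference of that identity for illustration only:
#if 0
void conv1x1_as_gemm_ref(const float* filter /*[M][K]*/,
                         const float* src /*[K][N]*/, float* dst /*[M][N]*/,
                         size_t M, size_t K, size_t N) {
    for (size_t m = 0; m < M; ++m)
        for (size_t n = 0; n < N; ++n) {
            float acc = 0.f;
            for (size_t k = 0; k < K; ++k)
                acc += filter[m * K + k] * src[k * N + n];
            dst[m * N + n] = acc;
        }
}
#endif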
class Conv1x1StrategyBase {
public:
    virtual void packA(WorkspaceBundle& whole_bundle,
                       WorkspaceBundle& matmul_bundle, size_t oc_tile_size,
                       const MatrixMulImpl::AlgoBase* matmul_algo,
                       const ConvBiasImpl::NCBKernSizeParam& param,
                       const ConvBiasImpl::NCBKernParam& ncb_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;

    virtual void packB(WorkspaceBundle& whole_bundle,
                       WorkspaceBundle& matmul_bundle,
                       const MatrixMulImpl::AlgoBase* matmul_algo,
                       const ConvBiasImpl::NCBKernSizeParam& param,
                       const ConvBiasImpl::NCBKernParam& ncb_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;

    virtual void exec(WorkspaceBundle& whole_bundle,
                      WorkspaceBundle& matmul_bundle,
                      WorkspaceBundle& thread_bundle, size_t oc_tile_size,
                      const MatrixMulImpl::AlgoBase* matmul_algo,
                      const ConvBiasImpl::NCBKernSizeParam& param,
                      const ConvBiasImpl::NCBKernParam& ncb_param,
                      const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;

    virtual ~Conv1x1StrategyBase() = default;
};
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode,
          MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Strategy : public Conv1x1StrategyBase {
public:
    void packA(WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
               size_t oc_tile_size, const MatrixMulImpl::AlgoBase* matmul_algo,
               const ConvBiasImpl::NCBKernSizeParam& param,
               const ConvBiasImpl::NCBKernParam& ncb_param,
               const ConvBiasImpl::NCBKernIndex& ncb_index) override {
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
            megdnn_log_error("NoPack mode has no packA kernel");
            return;
        }
        whole_bundle.set(ncb_param.workspace_ptr);

        //! packa size per group
        size_t OC = param.filter_meta.ocpg;
        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
        size_t packa_bytes_per_group =
                oc_tiles_per_group * packa_bytes_per_oc_tile;

        size_t group_id = ncb_index.ndrange_id[0];
        size_t oc_tile_id_in_group = ncb_index.ndrange_id[1];
        size_t oc_start = oc_tile_id_in_group * oc_tile_size;
        size_t oc_end = oc_start + oc_tile_size;
        oc_end = (oc_end <= OC ? oc_end : OC);

        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        size_t IC = param.filter_meta.icpg;

        MatrixMulImpl::KernParam matmul_kern_param;
        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
                get_matmul_kern_param(param, OH * OW, oc_end - oc_start);

        size_t bytes_offset_of_a_panel =
                group_id * packa_bytes_per_group +
                oc_tile_id_in_group * packa_bytes_per_oc_tile;
        size_t numbers_offset_of_filter =
                oc_tile_size * IC * oc_tile_id_in_group;

        src_ctype* a_panel = reinterpret_cast<src_ctype*>(
                reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
                bytes_offset_of_a_panel);
        matmul_kern_param.A_ptr = const_cast<src_ctype*>(
                ncb_param.filter<src_ctype>(group_id) +
                numbers_offset_of_filter);
        matmul_algo->pack_A(matmul_kern_param, a_panel, 0, oc_end - oc_start);
    }
    void packB(WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
               const MatrixMulImpl::AlgoBase* matmul_algo,
               const ConvBiasImpl::NCBKernSizeParam& param,
               const ConvBiasImpl::NCBKernParam& ncb_param,
               const ConvBiasImpl::NCBKernIndex& ncb_index) override {
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
            whole_bundle.set(ncb_param.workspace_ptr);

            //! packb size per group
            size_t packb_bytes_per_group = matmul_bundle.get_size(1);

            size_t GROUP = param.filter_meta.group;
            size_t BATCH = param.n;
            size_t SH = param.filter_meta.stride[0];
            size_t SW = param.filter_meta.stride[1];
            size_t OH = param.osz[0];
            size_t OW = param.osz[1];
            size_t OC = param.filter_meta.ocpg;

            MatrixMulImpl::KernParam matmul_kern_param;
            static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
                    get_matmul_kern_param(param, OH * OW, OC);

            rep(batch, BATCH) {
                rep(g, GROUP) {
                    if (SH == 2 && SW == 2)
                        megdnn_throw("no support for stride = 2");
                    size_t bytes_offset_of_b_panel =
                            batch * packb_bytes_per_group * GROUP +
                            g * packb_bytes_per_group;
                    src_ctype* b_panel = reinterpret_cast<src_ctype*>(
                            reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
                            bytes_offset_of_b_panel);
                    matmul_kern_param.B_ptr = const_cast<src_ctype*>(
                            ncb_param.src<src_ctype>(batch, g));
                    matmul_algo->pack_B(matmul_kern_param, b_panel, 0,
                                        OH * OW);
                }
            }
        } else {
            megdnn_log_error(
                    "OnlyPackA and NoPack modes have no packB kernel");
        }
    }
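    //! NOTE (review sketch): packed B panels are laid out batch-major, then
    //! group-major, which is the same offset formula exec() uses to find
    //! them again:
#if 0
    size_t b_panel_offset(size_t batch, size_t group, size_t GROUP,
                          size_t packb_bytes_per_group) {
        return batch * GROUP * packb_bytes_per_group +
               group * packb_bytes_per_group;
    }
#endif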
    void exec(WorkspaceBundle& whole_bundle, WorkspaceBundle& matmul_bundle,
              WorkspaceBundle& thread_bundle, size_t oc_tile_size,
              const MatrixMulImpl::AlgoBase* matmul_algo,
              const ConvBiasImpl::NCBKernSizeParam& param,
              const ConvBiasImpl::NCBKernParam& ncb_param,
              const ConvBiasImpl::NCBKernIndex& ncb_index) override {
        whole_bundle.set(ncb_param.workspace_ptr);

        size_t OC = param.filter_meta.ocpg;
        size_t IC = param.filter_meta.icpg;

        //! packa bytes per group
        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
        size_t packa_bytes_per_group =
                packa_bytes_per_oc_tile * oc_tiles_per_group;

        //! packb bytes per group
        size_t packb_bytes_per_group = matmul_bundle.get_size(1);

        //! matmul bytes per thread
        size_t matmul_bytes_per_thread = thread_bundle.get_size(0);

        size_t batch_id = ncb_index.ndrange_id[0];
        size_t group_id = ncb_index.ndrange_id[1];
        size_t oc_tile_id_in_group = ncb_index.ndrange_id[2];
        size_t thread_id = ncb_index.thread_id;

        size_t GROUP = param.filter_meta.group;
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];

        size_t oc_start = oc_tile_size * oc_tile_id_in_group;
        size_t oc_end = oc_start + oc_tile_size;
        oc_end = (oc_end <= OC ? oc_end : OC);

        MatrixMulImpl::KernParam matmul_kern_param;
        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
                get_matmul_kern_param(param, OH * OW, oc_end - oc_start);

        size_t bytes_offset_of_a_panel =
                group_id * packa_bytes_per_group +
                oc_tile_id_in_group * packa_bytes_per_oc_tile;
        int8_t* a_panel = reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
                          bytes_offset_of_a_panel;

        size_t bytes_offset_of_b_panel =
                batch_id * packb_bytes_per_group * GROUP +
                group_id * packb_bytes_per_group;
        int8_t* b_panel = reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
                          bytes_offset_of_b_panel;

        size_t thread_offset = thread_bundle.total_size_in_bytes() * thread_id;
        size_t bytes_offset_of_matmul_dst_this_thread =
                thread_offset + thread_bundle.get_size(0);
        int8_t* matmul_temp_dst =
                reinterpret_cast<int8_t*>(whole_bundle.get(2)) +
                bytes_offset_of_matmul_dst_this_thread;

        size_t numbers_of_ncb_dst_offset =
                oc_tile_size * OH * OW * oc_tile_id_in_group;
        void* conv_bias_dst = static_cast<void*>(
                ncb_param.dst<dst_ctype>(batch_id, group_id) +
                numbers_of_ncb_dst_offset);

        size_t numbers_of_ncb_filter_offset =
                oc_tile_size * IC * oc_tile_id_in_group;
        matmul_kern_param.A_ptr = const_cast<src_ctype*>(
                ncb_param.filter<src_ctype>(group_id) +
                numbers_of_ncb_filter_offset);
        matmul_kern_param.B_ptr = const_cast<src_ctype*>(
                ncb_param.src<src_ctype>(batch_id, group_id));
        matmul_kern_param.workspace_ptr =
                reinterpret_cast<int8_t*>(whole_bundle.get(2)) + thread_offset;
        matmul_kern_param.workspace_size = matmul_bytes_per_thread;

        bool is_dst_8bit =
                (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
                 param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
                (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
                 param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
        void* matmul_dst = is_dst_8bit ? matmul_temp_dst : conv_bias_dst;
        matmul_kern_param.C_ptr = matmul_dst;

        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
            auto matmul_kern = matmul_algo->get_kern(matmul_kern_param);
            matmul_kern(matmul_kern_param);
        } else {
            auto matmul_kern_naked =
                    matmul_algo->get_kern_naked(matmul_kern_param);
            matmul_kern_naked(matmul_kern_param, a_panel, b_panel);
        }

        //! do postprocess
        void* bias_ptr = nullptr;
        if (param.bias_mode == megdnn::BiasMode::BIAS)
            bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
                    ncb_param.bias<bias_ctype>(batch_id, group_id) +
                    numbers_of_ncb_dst_offset));
        else
            bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
                    ncb_param.bias<bias_ctype>(batch_id, group_id) +
                    oc_start));

        PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
                matmul_dst, bias_ptr, conv_bias_dst, param.bias_mode,
                param.nonlineMode, param.bias_type, param.dst_type, 1_z,
                oc_end - oc_start, OH, OW);
    }
};
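//! NOTE (review sketch): per-thread slices of workspace section 2 as used by
//! exec() above -- matmul temp space first, then the optional 8-bit staging
//! dst (names hypothetical):
#if 0
struct ThreadSlices {
    int8_t* matmul_ws;    // base + thread_offset
    int8_t* staging_dst;  // base + thread_offset + thread_bundle.get_size(0)
};
ThreadSlices slice_thread_ws(int8_t* base, size_t thread_id,
                             size_t per_thread_bytes, size_t matmul_bytes) {
    size_t off = per_thread_bytes * thread_id;
    return {base + off, base + off + matmul_bytes};
}
#endif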
class Conv1x1Factory {
public:
    static Conv1x1StrategyBase* make_conv1x1_strategy(
            const ConvBiasImpl::NCBKernSizeParam& param,
            MatrixMulImpl::AlgoBase::PackMode pack_mode,
            param::ConvBias::Format format);
};

}  // namespace conv1x1
}  // namespace fallback
}  // namespace megdnn
@@ -15,6 +15,7 @@
#include "src/common/opr_delegate.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/algos.h"
#include "src/fallback/conv_bias/conv1x1/algos.h"
#include "src/fallback/conv_bias/im2col/algos.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/naive/convolution/algorithms.h"
@@ -54,7 +55,13 @@ public:
                            ohw_tile_size));
                    all_algos.emplace_back(refhold.back().get());
                }
                for (size_t oc_tile_size : {24, 48}) {
                    refhold.emplace_back(new AlgoConv1x1(
                            static_cast<MatrixMulImpl::AlgoBase*>(algo),
                            oc_tile_size));
                    all_algos.emplace_back(refhold.back().get());
                }
#if 0
//! As these algos maybe very slow, it will make fastrun search slow, so
//! we disable it, but for the test of strategyhelper, we just keep it.
//! FIXME: I do not know a better way to do it.
@@ -248,6 +248,7 @@ protected:
private:
    class AlgoNaive;
    class AlgoIm2col;
    class AlgoConv1x1;
    class AlgoWinogradF32;
    class AlgoWinogradF32_4x4;
    class AlgoWinogradQS8;
@@ -438,7 +438,6 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace(
                   m, n, k, trans_a, trans_b, strategy, cacheline)
            .get_workspace_size();
}
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
        AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9,
        x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16);
@@ -875,6 +875,82 @@ std::vector<conv_bias::TestArg> get_conv_bias_args(
    return args;
}

std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_1x1_args(
        bool no_bias, bool no_nonlinemode, bool quantized_nlmod,
        bool only_broadcast_bias) {
    using namespace conv_bias;
    using Param = param::ConvBias;
    using NLMode = param::ConvBias::NonlineMode;
    using CONVMode = param::ConvBias::Mode;

    std::vector<TestArg> args;
    auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
                    size_t stride, NLMode nlmode, CONVMode convmode) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = 0;
        param.pad_w = 0;
        param.mode = convmode;
        param.nonlineMode = nlmode;

        args.emplace_back(param, TensorShape{n, ic, h, w},
                          TensorShape{oc, ic, 1, 1}, TensorShape{});
        if (!no_bias) {
            args.emplace_back(param, TensorShape{n, ic, h, w},
                              TensorShape{oc, ic, 1, 1},
                              TensorShape{1, oc, 1, 1});
            if (!only_broadcast_bias) {
                args.emplace_back(param, TensorShape{n, ic, h, w},
                                  TensorShape{oc, ic, 1, 1},
                                  TensorShape{n, oc, (h - 1) / stride + 1,
                                              (w - 1) / stride + 1});
            }
        }

        param.sparse = param::ConvBias::Sparse::GROUP;
        args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
                          TensorShape{2, oc, ic, 1, 1}, TensorShape{});
        if (!no_bias) {
            args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
                              TensorShape{2, oc, ic, 1, 1},
                              TensorShape{1, 2 * oc, 1, 1});
            if (!only_broadcast_bias) {
                args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
                                  TensorShape{2, oc, ic, 1, 1},
                                  TensorShape{n, 2 * oc, (h - 1) / stride + 1,
                                              (w - 1) / stride + 1});
            }
        }
    };

    std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
    if (!no_nonlinemode) {
        nonlinemode.emplace_back(NLMode::RELU);
        nonlinemode.emplace_back(NLMode::H_SWISH);
        if (!quantized_nlmod) {
            nonlinemode.emplace_back(NLMode::SIGMOID);
        }
    }

    std::vector<CONVMode> convmodes{param::ConvBias::Mode::CONVOLUTION,
                                    param::ConvBias::Mode::CROSS_CORRELATION};

    for (size_t n : {1, 2})
        for (size_t oc : {1, 9, 33})
            for (size_t ic : {1, 16, 64})
                for (size_t size : {7, 14, 28})
                    for (auto nlmode : nonlinemode)
                        for (auto convmode : convmodes) {
                            pack(n, oc, ic, size, size, 1, nlmode, convmode);
                        }
    return args;
}
void check_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
                     const char* algo_name) {
    using namespace conv_bias;
@@ -76,6 +76,10 @@ std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_args(
        bool no_nonlinemode, bool quantized_nlmod = false,
        bool only_broadcast_bias = false);

std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_1x1_args(
        bool no_bias, bool no_nonlinemode, bool quantized_nlmod = false,
        bool only_broadcast_bias = false);

void check_conv_bias(std::vector<megdnn::test::conv_bias::TestArg> args,
                     megdnn::Handle* handle, const char* algo_name);
@@ -919,6 +919,79 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
#undef cb
}

/**************************** Conv1x1 PackA *************************/
namespace {
void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
                       RNG* rng, float epsilon, DType type0, DType type1,
                       DType type2, DType type3, const char* algo_name) {
    using namespace conv_bias;

    Checker<ConvBias> checker(handle);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
    checker.set_dtype(0, type0);
    checker.set_dtype(1, type1);
    checker.set_dtype(2, type2);
    checker.set_dtype(4, type3);
    checker.set_epsilon(epsilon);
    if (NULL != rng) {
        checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
    }
    for (auto&& arg : args) {
        checker.set_param(arg.param).execs(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}
}  // namespace

#if MEGDNN_X86_WITH_MKL
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
    check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
}

TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
    using namespace conv_bias;
    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
    check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
}
#endif

TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
    using namespace conv_bias;
    UniformIntRNG rng{-50, 50};
    float epsilon = 0.001;
    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
#if MEGDNN_X86_WITH_MKL_DNN
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_MKLDNN:24");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_VNNI:24");
    }
#endif
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
    }
    checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                      dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                      "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
}
/************************* End Conv1x1 PackA ************************/
#endif

TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {