| @@ -67,8 +67,7 @@ public: | |||
| } | |||
| auto&& fm = param.filter_meta; | |||
| auto OC = fm.ocpg, IC = fm.icpg; | |||
| return (fm.spatial[0] == fm.spatial[1] && fm.spatial[0] == 1) || | |||
| OC >= 32 || IC >= 32; | |||
| return OC >= 32 || IC >= 32; | |||
| } | |||
| private: | |||
| @@ -0,0 +1,473 @@ | |||
/**
 * \file dnn/src/fallback/conv_bias/im2col/factory.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
| #pragma once | |||
| #include <unordered_map> | |||
| #include "src/fallback/conv_bias/im2col/strategy_base.h" | |||
| #include "src/fallback/conv_bias/opr_impl.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_fallback_im2col_factory_make_strategy) | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| namespace im2col { | |||
| enum class StrategyType : uint32_t { | |||
| FLOAT = 0, | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| FLOAT_FP16 = 1, | |||
| #else | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| FLOAT16_FLOAT16 = 2, | |||
| #endif | |||
| #endif | |||
| INT8x8x32 = 3, | |||
| INT8x8x16 = 4, | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| QUINT8x8x32 = 5, | |||
| QUINT8x8x32x8 = 6, | |||
| #endif | |||
| QINT8x8x32 = 7, | |||
| QINT8x8x32x8 = 8 | |||
| }; | |||
| struct StrategyHashParam { | |||
| fallback::ConvBiasImpl::NCBKernSizeParam param; | |||
| param::ConvBias::Format format; | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode packmode; | |||
| size_t block_m; | |||
| size_t block_n; | |||
| size_t block_k; | |||
| }; | |||
| struct StrategyHashParamHash { | |||
| std::size_t operator()(const StrategyHashParam& sparam) const { | |||
| constexpr size_t base = 1; //! avoid hashkey is zero | |||
| std::size_t result = | |||
| static_cast<std::size_t>(sparam.param.src_type.enumv()) + base; | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.param.dst_type.enumv()) + | |||
| base) | |||
| << 3); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.param.filter_type.enumv()) + | |||
| base) | |||
| << 6); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.param.bias_type.enumv()) + | |||
| base) | |||
| << 9); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.format) + base) << 12); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.packmode) + base) << 15); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.block_m) + base) << 18); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.block_n) + base) << 22); | |||
| result = result ^ | |||
| ((static_cast<std::size_t>(sparam.block_k) + base) << 26); | |||
| return result; | |||
| }; | |||
| }; | |||
| struct StrategyHashParamEqual { | |||
| std::size_t operator()(const StrategyHashParam& param1, | |||
| const StrategyHashParam& param2) const { | |||
| bool flags = true; | |||
| flags = param1.param.src_type == param2.param.src_type && flags; | |||
| flags = param1.param.filter_type == param2.param.filter_type && flags; | |||
| flags = param1.param.bias_type == param2.param.bias_type && flags; | |||
| flags = param1.param.dst_type == param2.param.dst_type && flags; | |||
| flags = param1.format == param2.format && flags; | |||
| flags = param1.packmode == param2.packmode && flags; | |||
| flags = param1.block_m == param2.block_m && flags; | |||
| flags = param1.block_n == param2.block_n && flags; | |||
| flags = param1.block_k == param2.block_k && flags; | |||
| return flags; | |||
| }; | |||
| }; | |||
| class StrategyDelegationStorage { | |||
| std::mutex m_mtx; | |||
| std::unordered_map<StrategyHashParam, std::unique_ptr<StrategyBase>, | |||
| StrategyHashParamHash, StrategyHashParamEqual> | |||
| map_strategys; | |||
| public: | |||
| ~StrategyDelegationStorage() = default; | |||
| template <typename Strategy> | |||
| Strategy* get(param::ConvBias::Format format, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| StrategyType stype); | |||
| }; | |||
| class Factory { | |||
| public: | |||
| static StrategyBase* get_im2col_strategy( | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| param::ConvBias::Format format) { | |||
| static StrategyDelegationStorage storage; | |||
| StrategyType strategytype = get_strategy_type(param); | |||
| return storage.get<StrategyBase>(format, matmul_algo, param, | |||
| strategytype); | |||
| } | |||
| static StrategyType get_strategy_type( | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param) { | |||
| #define cb1(_dt, _post_ctype, _strategytype) \ | |||
| if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \ | |||
| return _strategytype; \ | |||
| } | |||
| #define cb2(_i_src_type, _i_bias_type, _i_dst_type, _src_ctype, _bias_ctype, \ | |||
| _dst_ctype, _strategytype) \ | |||
| if (param.filter_type.enumv() == param.src_type.enumv() && \ | |||
| param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \ | |||
| param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \ | |||
| return _strategytype; \ | |||
| } | |||
| cb1(dt_float32, dt_float32, StrategyType::FLOAT); | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| cb1(dt_float16, __fp16, StrategyType::FLOAT_FP16); | |||
| #else | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| cb1(dt_float16, dt_float16, StrategyType::FLOAT16_FLOAT16); | |||
| #endif | |||
| #endif | |||
| cb2(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32, | |||
| StrategyType::INT8x8x32); | |||
| cb2(dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, dt_int16, | |||
| StrategyType::INT8x8x16); | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| cb2(dtype::Quantized8Asymm, dtype::QuantizedS32, dtype::QuantizedS32, | |||
| dt_uint8, dt_int32, dt_int32, StrategyType::QUINT8x8x32); | |||
| cb2(dtype::Quantized8Asymm, dtype::QuantizedS32, dtype::Quantized8Asymm, | |||
| dt_uint8, dt_int32, dt_uint8, StrategyType::QUINT8x8x32x8); | |||
| #endif | |||
| cb2(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS32, | |||
| dt_int8, dt_int32, dt_int32, StrategyType::QINT8x8x32); | |||
| cb2(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS8, | |||
| dt_int8, dt_int32, dt_int8, StrategyType::QINT8x8x32x8); | |||
| #undef cb1 | |||
| #undef cb2 | |||
| megdnn_throw("not support datatype in im2col strategy\n"); | |||
| } | |||
| #define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag) \ | |||
| MIDOUT_BEGIN(megdnn_fallback_im2col_factory_make_strategy, \ | |||
| midout_iv(_midout_tag)) { \ | |||
| if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \ | |||
| return std::make_unique< \ | |||
| Strategy<_dt, _dt, _dt, _post_ctype, _post_ctype, \ | |||
| _postprocess_mode, PackMode::_packmode>>(); \ | |||
| } \ | |||
| } \ | |||
| MIDOUT_END(); \ | |||
| return {}; | |||
| #define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \ | |||
| _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag) \ | |||
| MIDOUT_BEGIN(megdnn_fallback_im2col_factory_make_strategy, \ | |||
| midout_iv(_midout_tag)) { \ | |||
| if (param.filter_type.enumv() == param.src_type.enumv() && \ | |||
| param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \ | |||
| param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \ | |||
| return std::make_unique< \ | |||
| Strategy<_src_ctype, _bias_ctype, _dst_ctype, \ | |||
| DTypeTrait<_i_bias_type>::ctype, \ | |||
| DTypeTrait<_i_dst_type>::ctype, \ | |||
| _postprocess_mode, PackMode::_packmode>>(); \ | |||
| } \ | |||
| } \ | |||
| MIDOUT_END(); \ | |||
| return {}; | |||
| static std::unique_ptr<StrategyBase> make_default_strategy( | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| param::ConvBias::Format format, StrategyType strategytype) { | |||
| MEGDNN_MARK_USED_VAR(matmul_algo); | |||
| MEGDNN_MARK_USED_VAR(format); | |||
| switch (strategytype) { | |||
| case StrategyType::FLOAT: | |||
| cb1(DEFAULT, dt_float32, dt_float32, PostprocessMode::FLOAT, | |||
| "DefaultStrategyType::FLOAT"_hash); | |||
| break; | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| case StrategyType::FLOAT_FP16: | |||
| cb1(DEFAULT, dt_float16, __fp16, PostprocessMode::FLOAT, | |||
| "DefaultStrategyType::FLOAT_FP16"_hash); | |||
| break; | |||
| #else | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| case StrategyType::FLOAT16_FLOAT16: | |||
| cb1(DEFAULT, dt_float16, dt_float16, | |||
| PostprocessMode::NO_PROCESS, | |||
| "DefaultStrategyType::FLOAT16_FLOAT16"_hash); | |||
| break; | |||
| #endif | |||
| #endif | |||
| case StrategyType::INT8x8x32: | |||
| cb2(DEFAULT, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, | |||
| dt_int32, PostprocessMode::NO_PROCESS, | |||
| "DefaultStrategyType::INT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::INT8x8x16: | |||
| cb2(DEFAULT, dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, | |||
| dt_int16, PostprocessMode::NO_PROCESS, | |||
| "DefaultStrategyType::INT8x8x16"_hash); | |||
| break; | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| case StrategyType::QUINT8x8x32: | |||
| cb2(DEFAULT, dtype::Quantized8Asymm, dtype::QuantizedS32, | |||
| dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32, | |||
| PostprocessMode::NO_PROCESS, | |||
| "DefaultStrategyType::QUINT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::QUINT8x8x32x8: | |||
| cb2(DEFAULT, dtype::Quantized8Asymm, dtype::QuantizedS32, | |||
| dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8, | |||
| PostprocessMode::QUANTIZED, | |||
| "DefaultStrategyType::QUINT8x8x32x8"_hash); | |||
| break; | |||
| #endif | |||
| case StrategyType::QINT8x8x32: | |||
| cb2(DEFAULT, dtype::QuantizedS8, dtype::QuantizedS32, | |||
| dtype::QuantizedS32, dt_int8, dt_int32, dt_int32, | |||
| PostprocessMode::NO_PROCESS, | |||
| "DefaultStrategyType::QINT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::QINT8x8x32x8: | |||
| cb2(DEFAULT, dtype::QuantizedS8, dtype::QuantizedS32, | |||
| dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, | |||
| PostprocessMode::QUANTIZED, | |||
| "DefaultStrategyType::QINT8x8x32x8"_hash); | |||
| break; | |||
| } | |||
| megdnn_throw("error not support strategy type "); | |||
| } | |||
| static std::unique_ptr<StrategyBase> make_nopack_strategy( | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| param::ConvBias::Format format, StrategyType strategytype) { | |||
| MEGDNN_MARK_USED_VAR(matmul_algo); | |||
| MEGDNN_MARK_USED_VAR(format); | |||
| switch (strategytype) { | |||
| case StrategyType::FLOAT: | |||
| cb1(NO_PACK, dt_float32, dt_float32, PostprocessMode::FLOAT, | |||
| "NoPackStrategyType::FLOAT"_hash); | |||
| break; | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| case StrategyType::FLOAT_FP16: | |||
| cb1(NO_PACK, dt_float16, __fp16, PostprocessMode::FLOAT, | |||
| "NoPackStrategyType::FLOAT_FP16"_hash); | |||
| break; | |||
| #else | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| case StrategyType::FLOAT16_FLOAT16: | |||
| cb1(NO_PACK, dt_float16, dt_float16, PostprocessMode::NO_PROCESS, | |||
| "NoPackStrategyType::FLOAT16_FLOAT16"_hash); | |||
| break; | |||
| #endif | |||
| #endif | |||
| case StrategyType::INT8x8x32: | |||
| cb2(NO_PACK, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, | |||
| dt_int32, PostprocessMode::NO_PROCESS, | |||
| "NoPackStrategyType::INT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::INT8x8x16: | |||
| cb2(NO_PACK, dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, | |||
| dt_int16, PostprocessMode::NO_PROCESS, | |||
| "NoPackStrategyType::INT8x8x16"_hash); | |||
| break; | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| case StrategyType::QUINT8x8x32: | |||
| cb2(NO_PACK, dtype::Quantized8Asymm, dtype::QuantizedS32, | |||
| dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32, | |||
| PostprocessMode::NO_PROCESS, | |||
| "NoPackStrategyType::QUINT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::QUINT8x8x32x8: | |||
| cb2(NO_PACK, dtype::Quantized8Asymm, dtype::QuantizedS32, | |||
| dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8, | |||
| PostprocessMode::QUANTIZED, | |||
| "NoPackStrategyType::QUINT8x8x32x8"_hash); | |||
| break; | |||
| #endif | |||
| case StrategyType::QINT8x8x32: | |||
| cb2(NO_PACK, dtype::QuantizedS8, dtype::QuantizedS32, | |||
| dtype::QuantizedS32, dt_int8, dt_int32, dt_int32, | |||
| PostprocessMode::NO_PROCESS, | |||
| "NoPackStrategyType::QINT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::QINT8x8x32x8: | |||
| cb2(NO_PACK, dtype::QuantizedS8, dtype::QuantizedS32, | |||
| dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, | |||
| PostprocessMode::QUANTIZED, | |||
| "NoPackStrategyType::QINT8x8x32x8"_hash); | |||
| break; | |||
| } | |||
| megdnn_throw("error not support strategy type "); | |||
| } | |||
| static std::unique_ptr<StrategyBase> make_onlypacka_strategy( | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| param::ConvBias::Format format, StrategyType strategytype) { | |||
| MEGDNN_MARK_USED_VAR(matmul_algo); | |||
| MEGDNN_MARK_USED_VAR(format); | |||
| switch (strategytype) { | |||
| case StrategyType::FLOAT: | |||
| cb1(ONLY_PACKA, dt_float32, dt_float32, PostprocessMode::FLOAT, | |||
| "OnlyPackaStrategyType::FLOAT"_hash); | |||
| break; | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| case StrategyType::FLOAT_FP16: | |||
| cb1(ONLY_PACKA, dt_float16, __fp16, PostprocessMode::FLOAT, | |||
| "OnlyPackaStrategyType::FLOAT_FP16"_hash); | |||
| break; | |||
| #else | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| case StrategyType::FLOAT16_FLOAT16: | |||
| cb1(ONLY_PACKA, dt_float16, dt_float16, | |||
| PostprocessMode::NO_PROCESS, | |||
| "OnlyPackaStrategyType::FLOAT16_FLOAT16"_hash); | |||
| break; | |||
| #endif | |||
| #endif | |||
| case StrategyType::INT8x8x32: | |||
| cb2(ONLY_PACKA, dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, | |||
| dt_int32, PostprocessMode::NO_PROCESS, | |||
| "OnlyPackaStrategyType::INT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::INT8x8x16: | |||
| cb2(ONLY_PACKA, dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, | |||
| dt_int16, PostprocessMode::NO_PROCESS, | |||
| "OnlyPackaStrategyType::INT8x8x16"_hash); | |||
| break; | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| case StrategyType::QUINT8x8x32: | |||
| cb2(ONLY_PACKA, dtype::Quantized8Asymm, dtype::QuantizedS32, | |||
| dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32, | |||
| PostprocessMode::NO_PROCESS, | |||
| "OnlyPackaStrategyType::QUINT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::QUINT8x8x32x8: | |||
| cb2(ONLY_PACKA, dtype::Quantized8Asymm, dtype::QuantizedS32, | |||
| dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8, | |||
| PostprocessMode::QUANTIZED, | |||
| "OnlyPackaStrategyType::QUINT8x8x32x8"_hash); | |||
| break; | |||
| #endif | |||
| case StrategyType::QINT8x8x32: | |||
| cb2(ONLY_PACKA, dtype::QuantizedS8, dtype::QuantizedS32, | |||
| dtype::QuantizedS32, dt_int8, dt_int32, dt_int32, | |||
| PostprocessMode::NO_PROCESS, | |||
| "OnlyPackaStrategyType::QINT8x8x32"_hash); | |||
| break; | |||
| case StrategyType::QINT8x8x32x8: | |||
| cb2(ONLY_PACKA, dtype::QuantizedS8, dtype::QuantizedS32, | |||
| dtype::QuantizedS8, dt_int8, dt_int32, dt_int8, | |||
| PostprocessMode::QUANTIZED, | |||
| "OnlyPackaStrategyType::QINT8x8x32x8"_hash); | |||
| break; | |||
| } | |||
| megdnn_throw("error not support strategy type "); | |||
| } | |||
| #undef cb1 | |||
| #undef cb2 | |||
| static std::unique_ptr<StrategyBase> make_strategy( | |||
| param::ConvBias::Format format, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode packmode, | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| StrategyType stype) { | |||
| switch (packmode) { | |||
| case MatrixMulImpl::AlgoBase::PackMode::DEFAULT: | |||
| return make_default_strategy(matmul_algo, param, format, stype); | |||
| break; | |||
| case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA: | |||
| return make_onlypacka_strategy(matmul_algo, param, format, | |||
| stype); | |||
| break; | |||
| case MatrixMulImpl::AlgoBase::PackMode::NO_PACK: | |||
| return make_nopack_strategy(matmul_algo, param, format, stype); | |||
| break; | |||
| default: | |||
| megdnn_throw( | |||
| "not support packmode except default onlypackA " | |||
| "nopack"); | |||
| break; | |||
| } | |||
| megdnn_throw( | |||
| "factory make Strategy error please check your code"); | |||
| } | |||
| }; | |||
| template <typename Strategy> | |||
| Strategy* StrategyDelegationStorage::get( | |||
| param::ConvBias::Format format, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernSizeParam& param, | |||
| StrategyType stype) { | |||
| fallback::MatrixMulImpl::AlgoBase::PackMode packmode = | |||
| matmul_algo->packmode(); | |||
| //! nopack mode block_m block_n block_k is zero | |||
| size_t block_m = 0, block_n = 0, block_k = 0; | |||
| if (packmode == fallback::MatrixMulImpl::AlgoBase::PackMode::DEFAULT || | |||
| packmode == fallback::MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) { | |||
| block_m = matmul_algo->get_inner_block_size().m; | |||
| block_n = matmul_algo->get_inner_block_size().n; | |||
| block_k = matmul_algo->get_inner_block_size().k; | |||
| } | |||
| StrategyHashParam sparam; | |||
| sparam.param = param; | |||
| sparam.format = format; | |||
| sparam.packmode = packmode; | |||
| sparam.block_m = block_m; | |||
| sparam.block_n = block_n; | |||
| sparam.block_k = block_k; | |||
| if (map_strategys.find(sparam) == map_strategys.end()) { | |||
| MEGDNN_LOCK_GUARD(m_mtx); | |||
| auto strategy = Factory::make_strategy(format, matmul_algo, packmode, | |||
| param, stype); | |||
| map_strategys[sparam] = std::move(strategy); | |||
| } | |||
| return static_cast<Strategy*>(map_strategys[sparam].get()); | |||
| } | |||
| } // namespace im2col | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,259 @@ | |||
/**
 * \file dnn/src/fallback/conv_bias/im2col/strategy_base.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
| #pragma once | |||
| #include "src/fallback/conv_bias/opr_impl.h" | |||
| namespace megdnn { | |||
| using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode; | |||
| struct StrategyParam { | |||
| size_t batch_id; | |||
| size_t group_id; | |||
| size_t oc_tile_size; | |||
| size_t oc_cur_index; | |||
| size_t oc_end_index; | |||
| size_t ohw_cur_index; | |||
| size_t output_block_size; | |||
| size_t output_block_oc_size; | |||
| size_t ohw; | |||
| size_t block_m; | |||
| size_t block_n; | |||
| size_t block_k; | |||
| bool skip_copy_dst; | |||
| bool is_dst_8bit; | |||
| bool is_ohw_size_bigger; | |||
| }; | |||
| class StrategyBase { | |||
| public: | |||
| StrategyBase() = default; | |||
| virtual ~StrategyBase() = default; | |||
| virtual void copy_padding_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0; | |||
| virtual void packA_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0; | |||
| virtual void exec_im2col( | |||
| WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||
| const StrategyParam& sparam, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo) = 0; | |||
| virtual void exec_matmul( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, WorkspaceBundle bundle, | |||
| WorkspaceBundle bundle_thread, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) = 0; | |||
| virtual void exec_postprocess( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, WorkspaceBundle bundle_thread) = 0; | |||
| }; | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode, PackMode packmode> | |||
| class Strategy; | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| class Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode, PackMode::DEFAULT> : public StrategyBase { | |||
| public: | |||
| constexpr static size_t BUNDLE_PADDING_INDEX = 0; | |||
| constexpr static size_t BUNDLE_PACKA_INDEX = 1; | |||
| constexpr static size_t THREAD_BUNDLE_PACKB_INDEX = 0; | |||
| constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | |||
| constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | |||
| Strategy(); | |||
| void copy_padding_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void packA_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||
| const StrategyParam& sparam, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||
| void exec_matmul( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, WorkspaceBundle bundle, | |||
| WorkspaceBundle bundle_thread, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, | |||
| WorkspaceBundle bundle_thread) override; | |||
| void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const void* matmul_dst, const StrategyParam& sparam); | |||
| void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| WorkspaceBundle bundle_thread, const StrategyParam& sparam); | |||
| void* get_bias_temp_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread); | |||
| void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread, | |||
| const StrategyParam& sparam); | |||
| }; | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| class Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode, PackMode::NO_PACK> : public StrategyBase { | |||
| public: | |||
| constexpr static size_t BUNDLE_PADDING_INDEX = 0; | |||
| constexpr static size_t BUNDLE_PACKA_INDEX = 1; | |||
| constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 0; | |||
| constexpr static size_t THREAD_BUNDLE_MATMULDST_INDEX = 1; | |||
| constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 2; | |||
| constexpr static size_t THREAD_BUNDLE_MATCOMP_INDEX = 3; | |||
| Strategy(); | |||
| void copy_padding_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void packA_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void exec_matmul( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, WorkspaceBundle bundle, | |||
| WorkspaceBundle bundle_thread, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread, | |||
| const StrategyParam& sparam); | |||
| inline void* get_bias_temp_ptr( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread) { | |||
| bias_ctype* bias_tmp_ptr = | |||
| param.bias_mode == megdnn::BiasMode::BIAS | |||
| ? static_cast<bias_ctype*>( | |||
| bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX)) | |||
| : nullptr; | |||
| return bias_tmp_ptr; | |||
| } | |||
| void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||
| const StrategyParam& sparam, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||
| void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, | |||
| WorkspaceBundle bundle_thread) override; | |||
| void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const void* matmul_dst, const StrategyParam& sparam); | |||
| void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| WorkspaceBundle bundle_thread, const StrategyParam& sparam); | |||
| }; | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| class Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode, PackMode::ONLY_PACKA> : public StrategyBase { | |||
| public: | |||
| constexpr static size_t BUNDLE_PADDING_INDEX = 0; | |||
| constexpr static size_t BUNDLE_PACKA_INDEX = 1; | |||
| constexpr static size_t THREAD_BUNDLE_PACKB_INDEX = 0; | |||
| constexpr static size_t THREAD_BUNDLE_IM2COL_INDEX = 1; | |||
| constexpr static size_t THREAD_BUNDLE_MATMULDST_INDEX = 2; | |||
| constexpr static size_t THREAD_BUNDLE_BIAS_INDEX = 3; | |||
| Strategy(); | |||
| void copy_padding_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void packA_kern( | |||
| WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread, | |||
| const StrategyParam& sparam, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo) override; | |||
| void exec_matmul( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, WorkspaceBundle bundle, | |||
| WorkspaceBundle bundle_thread, | |||
| fallback::MatrixMulImpl::KernParam matmul_param, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) override; | |||
| void* get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread, | |||
| const StrategyParam& sparam); | |||
| inline void* get_bias_temp_ptr( | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread) { | |||
| bias_ctype* bias_tmp_ptr = | |||
| param.bias_mode == megdnn::BiasMode::BIAS | |||
| ? static_cast<bias_ctype*>( | |||
| bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX)) | |||
| : nullptr; | |||
| return bias_tmp_ptr; | |||
| } | |||
| void exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, | |||
| WorkspaceBundle bundle_thread) override; | |||
| void copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const void* matmul_dst, const StrategyParam& sparam); | |||
| void copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| WorkspaceBundle bundle_thread, const StrategyParam& sparam); | |||
| }; | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,379 @@ | |||
/**
 * \file dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/fallback/conv_bias/im2col/strategy_base.h" | |||
| #include "src/fallback/convolution/img2col_helper.h" | |||
| #if MEGDNN_X86 | |||
| #include "src/x86/conv_bias/postprocess_helper.h" | |||
| #endif | |||
| using namespace megdnn; | |||
| #if MEGDNN_X86 | |||
| using namespace x86; | |||
| #endif | |||
| namespace megdnn { | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::DEFAULT>::Strategy() | |||
| : StrategyBase() {} | |||
//! Copy one input channel of one (batch, group) into the padded workspace:
//! the destination is an (IH + 2*PH) x (IW + 2*PW) image whose border is
//! filled with the source zero point (non-zero only for Quantized8Asymm).
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::DEFAULT>::
        copy_padding_kern(
                WorkspaceBundle bundle,
                const fallback::ConvBiasImpl::NCBKernParam& param,
                const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    //! only IH/IW/IC and the paddings are needed here; silence the rest
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OC);
    MEGDNN_MARK_USED_VAR(OH);
    MEGDNN_MARK_USED_VAR(OW);
    MEGDNN_MARK_USED_VAR(FH);
    MEGDNN_MARK_USED_VAR(FW);
    MEGDNN_MARK_USED_VAR(SH);
    MEGDNN_MARK_USED_VAR(SW);
    size_t IW2 = IW + 2 * PW;  //!< padded input width
    size_t IH2 = IH + 2 * PH;  //!< padded input height
    //! ndrange layout is (batch, group, channel)
    size_t batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    size_t channel_id = ncb_index.ndrange_id[2];
    size_t padding_group_size = IH2 * IW2 * IC;
    size_t workspace_channel_offset = IH2 * IW2 * channel_id;
    size_t workspace_group_offset = group_id * padding_group_size;
    size_t workspace_batch_offset =
            param.filter_meta.group * batch_id * padding_group_size;
    bundle.set(param.workspace_ptr);
    //! padding value: zero point for asymmetric quantized input, 0 otherwise
    src_ctype src_zp = static_cast<src_ctype>(0);
    if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
        src_zp = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
    }
    src_ctype* src = const_cast<src_ctype*>(
            param.src<src_ctype>(batch_id, group_id, channel_id));
    src_ctype* src2;
    src2 = static_cast<src_ctype*>(bundle.get(BUNDLE_PADDING_INDEX)) +
           workspace_group_offset + workspace_batch_offset +
           workspace_channel_offset;
    src_ctype* src2_ptr = src2;
    const src_ctype* src_ptr = src;
    //! top border rows; memset fills per byte, which is only exact because a
    //! non-zero src_zp occurs solely for 1-byte Quantized8Asymm elements
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
    //! each source row: left border, payload, right border
    rep(ih, IH) {
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
        std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW);
        src2_ptr += IW;
        src_ptr += IW;
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
    }
    //! bottom border rows
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
}
//! Pre-pack one inner m-block of the filter (matrix A) of one group into the
//! shared workspace so that the packed panel can be reused by every output
//! tile of that group.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::DEFAULT>::
        packA_kern(WorkspaceBundle bundle,
                   const fallback::ConvBiasImpl::NCBKernParam& param,
                   fallback::MatrixMulImpl::KernSizeParam matmulparam,
                   fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    bundle.set(param.workspace_ptr);
    fallback::MatrixMulImpl::KernParam matmul_param;
    //! ndrange layout is (group, m-block)
    size_t group_id = ncb_index.ndrange_id[0];
    //! copy only the size part into the kern param; pointers are set below
    static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
            matmulparam;
    //! slot 0 of the matmul bundle is the per-group packed-A area
    size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
    //! bytes of one packed m-block: K rounded up to the inner k-block times
    //! the inner m-block, in packed-A element size
    size_t packed_per_oc_block_size =
            round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) *
            matmul_algo->get_inner_block_size().m *
            matmul_algo->get_packA_type_size();
    size_t a_panel_offset = ncb_index.ndrange_id[1] * packed_per_oc_block_size;
    int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
                      group_id * packA_group_size + a_panel_offset;
    matmul_param.A_ptr =
            const_cast<src_ctype*>(param.filter<src_ctype>(group_id));
    matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[1],
                        matmul_algo->get_inner_block_size().m);
}
//! Run im2col for the current output tile and pack the resulting matrix B
//! into the thread-local pack-B buffer.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::DEFAULT>::
        exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
                    const StrategyParam& sparam,
                    const fallback::ConvBiasImpl::NCBKernParam& param,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo) {
    size_t m_sh = param.filter_meta.stride[0];
    size_t m_sw = param.filter_meta.stride[1];
    size_t m_oc = param.filter_meta.ocpg;
    size_t m_oh = param.osz[0];
    size_t m_ow = param.osz[1];
    size_t m_ic = param.filter_meta.icpg;
    //! input sizes after padding
    size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2;
    size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2;
    size_t m_fh = param.filter_meta.spatial[0];
    size_t m_fw = param.filter_meta.spatial[1];
    //! cross-correlation when the filter is not flipped
    size_t m_is_xcorr = !param.filter_meta.should_flip;
    //! byte offset of this (batch, group) image inside the padding workspace
    size_t input_offset =
            m_ih * m_iw * m_ic *
            (sparam.group_id + param.filter_meta.group * sparam.batch_id) *
            sizeof(src_ctype);
    src_ctype* src2 = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PADDING_INDEX)) +
            input_offset);
    //! with zero padding the original input can be read directly, skipping
    //! the padded copy entirely
    bool is_phpwzero = param.filter_meta.padding[0] == 0 &&
                       param.filter_meta.padding[1] == 0;
    if (is_phpwzero) {
        src2 = const_cast<src_ctype*>(
                param.src<src_ctype>(sparam.batch_id, sparam.group_id));
    }
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    //! stride-1 has a dedicated fast path; the template flag selects
    //! cross-correlation vs true convolution (flipped filter)
    if (m_sh == 1 && m_sw == 1) {
        if (m_is_xcorr) {
            img2col<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                          m_fh, m_fw, sparam.ohw_cur_index,
                          sparam.output_block_size);
        } else {
            img2col<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                           m_fh, m_fw, sparam.ohw_cur_index,
                           sparam.output_block_size);
        }
    } else {
        if (m_is_xcorr) {
            img2col_stride<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih,
                                 m_iw, m_fh, m_fw, m_sh, m_sw,
                                 sparam.ohw_cur_index,
                                 sparam.output_block_size);
        } else {
            img2col_stride<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic,
                                  m_ih, m_iw, m_fh, m_fw, m_sh, m_sw,
                                  sparam.ohw_cur_index,
                                  sparam.output_block_size);
        }
    }
    //! the im2col result is matrix B of the matmul; pack it for this thread
    matmul_param.M = sparam.output_block_oc_size;
    matmul_param.N = sparam.output_block_size;
    matmul_param.LDB = sparam.output_block_size;
    matmul_param.LDC = sparam.output_block_size;
    matmul_param.B_ptr = im2col_dst;
    src_ctype* b_panel =
            reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>(
                    bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX)));
    matmul_algo->pack_B(matmul_param, b_panel, 0, matmul_param.N);
}
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::DEFAULT>:: | |||
| get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread, | |||
| const StrategyParam& sparam) { | |||
| if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) { | |||
| return static_cast<void*>( | |||
| bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX)); | |||
| } else { | |||
| bias_ctype* dst = | |||
| param.dst<bias_ctype>(sparam.batch_id, sparam.group_id) + | |||
| sparam.oc_cur_index * sparam.ohw; | |||
| return static_cast<void*>(dst); | |||
| } | |||
| } | |||
//! Run the packed matmul for one (group, oc-block, output tile):
//! C = packed_A * packed_B, writing to the destination selected by
//! get_matmul_dst_ptr.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::DEFAULT>::
        exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
                    const StrategyParam& sparam, WorkspaceBundle bundle,
                    WorkspaceBundle bundle_thread,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                    const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    //! bytes of one oc-tile's packed A: K rounded up to the inner k-block
    size_t packA_per_oc_block_size =
            round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) *
            sparam.oc_tile_size * matmul_algo->get_packA_type_size();
    size_t packA_group_size = matmul_algo->get_bundle(matmul_param).get_size(0);
    //! NOTE(review): ndrange_id[1] appears to index the group here (packA_kern
    //! uses ndrange_id[0] for the group) and ndrange_id[3] the oc tile --
    //! confirm this matches the dispatcher's ndrange layout for this kern
    size_t a_panel_offset = ncb_index.ndrange_id[1] * packA_group_size +
                            ncb_index.ndrange_id[3] * packA_per_oc_block_size;
    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
    src_ctype* a_panel = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
            a_panel_offset);
    src_ctype* b_panel =
            reinterpret_cast<src_ctype*>(reinterpret_cast<uintptr_t>(
                    bundle_thread.get(THREAD_BUNDLE_PACKB_INDEX)));
    matmul_param.M = sparam.output_block_oc_size;
    matmul_param.N = sparam.output_block_size;
    matmul_param.LDB = sparam.output_block_size;
    matmul_param.LDC = sparam.output_block_size;
    matmul_param.C_ptr = matmul_dst;
    //! "naked" kernel: consumes pre-packed A/B panels directly
    auto matmul_kern_naked = matmul_algo->get_kern_naked(matmul_param);
    matmul_kern_naked(matmul_param, a_panel, b_panel);
}
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::DEFAULT>:: | |||
| exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, | |||
| WorkspaceBundle bundle_thread) { | |||
| copy_bias(param, bundle_thread, sparam); | |||
| void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | |||
| const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( | |||
| param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); | |||
| void* bias_temp_ptr = get_bias_temp_ptr(param, bundle_thread); | |||
| void* bias_preprocess_ptr = const_cast<void*>( | |||
| param.bias_mode == megdnn::BiasMode::BIAS | |||
| ? bias_temp_ptr | |||
| : static_cast<void*>(const_cast<bias_ctype*>( | |||
| bias_ptr + sparam.oc_cur_index))); | |||
| PostProcess<op_ctype, op_dtype, postprocess_mode>::run( | |||
| matmul_dst, bias_preprocess_ptr, matmul_dst, param.bias_mode, | |||
| param.nonlineMode, param.bias_type, param.dst_type, 1_z, | |||
| sparam.output_block_oc_size, 1_z, sparam.output_block_size); | |||
| copy_dst(param, matmul_dst, sparam); | |||
| } | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::DEFAULT>:: | |||
| copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const void* matmul_dst, const StrategyParam& sparam) { | |||
| if (!sparam.skip_copy_dst) { | |||
| dst_ctype* dst_tmp_ptr = | |||
| reinterpret_cast<dst_ctype*>(const_cast<void*>(matmul_dst)); | |||
| dst_ctype* dst = | |||
| param.dst<dst_ctype>(sparam.batch_id, sparam.group_id) + | |||
| sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index; | |||
| for (size_t oc = 0; oc < sparam.output_block_oc_size; oc++) { | |||
| std::memcpy(dst, dst_tmp_ptr, | |||
| sizeof(dst_ctype) * sparam.output_block_size); | |||
| dst_tmp_ptr += sparam.output_block_size; | |||
| dst += sparam.ohw; | |||
| } | |||
| } | |||
| } | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::DEFAULT>:: | |||
| get_bias_temp_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread) { | |||
| bias_ctype* bias_tmp_ptr = | |||
| param.bias_mode == megdnn::BiasMode::BIAS | |||
| ? static_cast<bias_ctype*>( | |||
| bundle_thread.get(THREAD_BUNDLE_BIAS_INDEX)) | |||
| : nullptr; | |||
| return bias_tmp_ptr; | |||
| } | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::DEFAULT>:: | |||
| copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| WorkspaceBundle bundle_thread, const StrategyParam& sparam) { | |||
| const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( | |||
| param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); | |||
| bias_ctype* bias_temp_ptr = | |||
| static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread)); | |||
| if (param.bias_mode == megdnn::BiasMode::BIAS) { | |||
| bias_ctype* copy_dst = bias_temp_ptr; | |||
| const bias_ctype* copy_src = bias_ptr + | |||
| sparam.oc_cur_index * sparam.ohw + | |||
| sparam.ohw_cur_index; | |||
| for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) { | |||
| std::memcpy(copy_dst, copy_src, | |||
| sizeof(bias_ctype) * sparam.output_block_size); | |||
| copy_dst += sparam.output_block_size; | |||
| copy_src += sparam.ohw; | |||
| } | |||
| } | |||
| } | |||
//! Explicit instantiations of the DEFAULT-pack Strategy for every supported
//! (src, bias, dst, op-ctype, op-dtype, postprocess-mode) combination.
#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype,     \
                         _op_dtype, _postprocess_mode)                       \
    template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, _op_ctype,  \
                            _op_dtype, _postprocess_mode, PackMode::DEFAULT>;
//! float32
INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
                 megdnn::PostprocessMode::FLOAT)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
//! fp16 with native half-precision vector arithmetic
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
                 megdnn::PostprocessMode::FLOAT)
#else
#if !MEGDNN_DISABLE_FLOAT16
//! fp16 storage without native half arithmetic: no postprocess conversion
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 do not have uint8 matmul so only armv7 armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
//! int8 variants (quantized and plain integer)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
#undef INSTANTIAL_CLASS
| } // namespace megdnn | |||
| @@ -0,0 +1,343 @@ | |||
| /** | |||
| * \file dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/fallback/conv_bias/im2col/strategy_base.h" | |||
| #include "src/fallback/convolution/img2col_helper.h" | |||
| #if MEGDNN_X86 | |||
| #include "src/x86/conv_bias/postprocess_helper.h" | |||
| #endif | |||
| using namespace megdnn; | |||
| #if MEGDNN_X86 | |||
| using namespace x86; | |||
| #endif | |||
| namespace megdnn { | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::NO_PACK>::Strategy() | |||
| : StrategyBase() {} | |||
//! NO_PACK variant: identical padding copy as the DEFAULT strategy -- write
//! one input channel into an (IH + 2*PH) x (IW + 2*PW) workspace image with
//! the border filled by the source zero point.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        copy_padding_kern(
                WorkspaceBundle bundle,
                const fallback::ConvBiasImpl::NCBKernParam& param,
                const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    //! only IH/IW/IC and the paddings are used below
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OC);
    MEGDNN_MARK_USED_VAR(OH);
    MEGDNN_MARK_USED_VAR(OW);
    MEGDNN_MARK_USED_VAR(FH);
    MEGDNN_MARK_USED_VAR(FW);
    MEGDNN_MARK_USED_VAR(SH);
    MEGDNN_MARK_USED_VAR(SW);
    size_t IW2 = IW + 2 * PW;  //!< padded input width
    size_t IH2 = IH + 2 * PH;  //!< padded input height
    //! ndrange layout is (batch, group, channel)
    size_t batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    size_t channel_id = ncb_index.ndrange_id[2];
    size_t padding_group_size = IH2 * IW2 * IC;
    size_t workspace_channel_offset = IH2 * IW2 * channel_id;
    size_t workspace_group_offset = group_id * padding_group_size;
    size_t workspace_batch_offset =
            param.filter_meta.group * batch_id * padding_group_size;
    bundle.set(param.workspace_ptr);
    //! padding value: zero point for asymmetric quantized input, 0 otherwise
    src_ctype src_zp = static_cast<src_ctype>(0);
    if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
        src_zp = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
    }
    src_ctype* src = const_cast<src_ctype*>(
            param.src<src_ctype>(batch_id, group_id, channel_id));
    src_ctype* src2;
    src2 = static_cast<src_ctype*>(bundle.get(BUNDLE_PADDING_INDEX)) +
           workspace_group_offset + workspace_batch_offset +
           workspace_channel_offset;
    src_ctype* src2_ptr = src2;
    const src_ctype* src_ptr = src;
    //! top border; per-byte memset is exact since non-zero src_zp only
    //! occurs for 1-byte Quantized8Asymm elements
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
    //! each source row: left border, payload, right border
    rep(ih, IH) {
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
        std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW);
        src2_ptr += IW;
        src_ptr += IW;
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
    }
    //! bottom border
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
}
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::NO_PACK>:: | |||
| packA_kern(WorkspaceBundle bundle, | |||
| const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| fallback::MatrixMulImpl::KernSizeParam matmulparam, | |||
| fallback::MatrixMulImpl::AlgoBase* matmul_algo, | |||
| const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) { | |||
| MEGDNN_MARK_USED_VAR(bundle); | |||
| MEGDNN_MARK_USED_VAR(param); | |||
| MEGDNN_MARK_USED_VAR(matmulparam); | |||
| MEGDNN_MARK_USED_VAR(matmul_algo); | |||
| MEGDNN_MARK_USED_VAR(ncb_index); | |||
| megdnn_throw( | |||
| "nopack mode should not call packA_kern please check your code"); | |||
| } | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::NO_PACK>:: | |||
| get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread, | |||
| const StrategyParam& sparam) { | |||
| if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) { | |||
| return static_cast<bias_ctype*>( | |||
| bundle_thread.get(THREAD_BUNDLE_MATMULDST_INDEX)); | |||
| } else { | |||
| bias_ctype* dst = | |||
| param.dst<bias_ctype>(sparam.batch_id, sparam.group_id) + | |||
| sparam.oc_cur_index * sparam.ohw; | |||
| return static_cast<void*>(dst); | |||
| } | |||
| } | |||
//! NO_PACK matmul: feed the raw filter slice (matrix A) and the im2col
//! buffer (matrix B) directly to the full matmul kernel, giving it a
//! thread-local compute workspace; no pre-packed panels are involved.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
                    const StrategyParam& sparam, WorkspaceBundle bundle,
                    WorkspaceBundle bundle_thread,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                    const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    MEGDNN_MARK_USED_VAR(bundle);
    MEGDNN_MARK_USED_VAR(ncb_index);
    matmul_param.workspace_ptr = bundle_thread.get(THREAD_BUNDLE_MATCOMP_INDEX);
    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    //! A is this group's filter, advanced to the current oc block
    //! (icpg * fh * fw elements per output channel)
    const void* filter = param.filter<src_ctype>(sparam.group_id) +
                         sparam.oc_cur_index * param.filter_meta.icpg *
                                 param.filter_meta.spatial[0] *
                                 param.filter_meta.spatial[1];
    matmul_param.M = sparam.output_block_oc_size;
    matmul_param.N = sparam.output_block_size;
    matmul_param.LDB = sparam.output_block_size;
    matmul_param.LDC = sparam.output_block_size;
    matmul_param.A_ptr = filter;
    matmul_param.B_ptr = im2col_dst;
    matmul_param.C_ptr = matmul_dst;
    auto matmul_kern = matmul_algo->get_kern(matmul_param);
    matmul_kern(matmul_param);
}
//! NO_PACK im2col: fill the thread-local im2col buffer for the current
//! output tile. Unlike the DEFAULT strategy, no pack_B follows -- the raw
//! buffer is consumed by exec_matmul directly.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::NO_PACK>::
        exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
                    const StrategyParam& sparam,
                    const fallback::ConvBiasImpl::NCBKernParam& param,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo) {
    //! both matmul arguments are unused in the no-pack im2col step
    MEGDNN_MARK_USED_VAR(matmul_param);
    MEGDNN_MARK_USED_VAR(matmul_algo);
    size_t m_sh = param.filter_meta.stride[0];
    size_t m_sw = param.filter_meta.stride[1];
    size_t m_oc = param.filter_meta.ocpg;
    size_t m_oh = param.osz[0];
    size_t m_ow = param.osz[1];
    size_t m_ic = param.filter_meta.icpg;
    //! input sizes after padding
    size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2;
    size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2;
    size_t m_fh = param.filter_meta.spatial[0];
    size_t m_fw = param.filter_meta.spatial[1];
    size_t m_is_xcorr = !param.filter_meta.should_flip;
    //! byte offset of this (batch, group) image inside the padding workspace
    size_t input_offset =
            m_ih * m_iw * m_ic *
            (sparam.group_id + param.filter_meta.group * sparam.batch_id) *
            sizeof(src_ctype);
    src_ctype* src2 = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PADDING_INDEX)) +
            input_offset);
    //! with zero padding, read straight from the original input
    bool is_phpwzero = param.filter_meta.padding[0] == 0 &&
                       param.filter_meta.padding[1] == 0;
    if (is_phpwzero) {
        src2 = const_cast<src_ctype*>(
                param.src<src_ctype>(sparam.batch_id, sparam.group_id));
    }
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    //! stride-1 fast path; template flag selects xcorr vs flipped filter
    if (m_sh == 1 && m_sw == 1) {
        if (m_is_xcorr) {
            img2col<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                          m_fh, m_fw, sparam.ohw_cur_index,
                          sparam.output_block_size);
        } else {
            img2col<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                           m_fh, m_fw, sparam.ohw_cur_index,
                           sparam.output_block_size);
        }
    } else {
        if (m_is_xcorr) {
            img2col_stride<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih,
                                 m_iw, m_fh, m_fw, m_sh, m_sw,
                                 sparam.ohw_cur_index,
                                 sparam.output_block_size);
        } else {
            img2col_stride<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic,
                                  m_ih, m_iw, m_fh, m_fw, m_sh, m_sw,
                                  sparam.ohw_cur_index,
                                  sparam.output_block_size);
        }
    }
}
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::NO_PACK>:: | |||
| exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const StrategyParam& sparam, | |||
| WorkspaceBundle bundle_thread) { | |||
| copy_bias(param, bundle_thread, sparam); | |||
| void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam); | |||
| const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( | |||
| param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); | |||
| bias_ctype* bias_temp_ptr = | |||
| static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread)); | |||
| PostProcess<op_ctype, op_dtype, postprocess_mode>::run( | |||
| matmul_dst, | |||
| const_cast<void*>( | |||
| param.bias_mode == megdnn::BiasMode::BIAS | |||
| ? bias_temp_ptr | |||
| : static_cast<void*>(const_cast<bias_ctype*>( | |||
| bias_ptr + sparam.oc_cur_index))), | |||
| matmul_dst, param.bias_mode, param.nonlineMode, param.bias_type, | |||
| param.dst_type, 1_z, sparam.output_block_oc_size, 1_z, | |||
| sparam.output_block_size); | |||
| copy_dst(param, matmul_dst, sparam); | |||
| } | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::NO_PACK>:: | |||
| copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const void* matmul_dst, const StrategyParam& sparam) { | |||
| if (!sparam.skip_copy_dst) { | |||
| dst_ctype* dst_tmp_ptr = | |||
| reinterpret_cast<dst_ctype*>(const_cast<void*>(matmul_dst)); | |||
| dst_ctype* dst = | |||
| param.dst<dst_ctype>(sparam.batch_id, sparam.group_id) + | |||
| sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index; | |||
| for (size_t oc = 0; oc < sparam.output_block_oc_size; oc++) { | |||
| std::memcpy(dst, dst_tmp_ptr, | |||
| sizeof(dst_ctype) * sparam.output_block_size); | |||
| dst_tmp_ptr += sparam.output_block_size; | |||
| dst += sparam.ohw; | |||
| } | |||
| } | |||
| } | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::NO_PACK>:: | |||
| copy_bias(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| WorkspaceBundle bundle_thread, const StrategyParam& sparam) { | |||
| const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( | |||
| param.bias<bias_ctype>(sparam.batch_id, sparam.group_id)); | |||
| bias_ctype* bias_temp_ptr = | |||
| static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread)); | |||
| if (param.bias_mode == megdnn::BiasMode::BIAS) { | |||
| bias_ctype* copy_dst = bias_temp_ptr; | |||
| const bias_ctype* copy_src = bias_ptr + | |||
| sparam.oc_cur_index * sparam.ohw + | |||
| sparam.ohw_cur_index; | |||
| for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) { | |||
| std::memcpy(copy_dst, copy_src, | |||
| sizeof(bias_ctype) * sparam.output_block_size); | |||
| copy_dst += sparam.output_block_size; | |||
| copy_src += sparam.ohw; | |||
| } | |||
| } | |||
| } | |||
| #define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ | |||
| _op_dtype, _postprocess_mode) \ | |||
| template class Strategy<_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \ | |||
| _op_dtype, _postprocess_mode, PackMode::NO_PACK>; | |||
| INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32, | |||
| megdnn::PostprocessMode::FLOAT) | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16, | |||
| megdnn::PostprocessMode::FLOAT) | |||
| #else | |||
| #if !MEGDNN_DISABLE_FLOAT16 | |||
| INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16, | |||
| megdnn::PostprocessMode::NO_PROCESS) | |||
| #endif | |||
| #endif | |||
| #if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
| //! x86 do not have uint8 matmul so only armv7 armv8 support uint8 | |||
| INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8, | |||
| megdnn::PostprocessMode::QUANTIZED) | |||
| INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32, | |||
| megdnn::PostprocessMode::NO_PROCESS) | |||
| #endif | |||
| INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8, | |||
| megdnn::PostprocessMode::QUANTIZED) | |||
| INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32, | |||
| megdnn::PostprocessMode::NO_PROCESS) | |||
| INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16, | |||
| megdnn::PostprocessMode::NO_PROCESS) | |||
| INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32, | |||
| megdnn::PostprocessMode::NO_PROCESS) | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,349 @@ | |||
| /** | |||
| * \file dnn/src/fallback/conv_bias/im2col/algos.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/fallback/conv_bias/im2col/strategy_base.h" | |||
| #include "src/fallback/convolution/img2col_helper.h" | |||
| #if MEGDNN_X86 | |||
| #include "src/x86/conv_bias/postprocess_helper.h" | |||
| #endif | |||
| using namespace megdnn; | |||
| #if MEGDNN_X86 | |||
| using namespace x86; | |||
| #endif | |||
| namespace megdnn { | |||
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::ONLY_PACKA>::Strategy() | |||
| : StrategyBase() {} | |||
//! ONLY_PACKA variant: identical padding copy as the other pack modes --
//! write one input channel into an (IH + 2*PH) x (IW + 2*PW) workspace image
//! with the border filled by the source zero point.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        copy_padding_kern(
                WorkspaceBundle bundle,
                const fallback::ConvBiasImpl::NCBKernParam& param,
                const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    UNPACK_CONV_F32_NCB_KERN_SIZES(param);
    //! only IH/IW/IC and the paddings are used below
    MEGDNN_MARK_USED_VAR(N);
    MEGDNN_MARK_USED_VAR(OC);
    MEGDNN_MARK_USED_VAR(OH);
    MEGDNN_MARK_USED_VAR(OW);
    MEGDNN_MARK_USED_VAR(FH);
    MEGDNN_MARK_USED_VAR(FW);
    MEGDNN_MARK_USED_VAR(SH);
    MEGDNN_MARK_USED_VAR(SW);
    size_t IW2 = IW + 2 * PW;  //!< padded input width
    size_t IH2 = IH + 2 * PH;  //!< padded input height
    //! ndrange layout is (batch, group, channel)
    size_t batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    size_t channel_id = ncb_index.ndrange_id[2];
    size_t padding_group_size = IH2 * IW2 * IC;
    size_t workspace_channel_offset = IH2 * IW2 * channel_id;
    size_t workspace_group_offset = group_id * padding_group_size;
    size_t workspace_batch_offset =
            param.filter_meta.group * batch_id * padding_group_size;
    bundle.set(param.workspace_ptr);
    //! padding value: zero point for asymmetric quantized input, 0 otherwise
    src_ctype src_zp = static_cast<src_ctype>(0);
    if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
        src_zp = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
    }
    src_ctype* src = const_cast<src_ctype*>(
            param.src<src_ctype>(batch_id, group_id, channel_id));
    src_ctype* src2;
    src2 = static_cast<src_ctype*>(bundle.get(BUNDLE_PADDING_INDEX)) +
           workspace_group_offset + workspace_batch_offset +
           workspace_channel_offset;
    src_ctype* src2_ptr = src2;
    const src_ctype* src_ptr = src;
    //! top border; per-byte memset is exact since non-zero src_zp only
    //! occurs for 1-byte Quantized8Asymm elements
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
    //! each source row: left border, payload, right border
    rep(ih, IH) {
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
        std::memcpy(src2_ptr, src_ptr, sizeof(src_ctype) * IW);
        src2_ptr += IW;
        src_ptr += IW;
        if (PW != 0)
            rep(pw, PW) * (src2_ptr++) = src_zp;
    }
    //! bottom border
    if (PH != 0) {
        std::memset(src2_ptr, src_zp, sizeof(src_ctype) * PH * IW2);
        src2_ptr += PH * IW2;
    }
}
//! Packs one OC tile of the group's filter into the matmul A-panel layout.
//!
//! Kern grid: ndrange_id[0] = group index, ndrange_id[1] = OC-tile index.
//! The packed panel for (group, tile) lands at
//!   BUNDLE_PACKA_INDEX base + group * packA_group_size + tile * panel_size,
//! where panel_size is the matmul algo's own pack-A bundle size.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        packA_kern(WorkspaceBundle bundle,
                   const fallback::ConvBiasImpl::NCBKernParam& param,
                   fallback::MatrixMulImpl::KernSizeParam matmulparam,
                   fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                   const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    bundle.set(param.workspace_ptr);
    fallback::MatrixMulImpl::KernParam matmul_param;
    //! KernParam derives from KernSizeParam: copy the size portion; the
    //! pointer fields are filled in below.
    static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
            matmulparam;
    size_t OC = param.filter_meta.ocpg;
    size_t oc_tile_size = matmul_param.M;  //! rows packed per tile
    size_t group_id = ncb_index.ndrange_id[0];
    //! the last tile may cover fewer than oc_tile_size output channels
    size_t output_block_oc_size =
            std::min(oc_tile_size, OC - ncb_index.ndrange_id[1] * oc_tile_size);
    size_t oc_cur_index = ncb_index.ndrange_id[1] * oc_tile_size;
    size_t packA_group_size =
            bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
    size_t a_panel_offset = ncb_index.ndrange_id[1] *
                            matmul_algo->get_bundle(matmul_param).get_size(0);
    int8_t* a_panel = static_cast<int8_t*>(bundle.get(BUNDLE_PACKA_INDEX)) +
                      group_id * packA_group_size + a_panel_offset;
    //! the filter stores K contiguous weights per output channel, so the
    //! current tile starts at oc_cur_index * K
    matmul_param.A_ptr =
            const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) +
            oc_cur_index * matmul_param.K;
    matmul_param.M = output_block_oc_size;
    matmul_algo->pack_A(matmul_param, a_panel, 0_z, 0_z);
}
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void* Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::ONLY_PACKA>:: | |||
| get_matmul_dst_ptr(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const WorkspaceBundle& bundle_thread, | |||
| const StrategyParam& sparam) { | |||
| if (sparam.is_dst_8bit || !sparam.is_ohw_size_bigger) { | |||
| return static_cast<void*>( | |||
| bundle_thread.get(THREAD_BUNDLE_MATMULDST_INDEX)); | |||
| } else { | |||
| bias_ctype* dst = | |||
| param.dst<bias_ctype>(sparam.batch_id, sparam.group_id) + | |||
| sparam.oc_cur_index * sparam.ohw; | |||
| return static_cast<void*>(dst); | |||
| } | |||
| } | |||
//! Runs the matmul for the current (group, OC tile, output block):
//! A = the pre-packed filter panel, B = the im2col'd input block,
//! C = either the user dst or a thread-local temporary (see
//! get_matmul_dst_ptr).
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        exec_matmul(const fallback::ConvBiasImpl::NCBKernParam& param,
                    const StrategyParam& sparam, WorkspaceBundle bundle,
                    WorkspaceBundle bundle_thread,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo,
                    const fallback::ConvBiasImpl::NCBKernIndex& ncb_index) {
    size_t packA_group_size =
            bundle.get_size(BUNDLE_PACKA_INDEX) / param.filter_meta.group;
    //! ndrange_id[3] selects the OC tile; its panel was written by
    //! packA_kern at the matching offset (same layout as in packA_kern)
    size_t a_panel_offset = ncb_index.ndrange_id[3] *
                            matmul_algo->get_bundle(matmul_param).get_size(0);
    a_panel_offset = sparam.group_id * packA_group_size + a_panel_offset;
    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
    src_ctype* a_panel = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PACKA_INDEX)) +
            a_panel_offset);
    //! ONLY_PACKA mode: B is used unpacked, so no B panel is needed
    src_ctype* b_panel = nullptr;
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    //! M = OC rows of this tile, N = spatial positions in this output block;
    //! B and C are row-major with the output block as leading dimension
    matmul_param.M = sparam.output_block_oc_size;
    matmul_param.N = sparam.output_block_size;
    matmul_param.LDB = sparam.output_block_size;
    matmul_param.LDC = sparam.output_block_size;
    matmul_param.B_ptr = im2col_dst;
    matmul_param.C_ptr = matmul_dst;
    //! "naked" kern: packing is supplied by the caller via a_panel/b_panel
    auto matmul_kern = matmul_algo->get_kern_naked(matmul_param);
    matmul_kern(matmul_param, a_panel, b_panel);
}
//! Materializes the im2col matrix (B operand) for the current output block
//! into the thread-local buffer.
//!
//! If padding is zero the source tensor is used directly; otherwise the
//! padded copy produced by copy_padding_kern is read. Dispatch is over
//! (stride == 1, is_xcorr) so each img2col variant is compiled with
//! compile-time template flags.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        exec_im2col(WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
                    const StrategyParam& sparam,
                    const fallback::ConvBiasImpl::NCBKernParam& param,
                    fallback::MatrixMulImpl::KernParam matmul_param,
                    fallback::MatrixMulImpl::AlgoBase* matmul_algo) {
    MEGDNN_MARK_USED_VAR(matmul_param);
    MEGDNN_MARK_USED_VAR(matmul_algo);
    size_t m_sh = param.filter_meta.stride[0];
    size_t m_sw = param.filter_meta.stride[1];
    size_t m_oc = param.filter_meta.ocpg;
    size_t m_oh = param.osz[0];
    size_t m_ow = param.osz[1];
    size_t m_ic = param.filter_meta.icpg;
    //! padded input spatial sizes (match the buffer copy_padding_kern wrote)
    size_t m_ih = param.isz[0] + param.filter_meta.padding[0] * 2;
    size_t m_iw = param.isz[1] + param.filter_meta.padding[1] * 2;
    size_t m_fh = param.filter_meta.spatial[0];
    size_t m_fw = param.filter_meta.spatial[1];
    //! cross-correlation when the filter should not be flipped
    size_t m_is_xcorr = !param.filter_meta.should_flip;
    //! byte offset of this (batch, group) image inside the padding buffer
    size_t input_offset =
            m_ih * m_iw * m_ic *
            (sparam.group_id + param.filter_meta.group * sparam.batch_id) *
            sizeof(src_ctype);
    src_ctype* src2 = reinterpret_cast<src_ctype*>(
            reinterpret_cast<uintptr_t>(bundle.get(BUNDLE_PADDING_INDEX)) +
            input_offset);
    //! with zero padding the original src can be read in place, skipping the
    //! padded copy entirely
    bool is_phpwzero = param.filter_meta.padding[0] == 0 &&
                       param.filter_meta.padding[1] == 0;
    if (is_phpwzero) {
        src2 = const_cast<src_ctype*>(
                param.src<src_ctype>(sparam.batch_id, sparam.group_id));
    }
    src_ctype* im2col_dst = static_cast<src_ctype*>(
            bundle_thread.get(THREAD_BUNDLE_IM2COL_INDEX));
    if (m_sh == 1 && m_sw == 1) {
        if (m_is_xcorr) {
            img2col<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                          m_fh, m_fw, sparam.ohw_cur_index,
                          sparam.output_block_size);
        } else {
            img2col<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih, m_iw,
                           m_fh, m_fw, sparam.ohw_cur_index,
                           sparam.output_block_size);
        }
    } else {
        if (m_is_xcorr) {
            img2col_stride<true>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic, m_ih,
                                 m_iw, m_fh, m_fw, m_sh, m_sw,
                                 sparam.ohw_cur_index,
                                 sparam.output_block_size);
        } else {
            img2col_stride<false>(src2, im2col_dst, m_oc, m_oh, m_ow, m_ic,
                                  m_ih, m_iw, m_fh, m_fw, m_sh, m_sw,
                                  sparam.ohw_cur_index,
                                  sparam.output_block_size);
        }
    }
}
//! Applies bias + nonlinearity (PostProcess) to the matmul result of the
//! current output block, then copies it to dst if a temporary was used.
//!
//! For BiasMode::BIAS (full per-element bias) the relevant bias window is
//! first gathered into a contiguous temporary so PostProcess can treat it as
//! a dense (oc_block x output_block) tile; for per-channel bias the bias
//! pointer is simply advanced to the current OC offset.
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
          typename op_ctype, typename op_dtype,
          megdnn::PostprocessMode postprocess_mode>
void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype,
              postprocess_mode, PackMode::ONLY_PACKA>::
        exec_postprocess(const fallback::ConvBiasImpl::NCBKernParam& param,
                         const StrategyParam& sparam,
                         WorkspaceBundle bundle_thread) {
    void* matmul_dst = get_matmul_dst_ptr(param, bundle_thread, sparam);
    const bias_ctype* bias_ptr = static_cast<const bias_ctype*>(
            param.bias<bias_ctype>(sparam.batch_id, sparam.group_id));
    bias_ctype* bias_temp_ptr =
            static_cast<bias_ctype*>(get_bias_temp_ptr(param, bundle_thread));
    if (param.bias_mode == megdnn::BiasMode::BIAS) {
        //! gather one output_block_size-wide slice per output channel of the
        //! current tile into a contiguous buffer
        bias_ctype* copy_dst = bias_temp_ptr;
        const bias_ctype* copy_src = bias_ptr +
                                     sparam.oc_cur_index * sparam.ohw +
                                     sparam.ohw_cur_index;
        for (size_t oc = sparam.oc_cur_index; oc < sparam.oc_end_index; oc++) {
            std::memcpy(copy_dst, copy_src,
                        sizeof(bias_ctype) * sparam.output_block_size);
            copy_dst += sparam.output_block_size;
            copy_src += sparam.ohw;
        }
    }
    //! in-place postprocess: matmul_dst is both input and output. The bias
    //! argument is the gathered temporary for BIAS mode, otherwise the
    //! per-channel bias advanced to the current OC offset.
    PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
            matmul_dst,
            const_cast<void*>(
                    param.bias_mode == megdnn::BiasMode::BIAS
                            ? bias_temp_ptr
                            : static_cast<void*>(const_cast<bias_ctype*>(
                                      bias_ptr + sparam.oc_cur_index))),
            matmul_dst, param.bias_mode, param.nonlineMode, param.bias_type,
            param.dst_type, 1_z, sparam.output_block_oc_size, 1_z,
            sparam.output_block_size);
    //! flush the (possibly temporary) result into the user dst tensor
    copy_dst(param, matmul_dst, sparam);
}
| template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||
| typename op_ctype, typename op_dtype, | |||
| megdnn::PostprocessMode postprocess_mode> | |||
| void Strategy<src_ctype, bias_ctype, dst_ctype, op_ctype, op_dtype, | |||
| postprocess_mode,PackMode::ONLY_PACKA>:: | |||
| copy_dst(const fallback::ConvBiasImpl::NCBKernParam& param, | |||
| const void* matmul_dst, const StrategyParam& sparam) { | |||
| if (!sparam.skip_copy_dst) { | |||
| dst_ctype* dst_tmp_ptr = | |||
| reinterpret_cast<dst_ctype*>(const_cast<void*>(matmul_dst)); | |||
| dst_ctype* dst = | |||
| param.dst<dst_ctype>(sparam.batch_id, sparam.group_id) + | |||
| sparam.oc_cur_index * sparam.ohw + sparam.ohw_cur_index; | |||
| for (size_t oc = 0; oc < sparam.output_block_oc_size; oc++) { | |||
| std::memcpy(dst, dst_tmp_ptr, | |||
| sizeof(dst_ctype) * sparam.output_block_size); | |||
| dst_tmp_ptr += sparam.output_block_size; | |||
| dst += sparam.ohw; | |||
| } | |||
| } | |||
| } | |||
//! Explicitly instantiate the ONLY_PACKA Strategy for every supported
//! (src, bias, dst, op_ctype, op_dtype, postprocess) combination. The member
//! definitions above live in this .cpp, so each combination used elsewhere
//! needs an explicit instantiation here.
#define INSTANTIAL_CLASS(_src_ctype, _bias_ctype, _dst_ctype, _op_ctype, \
                         _op_dtype, _postprocess_mode)                   \
    template class Strategy<_src_ctype, _bias_ctype, _dst_ctype,         \
                            _op_ctype, _op_dtype, _postprocess_mode,PackMode::ONLY_PACKA>;

INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
                 megdnn::PostprocessMode::FLOAT)
//! fp16 path: use native __fp16 arithmetic when the target supports it,
//! otherwise a plain (no-postprocess) dt_float16 instantiation
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
                 megdnn::PostprocessMode::FLOAT)
#else
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif

#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 do not have uint8 matmul so only armv7 armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
#endif

INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int8, dt_qint32, dt_qint8,
                 megdnn::PostprocessMode::QUANTIZED)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
                 megdnn::PostprocessMode::NO_PROCESS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_qint32, dt_qint32,
                 megdnn::PostprocessMode::NO_PROCESS)
#undef INSTANTIAL_CLASS
| } // namespace megdnn | |||
| @@ -8,7 +8,6 @@ | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include <cstddef> | |||
| #include "src/common/utils.h" | |||
| namespace { | |||
| @@ -42,7 +41,8 @@ void img2col_stride(const dtype* __restrict src, dtype* __restrict dst, | |||
| } | |||
| } | |||
| //! add for im2col matmul multithread | |||
| //!add for im2col matmul multithread | |||
| template <bool is_xcorr, typename dtype> | |||
| void img2col_stride(const dtype* __restrict src, dtype* __restrict dst, | |||
| const int OC, const int OH, const int OW, const int IC, | |||
| @@ -323,6 +323,7 @@ struct UnaryOpBase<SIMDType::NONE, dt_qint32, dt_qint8> | |||
| init(src_scale, dst_scale); | |||
| } | |||
| }; | |||
| template <> | |||
| struct UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8> | |||
| : OpBase<dt_qint32, dt_quint8> { | |||
| @@ -330,20 +331,24 @@ struct UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8> | |||
| using src_ctype = dt_qint32; | |||
| using dst_ctype = dt_quint8; | |||
| float scale, scale_src, scale_dst; | |||
| void init(float src_scale, float dst_scale) { | |||
| uint8_t dzp; | |||
| void init(float src_scale, float dst_scale, uint8_t dst_zp) { | |||
| scale_src = src_scale; | |||
| scale_dst = 1.f / dst_scale; | |||
| scale_dst = 1.0f / dst_scale; | |||
| dzp = dst_zp; | |||
| scale = src_scale / dst_scale; | |||
| } | |||
| UnaryOpBase(DType src_dtype, DType dst_dtype) { | |||
| float src_scale = src_dtype.param<dtype::QuantizedS32>().scale; | |||
| float dst_scale = dst_dtype.param<dtype::QuantizedS8>().scale; | |||
| init(src_scale, dst_scale); | |||
| float dst_scale = dst_dtype.param<dtype::Quantized8Asymm>().scale; | |||
| uint8_t dst_zp = dst_dtype.param<dtype::Quantized8Asymm>().zero_point; | |||
| init(src_scale, dst_scale, dst_zp); | |||
| } | |||
| UnaryOpBase(float src_scale, float dst_scale) { | |||
| init(src_scale, dst_scale); | |||
| UnaryOpBase(float src_scale, float dst_scale, uint8_t dst_zp) { | |||
| init(src_scale, dst_scale, dst_zp); | |||
| } | |||
| }; | |||
| #define OP_BASE(_simd_type, _simd_target, _simd_data_type, _func_prefix) \ | |||
| template <> \ | |||
| struct UnaryOpBase<_simd_type, dt_float32, dt_qint8> \ | |||
| @@ -828,7 +833,6 @@ template <typename Op> | |||
| struct UnaryQuantizationOp<SIMDType::NONE, dt_qint32, dt_quint8, Op> | |||
| : UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8> { | |||
| using UnaryOpBase<SIMDType::NONE, dt_qint32, dt_quint8>::UnaryOpBase; | |||
| constexpr static size_t SIMD_WIDTH = 8; | |||
| Op op; | |||
| void operator()(const dt_qint32& src, dt_quint8* dst) const { | |||
| @@ -195,10 +195,10 @@ MatrixMulImpl::kern_t MatrixMulImpl::AlgoInt8x8x32Vnni::get_kern( | |||
| return int8x8x32_kern_vnni; | |||
| } | |||
| MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(AlgoInt8x8x32Vnni, | |||
| megdnn_x86_matmul_kern, 5, | |||
| x86::matmul::gemm_int8_vnni_12x32x4, | |||
| dt_int8, dt_int32, dt_uint8); | |||
| MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(AlgoInt8x8x32Vnni, | |||
| megdnn_x86_matmul_kern, 5, | |||
| x86::matmul::gemm_int8_vnni_12x32x4, | |||
| dt_int8, dt_int32, dt_uint8); | |||
| #endif | |||
| /* ===================== Int8 mkldnn algo ===================== */ | |||
| @@ -364,7 +364,9 @@ size_t MatrixMulImpl::AlgoInt8x8x32AVX2M4N16K2::get_workspace( | |||
| m, n, k, trans_a, trans_b, strategy, cacheline) | |||
| .get_workspace_size(); | |||
| } | |||
| MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( | |||
| AlgoInt8x8x32AVX2M4N16K2, megdnn_x86_matmul_kern, 8, | |||
| x86::matmul::gemm_avx2_s8s8s32_4x16x2, dt_int8, dt_int32, dt_int16); | |||
| MatrixMulImpl::kern_t MatrixMulImpl::AlgoInt8x8x32AVX2M2N4K16::get_kern( | |||
| const KernSizeParam&) const { | |||
| @@ -437,6 +439,10 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace( | |||
| .get_workspace_size(); | |||
| } | |||
| MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( | |||
| AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9, | |||
| x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16); | |||
| /*************************AlgoF32MK8_8x8********************/ | |||
| MatrixMulImpl::kern_t MatrixMulImpl::AlgoF32MK8_8x8::get_kern( | |||
| const KernSizeParam&) const { | |||
| @@ -68,7 +68,7 @@ public: | |||
| size_t get_workspace(const KernSizeParam&) const override; | |||
| kern_t get_kern(const KernSizeParam&) const override; | |||
| void* type() const override { return sm_x86_algo_type; } | |||
| PackMode packmode() const override { return PackMode::NO_PACK; } | |||
| MEGDNN_REG_GEMM_FUNC_FOR_IM2COL(); | |||
| }; | |||
| class MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2 : public AlgoBase { | |||
| @@ -79,7 +79,7 @@ public: | |||
| size_t get_workspace(const KernSizeParam&) const override; | |||
| kern_t get_kern(const KernSizeParam&) const override; | |||
| void* type() const override { return sm_x86_algo_type; } | |||
| PackMode packmode() const override { return PackMode::NO_PACK; } | |||
| MEGDNN_REG_GEMM_FUNC_FOR_IM2COL(); | |||
| }; | |||
| class MatrixMulImpl::AlgoF32MK8_8x8 : public AlgoBase { | |||
| @@ -741,7 +741,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { | |||
| TensorShape{oc, ic, kernel, kernel}, TensorShape{}); | |||
| }; | |||
| for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) | |||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
| for (size_t ic : {1, 4, 8, 16}) | |||
| for (size_t oc : {1, 4, 8}) | |||
| for (size_t p : {0, 2}) | |||
| @@ -751,7 +751,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_INT8x8x32) { | |||
| run(oc, ic, size, size, kernel, p, nonline_mode); | |||
| } | |||
| //! test OC block | |||
| run(2046, 1, 8, 8, 1, 0, NonlineMode::IDENTITY); | |||
| run(2046, 1, 8, 8, 2, 0, NonlineMode::IDENTITY); | |||
| Checker<ConvBias> checker(handle()); | |||
| UniformIntRNG rng{-50, 50}; | |||
| @@ -826,7 +826,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32) { | |||
| (w + 2 * p - kernel) / param.stride_w + 1}); | |||
| }; | |||
| for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) | |||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
| for (size_t ic : {1, 4, 8, 16}) | |||
| for (size_t oc : {1, 4, 8, 16, 300}) | |||
| for (size_t p : {0, 2}) | |||
| @@ -895,7 +895,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) { | |||
| (w + 2 * param.pad_w - kernel) / 1 + 1}); | |||
| }; | |||
| for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) | |||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
| for (size_t ic : {1, 4, 8, 16}) | |||
| for (size_t oc : {1, 4, 8, 16}) | |||
| for (size_t p : {0, 1}) | |||
| @@ -945,7 +945,7 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { | |||
| TensorShape{1, oc, 1, 1}); | |||
| }; | |||
| for (size_t kernel : {1, 2, 3, 4, 5, 6, 7}) | |||
| for (size_t kernel : {2, 3, 4, 5, 6, 7}) | |||
| for (size_t ic : {1, 4, 8, 16}) | |||
| for (size_t oc : {1, 4, 8}) | |||
| for (size_t p : {0, 2}) | |||
| @@ -2183,7 +2183,7 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) { | |||
| std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(), | |||
| dtype::Int32(), dtype::Int32()}; | |||
| std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2"; | |||
| std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2:192"; | |||
| // std::string algo_name = "IM2COLMATMUL:X86_INT8X8X32_AVX2_2X4X16"; | |||
| // printf("Benchmark IM2COLMATMUL:X86_INT8X8X32_AVX2_4X16X2 algo\n"); | |||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||