GitOrigin-RevId: d99ef7efcd
tags/v0.5.0

@@ -445,7 +445,8 @@ pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0)
     'uint32',
     Doc('output_block_size', 'output block size, detail meaning see winograd '
         'in convbias, equals to the meaning of m in F(m, r)'), 0).
-    add_enum_alias('Format', 'MatrixMul')
+    add_enum_alias('Format', 'MatrixMul').
+    add_enum_alias('ComputeMode', 'Convolution', name_field='compute_mode')
     )
 (pdef('SVD').
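The new `ComputeMode` alias exposes a `compute_mode` field on the winograd param. Judging from the `usable` checks and the filter-preprocess dispatch added below, this is what lets an int8 winograd convolution request `FLOAT32` intermediate compute instead of the default int16 path.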
@@ -273,5 +273,167 @@ ConvBiasImpl::AlgoS8WinogradF23_8x8::dispatch_kerns(
     MIDOUT_END();
     return {};
 }
+//=========================== input int8 compute float32 =========
+bool ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::usable(
+        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+    MEGDNN_MARK_USED_VAR(param);
+    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_int8,
+                 midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::usable"_hash)) {
+        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
+            return false;
+        bool is_matmul_usable = false;
+        using Strategy = winograd::winograd_2x3_4x4_s8_f32_nchw44;
+        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
+        is_matmul_usable = m_matmul_algo->usable(
+                megdnn::winograd::ConvBias<Strategy,
+                                           param::MatrixMul::Format::MK4>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg)
+                        .get_matmul_kern_param(param));
+        return is_matmul_usable &&
+               ((opr->param().format == param::ConvBias::Format::NCHW44 &&
+                 param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
+                ((opr->param().format ==
+                  param::ConvBias::Format::NCHW44_WINOGRAD) &&
+                 opr->param().output_block_size == 2 &&
+                 param.winograd_matmul_format ==
+                         param::MatrixMul::Format::MK4)) &&
+               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
+               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
+                param.filter_meta.spatial[0] == 3) &&
+               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
+                param.filter_meta.stride[0] == 1) &&
+               (param.filter_meta.dilation[0] ==
+                        param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               (param.compute_mode == param::ConvBias::ComputeMode::FLOAT32 ||
+                param.compute_mode == param::ConvBias::ComputeMode::DEFAULT) &&
+               param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+               param.bias_type.enumv() == DTypeEnum::QuantizedS32 &&
+               param.dst_type.enumv() == DTypeEnum::QuantizedS8;
+    }
+    MIDOUT_END();
+    return false;
+}
+size_t ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::get_workspace(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv("arm_common_AlgoS8CF32WinogradF23_4x4::get_workspace"_hash)) {
+        winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        return megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_s8_f32_nchw44,
+                                          param::MatrixMul::Format::MK4>(
+                       strategy, m_tile_size, param.nr_threads, param.osz[0],
+                       param.osz[1], param.filter_meta.ocpg)
+                .get_workspace_size(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return 0;
+}
+SmallVector<ConvBiasImpl::NCBKern>
+ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44::dispatch_kerns(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MEGDNN_MARK_USED_VAR(param);
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8CF32WinogradF23_4x4::dispatch_kerns"_hash)) {
+        winograd::winograd_2x3_4x4_s8_f32_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        auto winograd_impl =
+                megdnn::winograd::ConvBias<winograd::winograd_2x3_4x4_s8_f32_nchw44,
+                                           param::MatrixMul::Format::MK4>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg);
+        return winograd_impl.get_kerns(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return {};
+}
+/* ======================= AlgoS8WinogradF23_8x8_NCHW44 ======================== */
+bool ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::usable(
+        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv("arm_common_AlgoS8WinogradF23_8x8_NCHW44::usable"_hash)) {
+        if (param.filter_meta.icpg % 8 != 0 || param.filter_meta.ocpg % 8 != 0)
+            return false;
+        using Strategy = winograd::winograd_2x3_8x8_s8_nchw44;
+        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
+        auto&& matmul_param =
+                megdnn::winograd::ConvBias<Strategy,
+                                           param::MatrixMul::Format::MK8>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg)
+                        .get_matmul_kern_param(param);
+        bool is_matmul_usable = m_matmul_algo->usable(matmul_param);
+        return is_matmul_usable &&
+               ((opr->param().format == param::ConvBias::Format::NCHW44 &&
+                 param.filter_type.enumv() == DTypeEnum::QuantizedS8) ||
+                (opr->param().format == param::ConvBias::Format::NCHW44_WINOGRAD &&
+                 opr->param().output_block_size == 2 &&
+                 param.winograd_matmul_format == param::MatrixMul::Format::MK8 &&
+                 param.filter_type.enumv() == DTypeEnum::QuantizedS16)) &&
+               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
+               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
+                param.filter_meta.spatial[0] == 3) &&
+               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
+                param.filter_meta.stride[0] == 1) &&
+               (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
+               param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
+               param.bias_type.enumv() == DTypeEnum::QuantizedS32 &&
+               param.dst_type.enumv() == DTypeEnum::QuantizedS8;
+    }
+    MIDOUT_END();
+    return false;
+}
+size_t ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::get_workspace(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8WinogradF23_8x8_NCHW44::get_workspace"_hash)) {
+        winograd::winograd_2x3_8x8_s8_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        return megdnn::winograd::ConvBias<winograd::winograd_2x3_8x8_s8_nchw44,
+                                          param::MatrixMul::Format::MK8>(
+                       strategy, m_tile_size, param.nr_threads, param.osz[0],
+                       param.osz[1], param.filter_meta.ocpg)
+                .get_workspace_size(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return 0;
+}
+SmallVector<ConvBiasImpl::NCBKern>
+ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns(
+        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    MIDOUT_BEGIN(
+            megdnn_arm_common_conv_bias_int8,
+            midout_iv(
+                    "arm_common_AlgoS8WinogradF23_8x8_NCHW44::dispatch_kerns"_hash)) {
+        winograd::winograd_2x3_8x8_s8_nchw44 strategy(
+                param.src_type, param.filter_type, param.dst_type);
+        auto winograd_impl =
+                megdnn::winograd::ConvBias<winograd::winograd_2x3_8x8_s8_nchw44,
+                                           param::MatrixMul::Format::MK8>(
+                        strategy, m_tile_size, param.nr_threads, param.osz[0],
+                        param.osz[1], param.filter_meta.ocpg);
+        return winograd_impl.get_kerns(param, m_matmul_algo);
+    }
+    MIDOUT_END();
+    return {};
+}
 // vim: syntax=cpp.doxygen
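For reference, both algorithms above implement F(2, 3) Winograd convolution. A sketch of the identity, with the constants matching the `B^T`/`G`/`A^T` comments in the strategy code later in this diff:

```latex
% F(2,3): each tile turns a 4x4 input patch d and a 3x3 filter g into a 2x2 output
Y = A^{\mathsf T}\bigl[(G\,g\,G^{\mathsf T})\odot(B^{\mathsf T} d\,B)\bigr]A,\qquad
B^{\mathsf T}=\begin{pmatrix}1&0&-1&0\\0&1&1&0\\0&-1&1&0\\0&-1&0&1\end{pmatrix},\;
G=\begin{pmatrix}1&0&0\\\tfrac12&\tfrac12&\tfrac12\\\tfrac12&-\tfrac12&\tfrac12\\0&0&1\end{pmatrix},\;
A^{\mathsf T}=\begin{pmatrix}1&1&1&0\\0&1&-1&1\end{pmatrix}
```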
@@ -220,6 +220,68 @@ private:
     uint32_t m_tile_size;
 };
+//======================= input int8 compute fp32 output int8 ============
+class ConvBiasImpl::AlgoS8CF32WinogradF23_4x4_NCHW44 final : public AlgoBase {
+public:
+    AlgoS8CF32WinogradF23_4x4_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo,
+                                     uint32_t tile_size)
+            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
+    bool is_reproducible() const override { return true; }
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
+                    m_matmul_algo->name(), {4, 2, m_tile_size},
+                    param::ConvBias::Format::NCHW44);
+        }
+        return m_name.c_str();
+    }
+    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(fallback::ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            fallback::ConvBiasImpl* opr,
+            const NCBKernSizeParam& param) const override;
+    static std::vector<fallback::MatrixMulImpl::Algorithm*>
+    get_avaiable_matmul_algos(const NCBKernSizeParam& param);
+private:
+    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
+    mutable std::string m_name;
+    uint32_t m_tile_size;
+};
+//======================= input int8 compute int16 output int8 ============
+class ConvBiasImpl::AlgoS8WinogradF23_8x8_NCHW44 final : public AlgoBase {
+public:
+    AlgoS8WinogradF23_8x8_NCHW44(fallback::MatrixMulImpl::AlgoBase* matmul_algo,
+                                 uint32_t tile_size)
+            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
+    bool is_reproducible() const override { return true; }
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
+                    m_matmul_algo->name(), {8, 2, m_tile_size},
+                    param::ConvBias::Format::NCHW44);
+        }
+        return m_name.c_str();
+    }
+    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
+                AlgoSelectionStrategy algo_selection_strategy) const override;
+    size_t get_workspace(fallback::ConvBiasImpl*,
+                         const NCBKernSizeParam& param) const override;
+    virtual SmallVector<NCBKern> dispatch_kerns(
+            fallback::ConvBiasImpl* opr,
+            const NCBKernSizeParam& param) const override;
+    static std::vector<fallback::MatrixMulImpl::Algorithm*>
+    get_avaiable_matmul_algos(const NCBKernSizeParam& param);
+private:
+    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
+    mutable std::string m_name;
+    uint32_t m_tile_size;
+};
 } // namespace arm_common
 } // namespace megdnn
@@ -20,6 +20,10 @@ namespace winograd {
 MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8,
                              winograd_2x3_8x8_s8)
+MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, int16_t, int, 2, 3, 8, 8,
+                             winograd_2x3_8x8_s8_nchw44)
+MEGDNN_REG_WINOGRAD_STRATEGY(int8_t, int8_t, float, float, 2, 3, 4, 4,
+                             winograd_2x3_4x4_s8_f32_nchw44)
 }
 } // namespace arm_common
 } // namespace megdnn
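Judging from the existing `winograd_2x3_8x8_s8` registration and the strategy implementations later in this diff, the macro arguments appear to be the source/filter element types, the transform and compute element types, the m = 2 and r = 3 of F(m, r), and the channel pack sizes (8x8 for the int16/MK8 strategy, 4x4 for the float/MK4 one) — hedging here, since the macro definition itself is not part of this diff.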
@@ -0,0 +1,372 @@
+/**
+ * \file dnn/src/arm_common/conv_bias/fp32/strategy_nchw44_2x3_4x4.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "src/arm_common/conv_bias/int8/strategy.h"
+#include "src/arm_common/simd_macro/marm_neon.h"
+#include "src/arm_common/utils.h"
+#include "src/common/unroll_macro.h"
+#include "src/common/utils.h"
+#include "src/fallback/conv_bias/winograd/winograd.h"
+#include "src/arm_common/conv_bias/winograd_common/winograd_common.h"
+#include "src/naive/matrix_mul/matrix_mul_helper.h"
+#include "src/arm_common/elemwise_helper/op_unary.h"
+#include "src/arm_common/conv_bias/fp32/helper.h"
+#include "midout.h"
+MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_s8_comp_fp32_f23)
+using namespace megdnn;
+using namespace arm_common;
+namespace {
+struct InputTransform2X3 {
+    template <bool inner>
+    static void prepare(const int8_t* input, float* patch, float* patchT,
+                        int ih_start, int iw_start, size_t IH, size_t IW,
+                        size_t ic, size_t IC, size_t PH, size_t PW) {
+        megdnn_assert(ic % 4 == 0 && IC % 4 == 0,
+                      "Winograd input prepare param is not a multiple of 4!");
+        constexpr size_t alpha = 2 + 3 - 1;
+        MEGDNN_MARK_USED_VAR(patch);
+        if (inner) {
+            const int8_t* input_ptr =
+                    input + ic * IH * IW + ih_start * IW * 4 + iw_start * 4;
+            for (size_t ico = 0; ico < 4; ++ico) {
+                int8x16_t v_input = vld1q_s8(input_ptr);
+                int16x8_t v_low = vmovl_s8(vget_low_s8(v_input));
+                int16x8_t v_high = vmovl_s8(vget_high_s8(v_input));
+                int32x4_t v_0 = vmovl_s16(vget_low_s16(v_low));
+                int32x4_t v_1 = vmovl_s16(vget_high_s16(v_low));
+                int32x4_t v_2 = vmovl_s16(vget_low_s16(v_high));
+                int32x4_t v_3 = vmovl_s16(vget_high_s16(v_high));
+                vst1q_f32(patchT + ico * 4 * alpha + 0 * 4, vcvtq_f32_s32(v_0));
+                vst1q_f32(patchT + ico * 4 * alpha + 1 * 4, vcvtq_f32_s32(v_1));
+                vst1q_f32(patchT + ico * 4 * alpha + 2 * 4, vcvtq_f32_s32(v_2));
+                vst1q_f32(patchT + ico * 4 * alpha + 3 * 4, vcvtq_f32_s32(v_3));
+                input_ptr += IW * 4;
+            }
+        } else {
+            if (PH > 0 || PW > 0) {
+                memset(patchT, 0, sizeof(float) * 4 * alpha * alpha);
+            }
+            InputGetter<const int8_t*, float32x4_t> getter;
+            const int8_t* input_ptr = input + ic * IH * IW;
+            int ih0_act = std::max<int>(ih_start, 0),
+                ih1_act = std::min<int>(ih_start + alpha, IH),
+                iw0_act = std::max<int>(iw_start, 0),
+                iw1_act = std::min<int>(iw_start + alpha, IW);
+            // partial copy
+            for (int ih = ih0_act; ih < ih1_act; ++ih) {
+                for (int iw = iw0_act; iw < iw1_act; ++iw) {
+                    size_t iho = ih - ih_start, iwo = iw - iw_start;
+                    vst1q_f32(patchT + iho * alpha * 4 + iwo * 4,
+                              getter(input_ptr + ih * IW * 4 + iw * 4));
+                }
+            }
+        }
+    }
+    static void transform(const float* patchT, float* input_transform_buf,
+                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
+                          size_t IC) {
+        constexpr size_t alpha = 2 + 3 - 1;
+        // BT * d * B
+#define cb(m, n)                                                 \
+    Vector<float, 4> d##m##n =                                   \
+            Vector<float, 4>::load(patchT + m * 4 * 4 + n * 4);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        //!  1   0  -1   0    d00 d01 d02 d03     1  0  0  0
+        //!  0   1   1   0    d10 d11 d12 d13     0  1 -1 -1
+        //!  0  -1   1   0    d20 d21 d22 d23    -1  1  1  0
+        //!  0  -1   0   1    d30 d31 d32 d33     0  0  0  1
+#define cb(m)                    \
+    auto t0##m = d0##m - d2##m;  \
+    auto t1##m = d1##m + d2##m;  \
+    auto t2##m = d2##m - d1##m;  \
+    auto t3##m = d3##m - d1##m;
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+#define cb(m)                       \
+    d##m##0 = t##m##0 - t##m##2;    \
+    d##m##1 = t##m##1 + t##m##2;    \
+    d##m##2 = t##m##2 - t##m##1;    \
+    d##m##3 = t##m##3 - t##m##1;
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+        size_t ICB = IC / 4;
+        size_t icb = ic / 4;
+#define cb(m, n)                                                 \
+    d##m##n.save(input_transform_buf +                           \
+                 (m * alpha + n) * ICB * nr_units_in_tile * 4 +  \
+                 icb * nr_units_in_tile * 4 + unit_idx * 4);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+    }
+};
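The macro blocks above are just the F(2, 3) input transform applied lane-wise, one NCHW44 channel per SIMD lane. A minimal scalar sketch of the same arithmetic for a single channel, assuming a plain `float d[4][4]` patch (illustration only, not part of the diff):

```cpp
// t = B^T * d done column-wise, then out = t * B done row-wise,
// mirroring the two cb() macro passes above.
void input_transform_f23_ref(const float d[4][4], float out[4][4]) {
    float t[4][4];
    for (int n = 0; n < 4; ++n) {  // B^T = [[1,0,-1,0],[0,1,1,0],[0,-1,1,0],[0,-1,0,1]]
        t[0][n] = d[0][n] - d[2][n];
        t[1][n] = d[1][n] + d[2][n];
        t[2][n] = d[2][n] - d[1][n];
        t[3][n] = d[3][n] - d[1][n];
    }
    for (int m = 0; m < 4; ++m) {  // the same combination along each row
        out[m][0] = t[m][0] - t[m][2];
        out[m][1] = t[m][1] + t[m][2];
        out[m][2] = t[m][2] - t[m][1];
        out[m][3] = t[m][3] - t[m][1];
    }
}
```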
+template <BiasMode bmode, typename Op>
+struct OutputTransform2X3 {
+    static void transform(const float* output_transform_buf, const float* bias,
+                          int8_t* output, float* transform_mid_buf,
+                          size_t oh_start, size_t ow_start, size_t OH,
+                          size_t OW, size_t oc_start, size_t oc_end,
+                          size_t oc_index, size_t unit_idx,
+                          size_t nr_units_in_tile, const DType& src_dtype,
+                          const DType& filter_dtype, const DType& dst_dtype) {
+        float scale_filter = 0.f;
+        MEGDNN_MARK_USED_VAR(transform_mid_buf);
+        if (filter_dtype.enumv() == DTypeEnum::QuantizedS8) {
+            scale_filter = filter_dtype.param<dtype::QuantizedS8>().scale;
+        } else {
+            megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS32);
+            scale_filter = filter_dtype.param<dtype::QuantizedS32>().scale;
+        }
+        float input_filter_scale =
+                src_dtype.param<dtype::QuantizedS8>().scale * scale_filter;
+        DType buffer_dtype = dtype::QuantizedS32(input_filter_scale);
+        Op op(buffer_dtype, dst_dtype);
+        //! AT * m * A
+        constexpr size_t alpha = 2 + 3 - 1;
+        size_t oc = oc_start + oc_index;
+        size_t OCB = (oc_end - oc_start) / 4;
+        size_t ocb = oc_index / 4;
+#define cb(m, n)                                                \
+    auto v##m##n = Vector<float, 4>::load(                      \
+            output_transform_buf +                              \
+            (m * alpha + n) * OCB * nr_units_in_tile * 4 +      \
+            ocb * nr_units_in_tile * 4 + unit_idx * 4);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        //! 1  1  1  0    v00 v01 v02 v03    1  0
+        //! 0  1 -1  1    v10 v11 v12 v13    1  1
+        //!               v20 v21 v22 v23    1 -1
+        //!               v30 v31 v32 v33    0  1
+#define cb(m)                              \
+    auto t0##m = v0##m + v1##m + v2##m;    \
+    auto t1##m = v1##m - v2##m + v3##m;
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+        Vector<float, 4> result[2][2];
+        result[0][0] = t00 + t01 + t02;
+        result[1][0] = t10 + t11 + t12;
+        result[0][1] = t01 - t02 + t03;
+        result[1][1] = t11 - t12 + t13;
+        const int32_t* tmp_bias =
+                static_cast<const int32_t*>(static_cast<const void*>(bias));
+        Vector<float, 4> vbias;
+        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+            const float32x4_t vvbias = vcvtq_f32_s32(vld1q_s32(tmp_bias + oc));
+            vbias = Vector<float, 4>(vvbias);
+            result[0][0] += vbias;
+            result[0][1] += vbias;
+            result[1][0] += vbias;
+            result[1][1] += vbias;
+        }
+#if MEGDNN_AARCH64
+        int32_t* tmp_output = static_cast<int32_t*>(static_cast<void*>(output));
+#endif
+        for (size_t oho = 0; oho < 2 && oh_start + oho < OH; ++oho) {
+            for (size_t owo = 0; owo < 2 && ow_start + owo < OW; ++owo) {
+                size_t oh = oh_start + oho;
+                size_t ow = ow_start + owo;
+                Vector<float, 4> res;
+                res = result[oho][owo];
+                if (bmode == BiasMode::BIAS) {
+                    const float32x4_t vvbias = vcvtq_f32_s32(vld1q_s32(
+                            tmp_bias + oc * OH * OW + oh * OW * 4 + ow * 4));
+                    res += Vector<float, 4>(vvbias);
+                }
+#if MEGDNN_AARCH64
+                int8x8_t v_res = op(res.value);
+                tmp_output[oc * OH * OW / 4 + oh * OW + ow] =
+                        vget_lane_s32(vreinterpret_s32_s8(v_res), 0);
+#else
+                //! on armv7 the NEON path gives wrong results here, so fall
+                //! back to scalar compute
+                dt_qint8 res_int8 = dt_qint8(0);
+#define cb(i)                                                        \
+    res_int8 = op(dt_qint32(vgetq_lane_f32(res.value, i)));          \
+    output[oc * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8();
+                UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+#endif
+            }
+        }
+    }
+};
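Again, the vector arithmetic is the plain F(2, 3) output reduction per lane. A scalar sketch, assuming a `float m[4][4]` accumulator tile (illustration only; bias add and requantization are omitted):

```cpp
// y = A^T * m * A with A^T = [[1,1,1,0],[0,1,-1,1]], matching the
// t##m/result macros above.
void output_transform_f23_ref(const float m[4][4], float y[2][2]) {
    float t[2][4];
    for (int n = 0; n < 4; ++n) {
        t[0][n] = m[0][n] + m[1][n] + m[2][n];
        t[1][n] = m[1][n] - m[2][n] + m[3][n];
    }
    for (int r = 0; r < 2; ++r) {
        y[r][0] = t[r][0] + t[r][1] + t[r][2];
        y[r][1] = t[r][1] - t[r][2] + t[r][3];
    }
}
```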
+} // namespace
+namespace megdnn {
+namespace arm_common {
+namespace winograd {
+MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_4x4_s8_f32_nchw44)
+void winograd_2x3_4x4_s8_f32_nchw44::filter(const int8_t* filter,
+                                            float* filter_transform_buf,
+                                            float* transform_mid_buf, size_t OC,
+                                            size_t IC, size_t oc_start,
+                                            size_t oc_end) {
+    constexpr int alpha = 2 + 3 - 1;
+    /**
+     * origin: (4x3) * (3 x 3) * (3 x 4)
+     */
+    //! 1    0    0    v00 v01 v02    1 0.5  0.5 0
+    //! 0.5  0.5  0.5  v10 v11 v12    0 0.5 -0.5 0
+    //! 0.5 -0.5  0.5  v20 v21 v22    0 0.5  0.5 1
+    //! 0    0    1
+    InputGetter<const int8_t*, float32x4_t> getter;
+    MEGDNN_MARK_USED_VAR(transform_mid_buf);
+    megdnn_assert((oc_end - oc_start) % 4 == 0 && oc_start % 4 == 0 &&
+                          oc_end % 4 == 0 && IC % 4 == 0 && OC % 4 == 0,
+                  "Winograd filter transform input param is not a multiple of 4!");
+    size_t OCB = OC / 4;
+    size_t ICB = IC / 4;
+    for (size_t ocb = oc_start / 4; ocb < oc_end / 4; ocb++) {
+        for (size_t icb = 0; icb < ICB; icb++) {
+            for (size_t ic_inner = 0; ic_inner < 4; ic_inner++) {
+                const int8_t* fptr = filter + (ocb * ICB + icb) * 3 * 3 * 4 * 4 +
+                                     ic_inner * 4;
+#define cb(m, n)                                                  \
+    Vector<float, 4> g##m##n =                                    \
+            Vector<float, 4>(getter(fptr + (m * 3 + n) * 4 * 4));
+                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
+#undef cb
+#define FILTER_TRANSFORM(n, wd, g)       \
+    auto wd##n##0 = g##0##n;             \
+    tmp0 = (g##0##n + g##2##n) * 0.5;    \
+    tmp1 = g##1##n * 0.5;                \
+    auto wd##n##1 = tmp0 + tmp1;         \
+    auto wd##n##2 = tmp0 - tmp1;         \
+    auto wd##n##3 = g##2##n;
+                Vector<float, 4> tmp0, tmp1;
+                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
+                UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
+#undef FILTER_TRANSFORM
+#define cb(m, n)                                                             \
+    ret##m##n.save(filter_transform_buf +                                    \
+                   (m * alpha + n) * OCB * ICB * 4 * 4 + ocb * ICB * 4 * 4 + \
+                   icb * 4 * 4 + ic_inner * 4);
+                UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+            }
+        }
+    }
+}
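The `FILTER_TRANSFORM` macro runs the same linear map twice, first over the columns of `g` and then over the columns of the intermediate, which amounts to `G * g * G^T`. A scalar sketch for one channel, assuming `float g[3][3]` (illustration only):

```cpp
// G = [[1,0,0],[0.5,0.5,0.5],[0.5,-0.5,0.5],[0,0,1]]; wd[n] holds G applied
// to column n of g, and repeating the map on wd yields ret = G * g * G^T.
void filter_transform_f23_ref(const float g[3][3], float ret[4][4]) {
    float wd[3][4];
    for (int n = 0; n < 3; ++n) {
        float t0 = (g[0][n] + g[2][n]) * 0.5f, t1 = g[1][n] * 0.5f;
        wd[n][0] = g[0][n];
        wd[n][1] = t0 + t1;
        wd[n][2] = t0 - t1;
        wd[n][3] = g[2][n];
    }
    for (int n = 0; n < 4; ++n) {
        float t0 = (wd[0][n] + wd[2][n]) * 0.5f, t1 = wd[1][n] * 0.5f;
        ret[n][0] = wd[0][n];
        ret[n][1] = t0 + t1;
        ret[n][2] = t0 - t1;
        ret[n][3] = wd[2][n];
    }
}
```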
+void winograd_2x3_4x4_s8_f32_nchw44::input(const int8_t* input,
+                                           float* input_transform_buf,
+                                           float* transform_mid_buf, size_t IH,
+                                           size_t IW, size_t IC, size_t PH,
+                                           size_t PW, size_t unit_start_idx,
+                                           size_t nr_units_in_tile) {
+    megdnn_assert(IC % 4 == 0);
+    constexpr int alpha = 3 + 2 - 1;
+    auto units_w =
+            div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
+    float* patch = transform_mid_buf;
+    float* patchT = transform_mid_buf + 4 * alpha * alpha;
+    for (size_t ic = 0; ic < IC; ic += 4) {
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            size_t nh = index / units_w;
+            size_t nw = index % units_w;
+            int ih_start = nh * OUTPUT_BLOCK_SIZE - PH;
+            int iw_start = nw * OUTPUT_BLOCK_SIZE - PW;
+            if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) &&
+                iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) {
+                InputTransform2X3::prepare<true>(input, patch, patchT, ih_start,
+                                                 iw_start, IH, IW, ic, IC, PH,
+                                                 PW);
+                InputTransform2X3::transform(patchT, input_transform_buf,
+                                             unit_idx, nr_units_in_tile, ic,
+                                             IC);
+            } else {
+                InputTransform2X3::prepare<false>(input, patch, patchT,
+                                                  ih_start, iw_start, IH, IW,
+                                                  ic, IC, PH, PW);
+                InputTransform2X3::transform(patchT, input_transform_buf,
+                                             unit_idx, nr_units_in_tile, ic,
+                                             IC);
+            }
+        }
+    }
+}
+void winograd_2x3_4x4_s8_f32_nchw44::output(const float* output_transform_buf,
+                                            const float* bias, int8_t* output,
+                                            float* transform_mid_buf,
+                                            BiasMode bmode,
+                                            NonlineMode nonline_mode, size_t OH,
+                                            size_t OW, size_t oc_start,
+                                            size_t oc_end, size_t unit_start_idx,
+                                            size_t nr_units_in_tile) {
+#define cb(_bmode, _nonline_op, ...)                                         \
+    OutputTransform2X3<_bmode MEGDNN_COMMA _nonline_op>::transform(__VA_ARGS__);
+    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
+    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
+        size_t oc_index = oc - oc_start;
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            auto nh = index / units_w;
+            auto nw = index % units_w;
+            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
+            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
+            DISPATCH_CONV_WINOGRAD_BIAS_QUANTIZED(
+                    megdnn_arm_common_winograd_nchw44_s8_comp_fp32_f23, cb,
+                    dt_qint32, dt_qint8, bmode, nonline_mode,
+                    output_transform_buf, bias, output, transform_mid_buf,
+                    oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index,
+                    unit_idx, nr_units_in_tile, src_dtype, filter_dtype,
+                    dst_dtype);
+        }
+    }
+#undef cb
+}
+} // namespace winograd
+} // namespace arm_common
+} // namespace megdnn
+// vim: syntax=cpp.doxygen
@@ -0,0 +1,404 @@
+/**
+ * \file dnn/src/arm_common/conv_bias/int8/strategy_nchw44_2x3_8x8.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+#include "src/fallback/conv_bias/winograd/winograd.h"
+#include "src/naive/matrix_mul/matrix_mul_helper.h"
+#include "src/arm_common/conv_bias/winograd_common/winograd_common.h"
+#include "src/arm_common/elemwise_helper/op_unary.h"
+#include "src/arm_common/conv_bias/int8/strategy.h"
+#include "src/arm_common/conv_bias/int8/helper.h"
+#include "src/arm_common/simd_macro/marm_neon.h"
+#include "src/arm_common/utils.h"
+#include "src/common/winograd/winograd_generator.h"
+#include "src/common/unroll_macro.h"
+#include "src/common/utils.h"
+#include "midout.h"
+MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_s8_int16_8x8)
+using namespace megdnn;
+using namespace arm_common;
+namespace {
+struct FilterTransform2X3_qs8 {
+    static void transform(const int8_t* filter_ptr, int16_t* filter_transform_buf,
+                          int16_t* transform_mid_buf, size_t OC, size_t IC,
+                          size_t oc_start, size_t oc_end) {
+        constexpr int alpha = 2 + 3 - 1;
+        /**
+         * origin: (4x3) * (3 x 3) * (3 x 4)
+         */
+        //! 1    0    0    v00 v01 v02    1 0.5  0.5 0
+        //! 0.5  0.5  0.5  v10 v11 v12    0 0.5 -0.5 0
+        //! 0.5 -0.5  0.5  v20 v21 v22    0 0.5  0.5 1
+        //! 0    0    1
+        //! 2    0    0    v00 v01 v02    2 1  1 0
+        //! 1    1    1    v10 v11 v12    0 1 -1 0
+        //! 1   -1    1    v20 v21 v22    0 1  1 2
+        //! 0    0    2
+        //! G * g * GT
+        InputGetter<const int8_t*, int16x4_t> getter;
+        MEGDNN_MARK_USED_VAR(transform_mid_buf);
+        megdnn_assert(
+                (oc_end - oc_start) % 4 == 0 && oc_start % 4 == 0 &&
+                        oc_end % 4 == 0 && IC % 8 == 0 && OC % 8 == 0,
+                "Winograd filter transform input param is not a multiple of 8!");
+        size_t OCB = OC / 8;
+        size_t ICB = IC / 8;
+        size_t ICB4 = IC / 4;
+        for (size_t ocb = oc_start / 4; ocb < oc_end / 4; ocb++) {
+            size_t tmp_ocb = ocb / 2;
+            size_t index = ((ocb & 1) == 0) ? 0 : 1;
+            for (size_t icb = 0; icb < ICB4; icb++) {
+                for (size_t ic_inner = 0; ic_inner < 4; ic_inner++) {
+                    const int8_t* fptr = filter_ptr +
+                                         (ocb * ICB4 + icb) * 3 * 3 * 4 * 4 +
+                                         ic_inner * 4;
+#define cb(m, n)                                                     \
+    Vector<int16_t, 4> g##m##n =                                     \
+            Vector<int16_t, 4>(getter(fptr + (m * 3 + n) * 4 * 4));
+                    UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
+#undef cb
+#define FILTER_TRANSFORM(n, wd, g)     \
+    auto wd##n##0 = g##0##n * 2;       \
+    v_tmp = g##0##n + g##2##n;         \
+    auto wd##n##1 = v_tmp + g##1##n;   \
+    auto wd##n##2 = v_tmp - g##1##n;   \
+    auto wd##n##3 = g##2##n * 2;
+                    Vector<int16_t, 4> v_tmp;
+                    UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
+                    UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
+#undef FILTER_TRANSFORM
+#define cb(m, n)                                                           \
+    ret##m##n.save(                                                        \
+            filter_transform_buf + (m * alpha + n) * OCB * ICB * 8 * 8 +   \
+            tmp_ocb * ICB * 8 * 8 + icb * 4 * 8 + ic_inner * 8 + index * 4);
+                    UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+                }
+            }
+        }
+    }
+};
+struct InputTransform2X3_qs8 {
+    template <bool inner>
+    static void prepare(const int8_t* input, int16_t* patch, int16_t* patchT,
+                        int ih_start, int iw_start, size_t IH, size_t IW,
+                        size_t ic, size_t IC, size_t PH, size_t PW) {
+        megdnn_assert(ic % 8 == 0 && IC % 8 == 0,
+                      "Winograd input prepare param is not a multiple of 8!");
+        MEGDNN_MARK_USED_VAR(patch);
+        constexpr size_t alpha = 2 + 3 - 1;
+        if (inner) {
+            const int8_t* input_ptr =
+                    input + ic * IH * IW + ih_start * IW * 4 + iw_start * 4;
+            for (size_t ico = 0; ico < alpha; ++ico) {
+                int8x16_t v_input0 = vld1q_s8(input_ptr);                // c0123
+                int8x16_t v_input1 = vld1q_s8(input_ptr + IH * IW * 4);  // c4567
+                int32x4_t v32_00 = vreinterpretq_s32_s8(v_input0);
+                int32x4_t v32_01 = vreinterpretq_s32_s8(v_input1);
+                int32x4x2_t v_trn = vtrnq_s32(v32_00, v32_01);  // c01234567
+                v_input0 = vreinterpretq_s8_s32(v_trn.val[0]);
+                v_input1 = vreinterpretq_s8_s32(v_trn.val[1]);
+                int16x8_t v0_low = vmovl_s8(vget_low_s8(v_input0));
+                int16x8_t v0_high = vmovl_s8(vget_high_s8(v_input0));
+                int16x8_t v1_low = vmovl_s8(vget_low_s8(v_input1));
+                int16x8_t v1_high = vmovl_s8(vget_high_s8(v_input1));
+                vst1q_s16(patchT + ico * 8 * alpha + 0 * 8, v0_low);
+                vst1q_s16(patchT + ico * 8 * alpha + 1 * 8, v1_low);
+                vst1q_s16(patchT + ico * 8 * alpha + 2 * 8, v0_high);
+                vst1q_s16(patchT + ico * 8 * alpha + 3 * 8, v1_high);
+                input_ptr += IW * 4;  // next row
+            }
+        } else {
+            if (PH || PW) {
+                memset(patchT, 0, sizeof(int16_t) * 8 * alpha * alpha);
+            }
+            InputGetter<const int8_t*, int16x4_t> getter;
+            const int8_t* input_ptr = input + ic * IH * IW;
+            int ih0_act = std::max<int>(ih_start, 0),
+                ih1_act = std::min<int>(ih_start + alpha, IH),
+                iw0_act = std::max<int>(iw_start, 0),
+                iw1_act = std::min<int>(iw_start + alpha, IW);
+            // partial copy
+            for (int ih = ih0_act; ih < ih1_act; ++ih) {
+                for (int iw = iw0_act; iw < iw1_act; ++iw) {
+                    size_t iho = ih - ih_start, iwo = iw - iw_start;
+                    vst1q_s16(patchT + iho * alpha * 8 + iwo * 8,
+                              vcombine_s16(
+                                      getter(input_ptr + ih * IW * 4 + iw * 4),
+                                      getter(input_ptr + IH * IW * 4 +
+                                             ih * IW * 4 + iw * 4)));
+                }
+            }
+        }
+    }
+    static void transform(const int16_t* patchT, int16_t* input_transform_buf,
+                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
+                          size_t IC) {
+        constexpr size_t alpha = 2 + 3 - 1;
+        // BT * d * B
+#define cb(m, n)                                                   \
+    Vector<int16_t, 8> d##m##n =                                   \
+            Vector<int16_t, 8>::load(patchT + m * 4 * 8 + n * 8);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        //!  1   0  -1   0    d00 d01 d02 d03     1  0  0  0
+        //!  0   1   1   0    d10 d11 d12 d13     0  1 -1 -1
+        //!  0  -1   1   0    d20 d21 d22 d23    -1  1  1  0
+        //!  0  -1   0   1    d30 d31 d32 d33     0  0  0  1
+#define cb(m)                    \
+    auto t0##m = d0##m - d2##m;  \
+    auto t1##m = d1##m + d2##m;  \
+    auto t2##m = d2##m - d1##m;  \
+    auto t3##m = d3##m - d1##m;
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+#define cb(m)                       \
+    d##m##0 = t##m##0 - t##m##2;    \
+    d##m##1 = t##m##1 + t##m##2;    \
+    d##m##2 = t##m##2 - t##m##1;    \
+    d##m##3 = t##m##3 - t##m##1;
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+        size_t ICB = IC / 8;
+        size_t icb = ic / 8;
+#define cb(m, n)                                                 \
+    d##m##n.save(input_transform_buf +                           \
+                 (m * alpha + n) * ICB * nr_units_in_tile * 8 +  \
+                 icb * nr_units_in_tile * 8 + unit_idx * 8);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
+#undef cb
+    }
+};
+template <BiasMode bmode, typename Op>
+struct OutputTransform2X3_qs8 {
+    static void transform(const int32_t* output_transform_buf,
+                          const int32_t* bias, int8_t* output,
+                          int32_t* transform_mid_buf, size_t oh_start,
+                          size_t ow_start, size_t OH, size_t OW,
+                          size_t oc_start, size_t oc_end, size_t oc_index,
+                          size_t unit_idx, size_t nr_units_in_tile,
+                          const DType& src_dtype, const DType& filter_dtype,
+                          const DType& dst_dtype) {
+        MEGDNN_MARK_USED_VAR(transform_mid_buf);
+        float scale_filter = 0.f;
+        if (filter_dtype.enumv() == DTypeEnum::QuantizedS8) {
+            scale_filter = filter_dtype.param<dtype::QuantizedS8>().scale;
+        } else {
+            megdnn_assert(filter_dtype.enumv() == DTypeEnum::QuantizedS16);
+            scale_filter = filter_dtype.param<dtype::QuantizedS16>().scale;
+        }
+        float input_filter_scale =
+                src_dtype.param<dtype::QuantizedS8>().scale * scale_filter;
+        DType buffer_dtype = dtype::QuantizedS32(input_filter_scale * 0.5f *
+                                                 0.5f * 1.0f * 1.0f);
+        Op op(buffer_dtype, dst_dtype);
+        //! AT * m * A
+        constexpr size_t alpha = 2 + 3 - 1;
+        size_t oc = oc_start + oc_index;
+        size_t OCB = (oc_end - oc_start) / 8;
+        size_t ocb = oc_index / 8;
+#define cb(m, n)                                                \
+    auto v##m##n = Vector<int32_t, 8>::load(                    \
+            output_transform_buf +                              \
+            (m * alpha + n) * OCB * nr_units_in_tile * 8 +      \
+            ocb * nr_units_in_tile * 8 + unit_idx * 8);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        //! 1  1  1  0    v00 v01 v02 v03    1  0
+        //! 0  1 -1  1    v10 v11 v12 v13    1  1
+        //!               v20 v21 v22 v23    1 -1
+        //!               v30 v31 v32 v33    0  1
+#define cb(m)                              \
+    auto t0##m = v0##m + v1##m + v2##m;    \
+    auto t1##m = v1##m - v2##m + v3##m;
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+        Vector<int32_t, 8> result[2][2];
+        result[0][0] = t00 + t01 + t02;
+        result[1][0] = t10 + t11 + t12;
+        result[0][1] = t01 - t02 + t03;
+        result[1][1] = t11 - t12 + t13;
+        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+            Vector<int32_t, 8> vbias;
+            vbias = Vector<int32_t, 8>::load(bias + oc) * 4;
+            result[0][0] += vbias;
+            result[0][1] += vbias;
+            result[1][0] += vbias;
+            result[1][1] += vbias;
+        }
+#if MEGDNN_AARCH64
+        int32_t* tmp_output = static_cast<int32_t*>(static_cast<void*>(output));
+#endif
+        for (size_t oho = 0; oho < 2 && oh_start + oho < OH; ++oho) {
+            for (size_t owo = 0; owo < 2 && ow_start + owo < OW; ++owo) {
+                size_t oh = oh_start + oho;
+                size_t ow = ow_start + owo;
+                Vector<int32_t, 8> res = result[oho][owo];
+                if (bmode == BiasMode::BIAS) {
+                    int32x4x2_t vbias;
+                    vbias.val[0] = vld1q_s32(bias + oc * OH * OW + oh * OW * 4 +
+                                             ow * 4);
+                    vbias.val[1] = vld1q_s32(bias + (oc + 4) * OH * OW +
+                                             oh * OW * 4 + ow * 4);
+                    res += Vector<int32_t, 8>(vbias) * 4;
+                }
+#if MEGDNN_AARCH64
+                int8x8_t res_int8 = op(res.value);
+                int32x2_t res32 = vreinterpret_s32_s8(res_int8);
+                tmp_output[oc / 4 * OH * OW + oh * OW + ow] =
+                        vget_lane_s32(res32, 0);
+                tmp_output[(oc / 4 + 1) * OH * OW + oh * OW + ow] =
+                        vget_lane_s32(res32, 1);
+#else
+                dt_qint8 res_int8 = dt_qint8(0);
+#define cb(i)                                                                  \
+    res_int8 = op(dt_qint32(vgetq_lane_s32(res.value.val[0], i)));             \
+    output[oc * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8();      \
+    res_int8 = op(dt_qint32(vgetq_lane_s32(res.value.val[1], i)));             \
+    output[(oc + 4) * OH * OW + oh * OW * 4 + ow * 4 + i] = res_int8.as_int8();
+                UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+#endif
+            }
+        }
+    }
+};
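Note where the factors of 4 in this struct come from. The integer filter transform above uses `2G` instead of `G` so the 0.5 entries stay integral; since both sides of the filter get the factor, every accumulator carries 4x the true value:

```latex
(2G)\,g\,(2G)^{\mathsf T} = 4\,G\,g\,G^{\mathsf T}
\quad\Longrightarrow\quad
s_{\text{buffer}} = s_{\text{src}}\cdot s_{\text{filter}}\cdot 0.5\cdot 0.5
```

That is the `0.5f * 0.5f` in `buffer_dtype`; and since the bias never passes through the filter transform, it is pre-multiplied by 4 (`vbias * 4` and `res += ... * 4`) to land on the same scale before requantization.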
+} // namespace
+namespace megdnn {
+namespace arm_common {
+namespace winograd {
+MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_2x3_8x8_s8_nchw44)
+void winograd_2x3_8x8_s8_nchw44::filter(const int8_t* filter,
+                                        int16_t* filter_transform_buf,
+                                        int16_t* transform_mid_buf, size_t OC,
+                                        size_t IC, size_t oc_start,
+                                        size_t oc_end) {
+    FilterTransform2X3_qs8::transform(filter, filter_transform_buf,
+                                      transform_mid_buf, OC, IC, oc_start,
+                                      oc_end);
+}
+void winograd_2x3_8x8_s8_nchw44::input(const int8_t* input,
+                                       int16_t* input_transform_buf,
+                                       int16_t* transform_mid_buf, size_t IH,
+                                       size_t IW, size_t IC, size_t PH,
+                                       size_t PW, size_t unit_start_idx,
+                                       size_t nr_units_in_tile) {
+    megdnn_assert(IC % 8 == 0);
+    constexpr int alpha = 3 + 2 - 1;
+    auto units_w =
+            div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
+    int16_t* patch = transform_mid_buf;
+    int16_t* patchT = transform_mid_buf;  // + 8 * alpha * alpha;
+    for (size_t ic = 0; ic < IC; ic += 8) {
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            size_t nh = index / units_w;
+            size_t nw = index % units_w;
+            int ih_start = nh * OUTPUT_BLOCK_SIZE - PH;
+            int iw_start = nw * OUTPUT_BLOCK_SIZE - PW;
+            if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) &&
+                iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) {
+                InputTransform2X3_qs8::prepare<true>(input, patch, patchT,
+                                                     ih_start, iw_start, IH, IW,
+                                                     ic, IC, PH, PW);
+                InputTransform2X3_qs8::transform(patchT, input_transform_buf,
+                                                 unit_idx, nr_units_in_tile, ic,
+                                                 IC);
+            } else {
+                InputTransform2X3_qs8::prepare<false>(input, patch, patchT,
+                                                      ih_start, iw_start, IH,
+                                                      IW, ic, IC, PH, PW);
+                InputTransform2X3_qs8::transform(patchT, input_transform_buf,
+                                                 unit_idx, nr_units_in_tile, ic,
+                                                 IC);
+            }
+        }
+    }
+}
+void winograd_2x3_8x8_s8_nchw44::output(const int* output_transform_buf,
+                                        const int* bias, int8_t* output,
+                                        int* transform_mid_buf, BiasMode bmode,
+                                        NonlineMode nonline_mode, size_t OH,
+                                        size_t OW, size_t oc_start,
+                                        size_t oc_end, size_t unit_start_idx,
+                                        size_t nr_units_in_tile) {
+#define cb(_bmode, _nonline_op, ...)                                    \
+    OutputTransform2X3_qs8<_bmode MEGDNN_COMMA _nonline_op>::transform( \
+            __VA_ARGS__);
+    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
+    for (size_t oc = oc_start; oc < oc_end; oc += 8) {
+        size_t oc_index = oc - oc_start;
+        rep(unit_idx, nr_units_in_tile) {
+            size_t index = unit_start_idx + unit_idx;
+            auto nh = index / units_w;
+            auto nw = index % units_w;
+            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
+            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
+            DISPATCH_CONV_WINOGRAD_BIAS_QUANTIZED(
+                    megdnn_arm_common_winograd_nchw44_s8_int16_8x8, cb,
+                    dt_qint32, dt_qint8, bmode, nonline_mode,
+                    output_transform_buf, bias, output, transform_mid_buf,
+                    oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index,
+                    unit_idx, nr_units_in_tile, src_dtype, filter_dtype,
+                    dst_dtype);
+        }
+    }
+#undef cb
+}
+} // namespace winograd
+} // namespace arm_common
+} // namespace megdnn
+// vim: syntax=cpp.doxygen

@@ -201,6 +201,14 @@ public:
                     static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
                     tile_size));
             winograd_algos.emplace_back(refhold.back().get());
+            refhold.emplace_back(new AlgoS8CF32WinogradF23_4x4_NCHW44(
+                    static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
+                    tile_size));
+            winograd_algos.emplace_back(refhold.back().get());
+            refhold.emplace_back(new AlgoS8WinogradF23_8x8_NCHW44(
+                    static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
+                    tile_size));
+            winograd_algos.emplace_back(refhold.back().get());
         }
     }
 }
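With this change, each candidate matmul algorithm and tile size also yields one instance of each of the two new NCHW44 winograd algorithms, appended to `winograd_algos` next to the existing ones; their `usable` checks decide at selection time which actually applies.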
@@ -79,6 +79,8 @@ private:
     class AlgoI8x8x16Stride2;
     class AlgoI8x8x16Stride2Filter2;
     class AlgoS8WinogradF23_8x8;
+    class AlgoS8CF32WinogradF23_4x4_NCHW44;
+    class AlgoS8WinogradF23_8x8_NCHW44;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     class AlgoF16Direct;
     class AlgoF16DirectStride1;

@@ -37,6 +37,13 @@ struct InputGetter<const uint8_t*, uint16x4_t> {
         return vget_low_u16(vmovl_u8(vld1_u8(ptr))) - zp;
     }
 };
+template <>
+struct InputGetter<const int8_t*, float32x4_t> {
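+    //! note: vld1_s8 below loads 8 bytes, but only the first 4 int8 values
+    //! are widened and converted to float32x4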
+    float32x4_t operator()(const int8_t* ptr) {
+        return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vld1_s8(ptr)))));
+    }
+};
 } // namespace
 } // namespace arm_common
 } // namespace megdnn

@@ -189,6 +189,11 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8> {
         vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
         return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
     }
+    int8x8_t operator()(const float32x4_t& src) const {
+        auto vitem0 = vmulq_f32(src, this->vscale);
+        vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
+        return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
+    }
 };
 #else
 template <>

@@ -215,12 +220,25 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>,
         return vqmovn_s16(vcombine_s16(vqmovn_s32(vrshlq_s32(vitem0, vshift)),
                                        vqmovn_s32(vrshlq_s32(vitem1, vshift))));
     }
+    int8x8_t operator()(const float32x4_t& vsrc) const {
+        int32x4_t vitem0 = vqrdmulhq_s32(vcvtq_s32_f32(vsrc), vmultiplier);
+        vitem0 = vmaxq_s32(vitem0, QConverterBase::vzero());
+        vitem0 = vrshlq_s32(vitem0, vshift);
+        int16x4_t vitem = vqmovn_s32(vitem0);
+        return vqmovn_s16(vcombine_s16(vitem, vitem));
+    }
     void operator()(const int32x4_t& src, dt_qint8* dst) const {
         auto vitem0 = vmulq_f32(vcvtq_f32_s32(src), this->vscale);
         vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
         auto result = QConverter::convert<int8x8_t, float32x4_t>(vitem0);
         vst1_lane_s32(reinterpret_cast<int32_t*>(dst), (int32x2_t)result, 0);
     }
+    void operator()(const float32x4_t& src, dt_qint8* dst) const {
+        auto vitem0 = vmulq_f32(src, this->vscale);
+        vitem0 = vmaxq_f32(vitem0, QConverterBase::vfzero());
+        auto result = QConverter::convert<int8x8_t, float32x4_t>(vitem0);
+        vst1_lane_s32(reinterpret_cast<int32_t*>(dst), (int32x2_t)result, 0);
+    }
 };
 #endif

@@ -50,6 +50,10 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
         auto vitem0 = vmulq_f32(vcvtq_f32_s32(src), this->vscale);
         return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
     }
+    int8x8_t operator()(const float32x4_t& src) const {
+        auto vitem0 = vmulq_f32(src, this->vscale);
+        return QConverter::convert<int8x8_t, float32x4_t>(vitem0);
+    }
 };
 #else
 template <>

@@ -95,6 +99,13 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8>,
         int16x4_t vres0_int16 = vqmovn_s32(vrshlq_s32(vitem0, vshift));
         return vqmovn_s16(vcombine_s16(vres0_int16, vres0_int16));
     }
+    int8x8_t operator()(const float32x4_t& src) const {
+        int32x4_t vitem0 = vqrdmulhq_s32(vcvtq_s32_f32(src), vmultiplier);
+        auto fixup0 = vshrq_n_s32(vitem0, 31);
+        vitem0 = vqaddq_s32(vitem0, fixup0);
+        int16x4_t vres0_int16 = vqmovn_s32(vrshlq_s32(vitem0, vshift));
+        return vqmovn_s16(vcombine_s16(vres0_int16, vres0_int16));
+    }
 };
 #endif

@@ -369,6 +369,72 @@ struct Vector<int16_t, 8> {
     }
 };
+template <>
+struct Vector<int16_t, 4> {
+    int16x4_t value;
+    Vector() {}
+    Vector(const int16_t v) { value = vdup_n_s16(v); }
+    Vector(const Vector& lr) { value = lr.value; }
+    Vector(const Vector&& lr) { value = std::move(lr.value); }
+    Vector(const int16x4_t& v) { value = v; }
+    static Vector load(const int16_t* addr) {
+        Vector v;
+        v.value = vld1_s16(addr);
+        return v;
+    }
+    static void save(int16_t* addr, const Vector& v) {
+        vst1_s16(addr, v.value);
+    }
+    void save(int16_t* addr) { save(addr, *this); }
+    Vector operator+(const Vector& lr) {
+        Vector dst;
+        dst.value = vadd_s16(value, lr.value);
+        return dst;
+    }
+    Vector& operator+=(const Vector& lr) {
+        value = vadd_s16(value, lr.value);
+        return *this;
+    }
+    Vector operator-(const Vector& lr) {
+        Vector dst;
+        dst.value = vsub_s16(value, lr.value);
+        return dst;
+    }
+    Vector& operator-=(const Vector& lr) {
+        value = vsub_s16(value, lr.value);
+        return *this;
+    }
+    Vector operator*(int16_t lr) {
+        Vector dst;
+        dst.value = vmul_n_s16(value, lr);
+        return dst;
+    }
+    Vector operator*(const Vector& lr) {
+        Vector dst;
+        dst.value = vmul_s16(value, lr.value);
+        return dst;
+    }
+    Vector& operator*=(const Vector& lr) {
+        value = vmul_s16(value, lr.value);
+        return *this;
+    }
+    Vector& operator=(const Vector& lr) {
+        value = lr.value;
+        return *this;
+    }
+    Vector& operator=(const Vector&& lr) {
+        value = std::move(lr.value);
+        return *this;
+    }
+    Vector operator-() {
+        Vector dst;
+        dst.value = -value;
+        return dst;
+    }
+};
 template <>
 struct Vector<int32_t, 8> {
     int32x4x2_t value;

@@ -109,12 +109,36 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src,
     }
     if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
         const dt_int8* src_ptr = src.compatible_ptr<dt_int8>();
-        dt_int16* dst_ptr = dst.compatible_ptr<dt_int16>();
-        dt_int16* workspace_ptr = workspace.ptr<dt_int16>();
-        if (FW == 3) {
-            if (m == 2) {
-                DISPATCH(winograd_2x3_8x8_s8, param::Winograd::Format::MK8, 1,
-                         0);
+        if (param().compute_mode == param::ConvBias::ComputeMode::DEFAULT) {
+            dt_int16* dst_ptr = dst.compatible_ptr<dt_int16>();
+            dt_int16* workspace_ptr = workspace.ptr<dt_int16>();
+            if (FW == 3) {
+                if (m == 2) {
+                    if (pack_c_size == 1) {
+                        DISPATCH(winograd_2x3_8x8_s8,
+                                 param::Winograd::Format::MK8, 1, 0);
+                    } else if (pack_c_size == 4) {
+                        DISPATCH(winograd_2x3_8x8_s8_nchw44,
+                                 param::Winograd::Format::MK8, 1, 0);
+                    } else {
+                        megdnn_throw("only support pack_c_size = 1 or 4");
+                    }
+                }
+            }
+        } else {
+            dt_int32* dst_ptr_tmp = dst.compatible_ptr<dt_int32>();
+            dt_int32* workspace_ptr_tmp = workspace.ptr<dt_int32>();
+            float* dst_ptr = reinterpret_cast<float*>(dst_ptr_tmp);
+            float* workspace_ptr = reinterpret_cast<float*>(workspace_ptr_tmp);
+            if (pack_c_size == 4) {
+                if (FW == 3) {
+                    if (m == 2) {
+                        DISPATCH(winograd_2x3_4x4_s8_f32_nchw44,
+                                 param::Winograd::Format::MK4, 1, 1);
+                    }
+                }
+            } else {
+                megdnn_throw("only support pack_c_size == 4");
+            }
         }
     }
 }
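So for QuantizedS8 filters the preprocess now branches on `compute_mode`: `DEFAULT` keeps the int16 transformed filter and picks the MK8 strategy by `pack_c_size` (1 for the plain layout, 4 for NCHW44), while the non-default (FLOAT32) path writes float transformed filters for the MK4 NCHW44 strategy through int32-typed tensors, which is why the destination and workspace pointers are obtained via `reinterpret_cast` from `dt_int32`.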
| @@ -37,7 +37,9 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | ||||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && | param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && | ||||
| src.dtype.category() == DTypeCategory::QUANTIZED) { | src.dtype.category() == DTypeCategory::QUANTIZED) { | ||||
| megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16); | |||||
| megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16 || | |||||
| //!int8 winogradf23_44 using float,QuantizedS32 take the scale | |||||
| filter.dtype.enumv() == DTypeEnum::QuantizedS32); | |||||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | ||||
| src.dtype.enumv() == DTypeEnum::Quantized8Asymm); | src.dtype.enumv() == DTypeEnum::Quantized8Asymm); | ||||
| } else { | } else { | ||||
| @@ -49,7 +51,12 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||||
| if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || | if (param().format == param::ConvBias::Format::NCHW_WINOGRAD || | ||||
| param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | param().format == param::ConvBias::Format::NCHW88_WINOGRAD || | ||||
| param().format == param::ConvBias::Format::NCHW44_WINOGRAD) { | param().format == param::ConvBias::Format::NCHW44_WINOGRAD) { | ||||
| scale_filter = filter.dtype.param<dtype::QuantizedS16>().scale; | |||||
| if (filter.dtype.enumv() == DTypeEnum::QuantizedS32) { | |||||
| //!int8 winogradf23_44 using float,QuantizedS32 take the scale | |||||
| scale_filter = filter.dtype.param<dtype::QuantizedS32>().scale; | |||||
| } else { | |||||
| scale_filter = filter.dtype.param<dtype::QuantizedS16>().scale; | |||||
| } | |||||
| } else { | } else { | ||||
| scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale; | scale_filter = filter.dtype.param<dtype::QuantizedS8>().scale; | ||||
| } | } | ||||
| @@ -312,11 +312,14 @@ void make_canonized_filter_meta_nchwxx( | |||||
| size_t img_ndim = 2; | size_t img_ndim = 2; | ||||
| size_t flt_start = 0; | size_t flt_start = 0; | ||||
| size_t flt_spatial_start = 2; | size_t flt_spatial_start = 2; | ||||
| size_t pack_c_size = 0; | |||||
| if (param.sparse == Param::Sparse::DENSE) { | if (param.sparse == Param::Sparse::DENSE) { | ||||
| if (filter.ndim == img_ndim + 4) { | if (filter.ndim == img_ndim + 4) { | ||||
| // oihw8i8o case | // oihw8i8o case | ||||
| megdnn_assert(filter[filter.ndim - 2] == pack_size && | |||||
| filter[filter.ndim - 1] == pack_size, | |||||
| megdnn_assert((filter[filter.ndim - 2] == pack_size && | |||||
| filter[filter.ndim - 1] == pack_size) || | |||||
| (filter[filter.ndim - 2] == 2 * pack_size && | |||||
| filter[filter.ndim - 1] == 2 * pack_size), | |||||
| "last 2 dim of filter must be %zu, but got %zu, %zu", | "last 2 dim of filter must be %zu, but got %zu, %zu", | ||||
| pack_size, filter[filter.ndim - 2], | pack_size, filter[filter.ndim - 2], | ||||
| filter[filter.ndim - 1]); | filter[filter.ndim - 1]); | ||||
| @@ -326,8 +329,14 @@ void make_canonized_filter_meta_nchwxx( | |||||
| param.format == Param::Format::NCHW44_WINOGRAD) { | param.format == Param::Format::NCHW44_WINOGRAD) { | ||||
| flt_start = 2; | flt_start = 2; | ||||
| } | } | ||||
| ret.ocpg = filter[flt_start] * pack_size; | |||||
| ret.icpg = filter[flt_start + 1] * pack_size; | |||||
| if (filter[filter.ndim - 2] == 2 * pack_size && | |||||
| filter[filter.ndim - 1] == 2 * pack_size) { | |||||
| pack_c_size = 2 * pack_size; | |||||
| } else { | |||||
| pack_c_size = pack_size; | |||||
| } | |||||
| ret.ocpg = filter[flt_start] * pack_c_size; | |||||
| ret.icpg = filter[flt_start + 1] * pack_c_size; | |||||
| } else if (filter.ndim == img_ndim + 3) { | } else if (filter.ndim == img_ndim + 3) { | ||||
| // ohwi8o | // ohwi8o | ||||
| megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD, | megdnn_assert(param.format != Param::Format::NCHW88_WINOGRAD, | ||||
| @@ -375,15 +384,23 @@ void make_canonized_filter_meta_nchwxx( | |||||
| "bad filter ndim for group convolution: " | "bad filter ndim for group convolution: " | ||||
| "spatial_ndim=%zu filter_ndim=%zu", | "spatial_ndim=%zu filter_ndim=%zu", | ||||
| img_ndim, filter.ndim); | img_ndim, filter.ndim); | ||||
| megdnn_assert(filter[filter.ndim - 1] == pack_size && | |||||
| filter[filter.ndim - 2] == pack_size, | |||||
| megdnn_assert((filter[filter.ndim - 1] == pack_size && | |||||
| filter[filter.ndim - 2] == pack_size) || | |||||
| (filter[filter.ndim - 1] == 2 * pack_size && | |||||
| filter[filter.ndim - 2] == 2 * pack_size), | |||||
| "last 2 dim of filter must be %zu, but got %zu, %zu", | "last 2 dim of filter must be %zu, but got %zu, %zu", | ||||
| pack_size, filter[filter.ndim - 2], | pack_size, filter[filter.ndim - 2], | ||||
| filter[filter.ndim - 1]); | filter[filter.ndim - 1]); | ||||
| ret.group = filter[0]; | ret.group = filter[0]; | ||||
| ret.ocpg = filter_oc * pack_size; | |||||
| ret.icpg = filter_ic * pack_size; | |||||
| if (filter[filter.ndim - 2] == 2 * pack_size && | |||||
| filter[filter.ndim - 1] == 2 * pack_size) { | |||||
| ret.ocpg = filter_oc * 2 * pack_size; | |||||
| ret.icpg = filter_ic * 2 * pack_size; | |||||
| } else { | |||||
| ret.ocpg = filter_oc * pack_size; | |||||
| ret.icpg = filter_ic * pack_size; | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| ret.spatial_ndim = 2; | ret.spatial_ndim = 2; | ||||
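Both the dense and group branches of make_canonized_filter_meta_nchwxx deduce the same doubled pack from the filter's trailing dims; condensed as a sketch (hypothetical helper, same indexing as the code above):

    static size_t deduce_pack_c_size(const TensorLayout& filter, size_t pack_size) {
        //! trailing two dims are pack_size for plain NCHW44/NCHW88 filters,
        //! or 2 * pack_size when the filter is repacked for MK8 int8 winograd
        const bool doubled = filter[filter.ndim - 2] == 2 * pack_size &&
                             filter[filter.ndim - 1] == 2 * pack_size;
        return doubled ? 2 * pack_size : pack_size;
    }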
| @@ -596,8 +613,17 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src, | |||||
| } else if (src.enumv() == DTypeEnum::QuantizedS8 || | } else if (src.enumv() == DTypeEnum::QuantizedS8 || | ||||
| src.enumv() == DTypeEnum::Quantized8Asymm || | src.enumv() == DTypeEnum::Quantized8Asymm || | ||||
| src.enumv() == DTypeEnum::Quantized4Asymm) { | src.enumv() == DTypeEnum::Quantized4Asymm) { | ||||
| supported_dst_dtype.push_back( | |||||
| dtype::QuantizedS32(mul_scale(src, filter))); | |||||
| //! QuantizedS8 winograd computes in float; to carry the filter |||||
| //! scale, QuantizedS32 is used as the filter type. |||||
| if (src.enumv() == DTypeEnum::QuantizedS8 && | |||||
| filter.enumv() == DTypeEnum::QuantizedS32) { | |||||
| supported_dst_dtype.push_back(dtype::QuantizedS32( | |||||
| src.param<dtype::QuantizedS8>().scale * | |||||
| filter.param<dtype::QuantizedS32>().scale)); | |||||
| } else { | |||||
| supported_dst_dtype.push_back( | |||||
| dtype::QuantizedS32(mul_scale(src, filter))); | |||||
| } | |||||
| if (dst.valid() && dst.enumv() == src.enumv()) { | if (dst.valid() && dst.enumv() == src.enumv()) { | ||||
| supported_dst_dtype.push_back(dst); | supported_dst_dtype.push_back(dst); | ||||
| } | } | ||||
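Spelled out, the dst-scale rule added in check_or_deduce_dtype_fwd (a sketch; mul_scale is the existing generic helper, the wrapper name is hypothetical):

    float deduce_s32_dst_scale(DType src, DType filter) {
        //! int8-compute-float32 winograd: the filter scale travels in QuantizedS32
        if (src.enumv() == DTypeEnum::QuantizedS8 &&
            filter.enumv() == DTypeEnum::QuantizedS32) {
            return src.param<dtype::QuantizedS8>().scale *
                   filter.param<dtype::QuantizedS32>().scale;
        }
        return mul_scale(src, filter);
    }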
| @@ -625,12 +651,13 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src, | |||||
| megdnn_assert(dst_supported, "unsupported Conv(%s, %s) -> %s", | megdnn_assert(dst_supported, "unsupported Conv(%s, %s) -> %s", | ||||
| src.name(), filter.name(), dst.name()); | src.name(), filter.name(), dst.name()); | ||||
| } | } | ||||
| megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | |||||
| megdnn_assert((param().compute_mode == Param::ComputeMode::FLOAT32 || | |||||
| param().compute_mode == Param::ComputeMode::DEFAULT) | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| || src.enumv() == DTypeEnum::Float16 | |||||
| || src.enumv() == DTypeEnum::BFloat16 | |||||
| || src.enumv() == DTypeEnum::Float16 || | |||||
| src.enumv() == DTypeEnum::BFloat16 | |||||
| #endif | #endif | ||||
| , | |||||
| , | |||||
| "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | ||||
| "input / output."); | "input / output."); | ||||
| } | } | ||||
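With the relaxed assert, a quantized NCHW44 conv_bias may now request float accumulation explicitly; a usage sketch using only the parameter fields touched by this diff:

    param::ConvBias p;
    p.format = param::ConvBias::Format::NCHW44;
    p.compute_mode = param::ConvBias::ComputeMode::FLOAT32;  //! int8 input, fp32 compute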
| @@ -645,10 +672,12 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
| megdnn_assert_contiguous(src); | megdnn_assert_contiguous(src); | ||||
| megdnn_assert_contiguous(filter); | megdnn_assert_contiguous(filter); | ||||
| megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str()); | megdnn_assert(src.ndim >= 3_z, "%s", errmsg().c_str()); | ||||
| if (param().format == Param::Format::NCHW_WINOGRAD && | |||||
| if ((param().format == Param::Format::NCHW_WINOGRAD || | |||||
| param().format == Param::Format::NCHW44_WINOGRAD) && | |||||
| src.dtype.category() == DTypeCategory::QUANTIZED) { | src.dtype.category() == DTypeCategory::QUANTIZED) { | ||||
| megdnn_assert(filter.dtype.enumv() == DTypeEnum::QuantizedS16, "%s", | |||||
| errmsg().c_str()); | |||||
| megdnn_assert((filter.dtype.enumv() == DTypeEnum::QuantizedS16 || | |||||
| filter.dtype.enumv() == DTypeEnum::QuantizedS32), | |||||
| "%s", errmsg().c_str()); | |||||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | ||||
| src.dtype.enumv() == DTypeEnum::Quantized8Asymm, | src.dtype.enumv() == DTypeEnum::Quantized8Asymm, | ||||
| "%s", errmsg().c_str()); | "%s", errmsg().c_str()); | ||||
| @@ -741,14 +770,18 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
| if (param().format == Param::Format::NCHW44 || | if (param().format == Param::Format::NCHW44 || | ||||
| param().format == Param::Format::NCHW44_DOT || | param().format == Param::Format::NCHW44_DOT || | ||||
| param().format == Param::Format::NCHW44_WINOGRAD) { | param().format == Param::Format::NCHW44_WINOGRAD) { | ||||
| //! support nchw44 filter repacked to nchw88 for int8 winograd F(2,3) 8x8 using MK8 matmul |||||
| megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | ||||
| filter[filter.ndim - 1] == 4) || | filter[filter.ndim - 1] == 4) || | ||||
| (src.ndim == 5 && | (src.ndim == 5 && | ||||
| ((filter.ndim == 6 && | ((filter.ndim == 6 && | ||||
| filter[filter.ndim - 1] == 4) || | |||||
| (filter[filter.ndim - 1] == 4 || | |||||
| filter[filter.ndim - 1] == 8)) || | |||||
| (filter.ndim == 7 && | (filter.ndim == 7 && | ||||
| filter[filter.ndim - 1] == 4 && | |||||
| filter[filter.ndim - 2] == 4)) && | |||||
| (filter[filter.ndim - 1] == 4 || | |||||
| filter[filter.ndim - 1] == 8) && | |||||
| (filter[filter.ndim - 2] == 4 || | |||||
| filter[filter.ndim - 2] == 8))) && | |||||
| src[src.ndim - 1] == 4), | src[src.ndim - 1] == 4), | ||||
| "NCHW44 require src ndim is 5 and filter's ndim is 6 " | "NCHW44 require src ndim is 5 and filter's ndim is 6 " | ||||
| ", and last shape two is 4 but got src %s, filter %s", | ", and last shape two is 4 but got src %s, filter %s", | ||||
| @@ -67,8 +67,8 @@ constexpr size_t layout_pack_size(param::ConvBias::Format layout) { | |||||
| switch (layout) { | switch (layout) { | ||||
| case param::ConvBias::Format::NHWCD4: | case param::ConvBias::Format::NHWCD4: | ||||
| return 4; | return 4; | ||||
| case param::ConvBias::Format::NCHW4: | |||||
| case param::ConvBias::Format::NCHW44: | case param::ConvBias::Format::NCHW44: | ||||
| case param::ConvBias::Format::NCHW4: | |||||
| return 4; | return 4; | ||||
| case param::ConvBias::Format::NCHW32: | case param::ConvBias::Format::NCHW32: | ||||
| return 32; | return 32; | ||||
| @@ -365,6 +365,7 @@ INST(uint8_t, uint8_t, int16_t, int) | |||||
| _output_compute_type, layout, param::MatrixMul::Format::MK4>; | _output_compute_type, layout, param::MatrixMul::Format::MK4>; | ||||
| INST(float, float, float, float, param::ConvBias::Format::NCHW) | INST(float, float, float, float, param::ConvBias::Format::NCHW) | ||||
| INST(float, float, float, float, param::ConvBias::Format::NCHW44) | INST(float, float, float, float, param::ConvBias::Format::NCHW44) | ||||
| INST(int8_t, int8_t, float, float, param::ConvBias::Format::NCHW44) | |||||
| #undef INST | #undef INST | ||||
| #define INST(_ctype, _dst_type, _input_filter_compute_type, \ | #define INST(_ctype, _dst_type, _input_filter_compute_type, \ | ||||
| @@ -373,6 +374,7 @@ INST(float, float, float, float, param::ConvBias::Format::NCHW44) | |||||
| _ctype, _dst_type, _input_filter_compute_type, \ | _ctype, _dst_type, _input_filter_compute_type, \ | ||||
| _output_compute_type, layout, param::MatrixMul::Format::MK8>; | _output_compute_type, layout, param::MatrixMul::Format::MK8>; | ||||
| INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW) | INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW) | ||||
| INST(int8_t, int8_t, int16_t, int, param::ConvBias::Format::NCHW44) | |||||
| INST(float, float, float, float, param::ConvBias::Format::NCHW88) | INST(float, float, float, float, param::ConvBias::Format::NCHW88) | ||||
| MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16, | MEGDNN_INC_FLOAT16(INST(dt_float16, dt_float16, dt_float16, dt_float16, | ||||
| param::ConvBias::Format::NCHW)) | param::ConvBias::Format::NCHW)) | ||||
| @@ -56,8 +56,16 @@ void WinogradFilterPreprocess::deduce_layout(const TensorLayout& src, | |||||
| DType dst_type = src.dtype; | DType dst_type = src.dtype; | ||||
| if (src.dtype.category() == DTypeCategory::QUANTIZED) { | if (src.dtype.category() == DTypeCategory::QUANTIZED) { | ||||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); | megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8); | ||||
| dst_type = dtype::QuantizedS16( | |||||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||||
| if (param().compute_mode == | |||||
| param::ConvBias::ComputeMode::DEFAULT) { | |||||
| //! input int8 compute short | |||||
| dst_type = dtype::QuantizedS16( | |||||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||||
| } else { | |||||
| //! input int8 compute float32 | |||||
| dst_type = dtype::QuantizedS32( | |||||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||||
| } | |||||
| } | } | ||||
| if (src.ndim == 4 || src.ndim == 6) { | if (src.ndim == 4 || src.ndim == 6) { | ||||
| @@ -123,8 +131,16 @@ size_t WinogradFilterPreprocess::get_workspace_in_bytes( | |||||
| if (src.dtype.category() == DTypeCategory::QUANTIZED) { | if (src.dtype.category() == DTypeCategory::QUANTIZED) { | ||||
| megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | megdnn_assert(src.dtype.enumv() == DTypeEnum::QuantizedS8 || | ||||
| src.dtype.enumv() == DTypeEnum::Quantized8Asymm); | src.dtype.enumv() == DTypeEnum::Quantized8Asymm); | ||||
| output_compute_dtype = dtype::QuantizedS16( | |||||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||||
| if (param().compute_mode == | |||||
| param::ConvBias::ComputeMode::DEFAULT) { | |||||
| //! input int8 compute short | |||||
| output_compute_dtype = dtype::QuantizedS16( | |||||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||||
| } else { | |||||
| //! input int8 compute float32 | |||||
| output_compute_dtype = dtype::QuantizedS32( | |||||
| src.dtype.param<dtype::QuantizedS8>().scale); | |||||
| } | |||||
| } | } | ||||
| size_t FW = src[3]; | size_t FW = src[3]; | ||||
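The two WinogradFilterPreprocess hunks above apply one dtype rule; condensed as a sketch (hypothetical helper; the scale comes from the QuantizedS8 source in both branches):

    DType winograd_transform_dtype(param::ConvBias::ComputeMode mode, DType src) {
        float scale = src.param<dtype::QuantizedS8>().scale;
        if (mode == param::ConvBias::ComputeMode::DEFAULT) {
            return dtype::QuantizedS16(scale);  //! int8 compute int16 (MK8)
        }
        return dtype::QuantizedS32(scale);  //! int8 compute float32 (MK4)
    }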
| @@ -118,6 +118,9 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||||
| DISPATCH(QuantizedS8, QuantizedS32) | DISPATCH(QuantizedS8, QuantizedS32) | ||||
| DISPATCH(Quantized8Asymm, QuantizedS32) | DISPATCH(Quantized8Asymm, QuantizedS32) | ||||
| DISPATCH(Quantized4Asymm, QuantizedS32) | DISPATCH(Quantized4Asymm, QuantizedS32) | ||||
| DISPATCH_RAW(QuantizedS8, QuantizedS32, QuantizedS32, FLOAT32, | |||||
| (convolution::forward_bias<dt_int8, dt_int8, dt_int32, | |||||
| dt_int32>)) | |||||
| #if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
| DISPATCH(Float16, Float16) | DISPATCH(Float16, Float16) | ||||
| DISPATCH_RAW(Float16, Float16, Float16, FLOAT32, | DISPATCH_RAW(Float16, Float16, Float16, FLOAT32, | ||||
| @@ -171,7 +171,6 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
| } | } | ||||
| } | } | ||||
| #undef cb | #undef cb | ||||
| #undef DISPATCH_FORMAT_MK8 | |||||
| #undef DISPATCH_DTYPE | #undef DISPATCH_DTYPE | ||||
| } | } | ||||
| else if (pack_c_size == 4) { //! NCHW44 | else if (pack_c_size == 4) { //! NCHW44 | ||||
| @@ -195,6 +194,15 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | ||||
| DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | ||||
| DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | ||||
| } \ | |||||
| if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8) { \ | |||||
| if (param().format == param::Winograd::Format::MK4) { \ | |||||
| DISPATCH_KERNEL(dt_int8, dt_int8, dt_float32, dt_float32, \ | |||||
| DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||||
| } else if (param().format == param::Winograd::Format::MK8) { \ | |||||
| DISPATCH_KERNEL(dt_int8, dt_int8, dt_int16, dt_int32, \ | |||||
| DISPATCH_FORMAT_MK8, 2.0f, _midout_tag, 0); \ | |||||
| } \ | |||||
| } | } | ||||
| if (FW == 3) { | if (FW == 3) { | ||||
| if (m == 2) { | if (m == 2) { | ||||
| @@ -208,6 +216,7 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
| } | } | ||||
| #undef cb | #undef cb | ||||
| #undef DISPATCH_FORMAT_MK8 | #undef DISPATCH_FORMAT_MK8 | ||||
| #undef DISPATCH_FORMAT_MK4 | |||||
| #undef DISPATCH_KERNEL | #undef DISPATCH_KERNEL | ||||
| #undef DISPATCH_DTYPE | #undef DISPATCH_DTYPE | ||||
| } | } | ||||
| @@ -98,7 +98,8 @@ CB_TEST(H_SWISH); | |||||
| #if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
| static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { | |||||
| static void benchmark_convbias(Handle* handle, std::string int_name, | |||||
| std::string float_name, bool is_fp32 = false) { | |||||
| constexpr size_t RUNS = 30; | constexpr size_t RUNS = 30; | ||||
| Benchmarker<ConvBias> benchmarker_int(handle); | Benchmarker<ConvBias> benchmarker_int(handle); | ||||
| @@ -109,12 +110,12 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { | |||||
| .set_dtype(4, dtype::QuantizedS8(60.25)) | .set_dtype(4, dtype::QuantizedS8(60.25)) | ||||
| .set_display(false); | .set_display(false); | ||||
| benchmarker_int.set_before_exec_callback( | benchmarker_int.set_before_exec_callback( | ||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>("IM2COLMATMUL:.+")); | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(int_name.c_str())); | |||||
| Benchmarker<ConvBias> benchmarker_float(handle); | Benchmarker<ConvBias> benchmarker_float(handle); | ||||
| benchmarker_float.set_display(false).set_times(RUNS); | benchmarker_float.set_display(false).set_times(RUNS); | ||||
| benchmarker_float.set_before_exec_callback( | benchmarker_float.set_before_exec_callback( | ||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>("IM2COLMATMUL:.+")); | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBias>(float_name.c_str())); | |||||
| Benchmarker<ConvBias> benchmarker_nchw44(handle); | Benchmarker<ConvBias> benchmarker_nchw44(handle); | ||||
| if (is_fp32) { | if (is_fp32) { | ||||
| @@ -233,13 +234,24 @@ static void benchmark_convbias(Handle* handle, bool is_fp32 = false) { | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) { | TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_NCHW44) { | ||||
| benchmark_convbias(handle(), true); | |||||
| benchmark_convbias(handle(), false); | |||||
| #if MEGDNN_AARCH64 | |||||
| benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384", | |||||
| "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true); | |||||
| #else | |||||
| benchmark_convbias(handle(), "IM2COLMATMUL:ARMV7_INT8X8X32_K4X8X8:384", | |||||
| "IM2COLMATMUL:ARMV7_F32:192", true); | |||||
| #endif | |||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) { | TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_CONVBIAS_NCHW44) { | ||||
| benchmark_convbias(handle(), true); | |||||
| benchmark_convbias(handle(), false); | |||||
| #if MEGDNN_AARCH64 | |||||
| benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384", | |||||
| "IM2COLMATMUL:AARCH64_F32K8X12X1:192", true); | |||||
| #else | |||||
| benchmark_convbias(handle(), "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16:384", | |||||
| "IM2COLMATMUL:ARMV7_F32:192", true); | |||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -506,7 +518,7 @@ void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name, | |||||
| computations / used_im2col, used / used_im2col); | computations / used_im2col, used / used_im2col); | ||||
| } | } | ||||
| } | } | ||||
| #if MEGDNN_AARCH64 | |||||
| TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { | TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { | ||||
| printf("=========================compare " | printf("=========================compare " | ||||
| "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, " | "IM2COLMATMUL:AARCH64_INT8X8X32_K4X4X16, " | ||||
| @@ -515,6 +527,7 @@ TEST_F(ARM_COMMON, BENCHMARK_NCHW_VS_NCHW44_INT8x8x32) { | |||||
| "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16", | "IM2COLMATMUL:AARCH64_INT8X8X32_MK4_4X4X16", | ||||
| handle(), 3, 4); | handle(), 3, 4); | ||||
| } | } | ||||
| #endif | |||||
| TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) { | TEST_F(ARM_COMMON, BENCHMARK_GROUP_CONVBIAS_QUANTIZED) { | ||||
| constexpr size_t RUNS = 50; | constexpr size_t RUNS = 50; | ||||
| @@ -989,7 +989,6 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | ||||
| ssprintf("WINOGRAD:%s:8:2:32", matmul_name).c_str())); | ssprintf("WINOGRAD:%s:8:2:32", matmul_name).c_str())); | ||||
| std::vector<TestArg> args = get_winograd_mk_packed_args(8); | |||||
| std::vector<TestArg> quantized_args = | std::vector<TestArg> quantized_args = | ||||
| get_quantized_winograd_mk_packed_args(8); | get_quantized_winograd_mk_packed_args(8); | ||||
| UniformIntRNG int_rng{-50, 50}; | UniformIntRNG int_rng{-50, 50}; | ||||
| @@ -999,6 +998,174 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_INT8) { | |||||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8) { | |||||
| using namespace conv_bias; | |||||
| Checker<ConvBiasForward> checker(handle()); | |||||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||||
| const std::vector<size_t>& out_size, DType A_dtype, | |||||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||||
| param::MatrixMul::Format format, float eps) { | |||||
| for (auto&& arg : args) { | |||||
| for (uint32_t m : out_size) { | |||||
| checker.set_extra_opr_impl(std::bind( | |||||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||||
| arg.param, handle, format)); | |||||
| checker.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype) | |||||
| .set_dtype(4, D_dtype) | |||||
| .set_epsilon(eps) | |||||
| .set_param(arg.param) | |||||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||||
| } | |||||
| } | |||||
| }; | |||||
| #if MEGDNN_AARCH64 | |||||
| const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; | |||||
| #else | |||||
| const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; | |||||
| #endif | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||||
| ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); | |||||
| std::vector<TestArg> quantized_args = get_int8_nchw44_args(3, 4); |||||
| UniformIntRNG int_rng{-50, 50}; | |||||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, |||||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_GROUPMODE) { |||||
| using namespace conv_bias; | |||||
| Checker<ConvBiasForward> checker(handle()); | |||||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||||
| const std::vector<size_t>& out_size, DType A_dtype, | |||||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||||
| param::MatrixMul::Format format, float eps) { | |||||
| for (auto&& arg : args) { | |||||
| for (uint32_t m : out_size) { | |||||
| checker.set_extra_opr_impl(std::bind( | |||||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||||
| arg.param, handle, format)); | |||||
| checker.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype) | |||||
| .set_dtype(4, D_dtype) | |||||
| .set_epsilon(eps) | |||||
| .set_param(arg.param) | |||||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||||
| } | |||||
| } | |||||
| }; | |||||
| #if MEGDNN_AARCH64 | |||||
| const char* matmul_name = "AARCH64_INT16X16X32_MK8_8X8"; | |||||
| #else | |||||
| const char* matmul_name = "ARMV7_INT16X16X32_MK8_4X8"; | |||||
| #endif | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||||
| ssprintf("WINOGRAD_NCHW44:%s:8:2:32", matmul_name).c_str())); | |||||
| std::vector<TestArg> quantized_args = | |||||
| get_int8_nchw44_args(3, 4, false, true); | |||||
| UniformIntRNG int_rng{-50, 50}; | |||||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(2.5f), | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f), | |||||
| dtype::QuantizedS8(60.25f), param::MatrixMul::Format::MK8, 1e-3); | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, |||||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32) { |||||
| using namespace conv_bias; | |||||
| Checker<ConvBiasForward> checker(handle()); | |||||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||||
| const std::vector<size_t>& out_size, DType A_dtype, | |||||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||||
| param::MatrixMul::Format format, float eps) { | |||||
| for (auto&& arg : args) { | |||||
| for (uint32_t m : out_size) { | |||||
| checker.set_extra_opr_impl(std::bind( | |||||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||||
| arg.param, handle, format)); | |||||
| checker.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype) | |||||
| .set_dtype(4, D_dtype) | |||||
| .set_epsilon(eps) | |||||
| .set_param(arg.param) | |||||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||||
| } | |||||
| } | |||||
| }; | |||||
| float epsilon = 0.001; | |||||
| #if MEGDNN_AARCH64 | |||||
| const char* matmul_name = "AARCH64_F32_MK4_4x16"; | |||||
| #else | |||||
| const char* matmul_name = "ARMV7_F32_MK4_4x8"; | |||||
| #endif | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||||
| ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); | |||||
| std::vector<TestArg> quantized_args = | |||||
| get_int8_nchw44_args(3, 4, true); | |||||
| UniformIntRNG int_rng{-50, 50}; | |||||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), | |||||
| dtype::QuantizedS8(0.01887994f), | |||||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||||
| dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, epsilon); | |||||
| } | |||||
| TEST_F(ARM_COMMON_MULTI_THREADS, |||||
| CONV_BIAS_WINOGRAD_NCHW44_MK_PACKED_INT8_COMP_F32_GROUPMODE) { |||||
| using namespace conv_bias; | |||||
| Checker<ConvBiasForward> checker(handle()); | |||||
| auto run = [&checker](Handle* handle, const std::vector<TestArg>& args, | |||||
| const std::vector<size_t>& out_size, DType A_dtype, | |||||
| DType B_dtype, DType C_dtype, DType D_dtype, | |||||
| param::MatrixMul::Format format, float eps) { | |||||
| for (auto&& arg : args) { | |||||
| for (uint32_t m : out_size) { | |||||
| checker.set_extra_opr_impl(std::bind( | |||||
| winograd_algo_extra_impl, std::placeholders::_1, m, | |||||
| arg.param, handle, format)); | |||||
| checker.set_dtype(0, A_dtype) | |||||
| .set_dtype(1, B_dtype) | |||||
| .set_dtype(2, C_dtype) | |||||
| .set_dtype(4, D_dtype) | |||||
| .set_epsilon(eps) | |||||
| .set_param(arg.param) | |||||
| .execs({arg.src, arg.filter, arg.bias, {}, {}}); | |||||
| } | |||||
| } | |||||
| }; | |||||
| float epsilon = 0.001; | |||||
| #if MEGDNN_AARCH64 | |||||
| const char* matmul_name = "AARCH64_F32_MK4_4x16"; | |||||
| #else | |||||
| const char* matmul_name = "ARMV7_F32_MK4_4x8"; | |||||
| #endif | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||||
| ssprintf("WINOGRAD_NCHW44:%s:4:2:32", matmul_name).c_str())); | |||||
| std::vector<TestArg> quantized_args = | |||||
| get_int8_nchw44_args(3, 4, true, true); | |||||
| UniformIntRNG int_rng{-50, 50}; | |||||
| checker.set_rng(0, &int_rng).set_rng(1, &int_rng).set_rng(2, &int_rng); | |||||
| run(handle(), quantized_args, {2}, dtype::QuantizedS8(0.41113496f), | |||||
| dtype::QuantizedS8(0.01887994f), | |||||
| dtype::QuantizedS32(0.41113496f * 0.01887994f), | |||||
| dtype::QuantizedS8(0.49550694f), param::MatrixMul::Format::MK4, epsilon); | |||||
| } | |||||
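For reference, the checker strings in these tests appear to follow the pattern WINOGRAD[_NCHW44]:<matmul algo>:<channel_block_size>:<output_block_size>:<tile_size>; this is inferred from the surrounding code, not a documented contract. E.g. the float-compute variant above:

    //! MK4 matmul, pack 4, F(2, 3), tile size 32
    std::string algo = ssprintf("WINOGRAD_NCHW44:%s:4:2:32", "AARCH64_F32_MK4_4x16");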
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
| TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F16_F23) { | ||||
| using namespace conv_bias; | using namespace conv_bias; | ||||
| @@ -1185,6 +1185,197 @@ TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_F32) { | |||||
| {1, {4}}, data_type); | {1, {4}}, data_type); | ||||
| } | } | ||||
| TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_WINOGRAD_INT8) { | |||||
| constexpr size_t RUNS = 50; | |||||
| param::ConvBias param; | |||||
| param.nonlineMode = param::ConvBias::NonlineMode::RELU; | |||||
| param.pad_h = 1; | |||||
| param.pad_w = 1; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.sparse = param::ConvBias::Sparse::GROUP; | |||||
| std::vector<std::pair<SmallVector<TensorShape>, float>> | |||||
| shapes_and_computation; | |||||
| auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, | |||||
| size_t FS, size_t group) { | |||||
| SmallVector<TensorShape> shapes{{N, IC, H, W}, | |||||
| {group, OC / group, IC / group, FS, FS}, | |||||
| {1, OC, 1, 1}, | |||||
| {}, | |||||
| {N, OC, H, W}}; | |||||
| TensorShape dst{N, OC, H, W}; | |||||
| float computations = | |||||
| ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + | |||||
| dst.total_nr_elems()) * | |||||
| 1e-6; | |||||
| shapes_and_computation.push_back(std::make_pair(shapes, computations)); | |||||
| }; | |||||
| bench_case(1, 32, 32, 200, 200, 3, 4); | |||||
| bench_case(1, 32, 32, 200, 200, 3, 1); | |||||
| bench_case(1, 32, 32, 128, 128, 3, 4); | |||||
| bench_case(1, 32, 32, 128, 128, 3, 1); | |||||
| bench_case(1, 32, 32, 100, 100, 3, 4); | |||||
| bench_case(1, 32, 32, 100, 100, 3, 1); | |||||
| bench_case(1, 32, 32, 80, 80, 3, 4); | |||||
| bench_case(1, 512, 512, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 256, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 128, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 64, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 512, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 256, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 128, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 64, 7, 7, 3, 1); | |||||
| std::string algo_name; | |||||
| #if MEGDNN_AARCH64 | |||||
| algo_name = "WINOGRAD:AARCH64_INT16X16X32_MK8_8X8:8:2:32"; | |||||
| #else | |||||
| algo_name = "WINOGRAD:ARMV7_INT16X16X32_MK8_4X8:8:2:32"; | |||||
| #endif | |||||
| std::vector<DType> data_type = {dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)}; |||||
| printf("Benchmark WINOGRAD_IN8_MK8 algo\n"); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||||
| {4, {4, 5, 6, 7}}, {1, {4}}, data_type); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||||
| {4, {4, 5, 6, 7}}, {1, {7}}, data_type); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, | |||||
| {1, {4}}, data_type); | |||||
| } | |||||
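The computations value pushed into shapes_and_computation counts one multiply-add (2 ops) per filter tap per output element plus the bias add, scaled to Mflops by the 1e-6 factor; as a standalone sketch (helper name hypothetical):

    float conv_mflops(size_t ic, size_t group, size_t fs, size_t out_elems) {
        return (static_cast<float>(ic / group) * fs * fs * out_elems * 2 +
                out_elems) * 1e-6f;
    }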
| TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, | |||||
| BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_MK8) { | |||||
| constexpr size_t RUNS = 50; | |||||
| param::ConvBias param; | |||||
| param.nonlineMode = param::ConvBias::NonlineMode::RELU; | |||||
| param.pad_h = 1; | |||||
| param.pad_w = 1; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.sparse = param::ConvBias::Sparse::DENSE; | |||||
| param.format = param::ConvBias::Format::NCHW44; | |||||
| std::vector<std::pair<SmallVector<TensorShape>, float>> | |||||
| shapes_and_computation; | |||||
| auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, | |||||
| size_t FS, size_t group) { | |||||
| SmallVector<TensorShape> shapes{{N, IC / 4, H, W, 4}, | |||||
| {OC / 4, IC / 4, FS, FS, 4, 4}, | |||||
| {1, OC / 4, 1, 1, 4}, | |||||
| {}, | |||||
| {N, OC / 4, H, W, 4}}; | |||||
| TensorShape dst{N, OC, H, W}; | |||||
| float computations = | |||||
| ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + | |||||
| dst.total_nr_elems()) * | |||||
| 1e-6; | |||||
| shapes_and_computation.push_back(std::make_pair(shapes, computations)); | |||||
| }; | |||||
| bench_case(1, 32, 32, 200, 200, 3, 1); | |||||
| bench_case(1, 32, 32, 128, 128, 3, 1); | |||||
| bench_case(1, 32, 32, 100, 100, 3, 1); | |||||
| bench_case(1, 512, 512, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 256, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 128, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 64, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 512, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 256, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 128, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 64, 7, 7, 3, 1); | |||||
| std::string algo_name; | |||||
| #if MEGDNN_AARCH64 | |||||
| algo_name = "WINOGRAD_NCHW44:AARCH64_INT16X16X32_MK8_8X8:8:2:32"; | |||||
| #else | |||||
| algo_name = "WINOGRAD_NCHW44:ARMV7_INT16X16X32_MK8_4X8:8:2:32"; | |||||
| #endif | |||||
| std::vector<DType> data_type = { | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)}; | |||||
| printf("Benchmark WINOGRAD_INT8_MK8 algo\n"); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||||
| {4, {4, 5, 6, 7}}, {1, {4}}, data_type); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||||
| {4, {4, 5, 6, 7}}, {1, {7}}, data_type); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, | |||||
| {1, {4}}, data_type); | |||||
| } | |||||
| TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, | |||||
| BENCHMARK_CONVBIAS_WINOGRAD_NCHW44_INT8_COMP_F32) { | |||||
| constexpr size_t RUNS = 50; | |||||
| param::ConvBias param; | |||||
| param.nonlineMode = param::ConvBias::NonlineMode::RELU; | |||||
| param.pad_h = 1; | |||||
| param.pad_w = 1; | |||||
| param.stride_h = 1; | |||||
| param.stride_w = 1; | |||||
| param.sparse = param::ConvBias::Sparse::DENSE; |||||
| param.format = param::ConvBias::Format::NCHW44; | |||||
| std::vector<std::pair<SmallVector<TensorShape>, float>> | |||||
| shapes_and_computation; | |||||
| auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W, | |||||
| size_t FS, size_t group) { | |||||
| SmallVector<TensorShape> shapes{{N, IC / 4, H, W, 4}, | |||||
| {OC / 4, IC / 4, FS, FS, 4, 4}, | |||||
| {1, OC / 4, 1, 1, 4}, | |||||
| {}, | |||||
| {N, OC / 4, H, W, 4}}; | |||||
| TensorShape dst{N, OC, H, W}; | |||||
| float computations = | |||||
| ((IC / group) * FS * FS * dst.total_nr_elems() * 2 + | |||||
| dst.total_nr_elems()) * | |||||
| 1e-6; | |||||
| shapes_and_computation.push_back(std::make_pair(shapes, computations)); | |||||
| }; | |||||
| bench_case(1, 32, 32, 200, 200, 3, 1); | |||||
| bench_case(1, 32, 32, 128, 128, 3, 1); | |||||
| bench_case(1, 32, 32, 100, 100, 3, 1); | |||||
| bench_case(1, 512, 512, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 256, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 128, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 64, 14, 14, 3, 1); | |||||
| bench_case(1, 512, 512, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 256, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 128, 7, 7, 3, 1); | |||||
| bench_case(1, 512, 64, 7, 7, 3, 1); | |||||
| std::string algo_name; | |||||
| #if MEGDNN_AARCH64 | |||||
| algo_name = "WINOGRAD_NCHW44:AARCH64_F32_MK4_4x16:4:2:32"; | |||||
| #else | |||||
| algo_name = "WINOGRAD_NCHW44:ARMV7_F32_MK4_4x8:4:2:32"; | |||||
| #endif | |||||
| std::vector<DType> data_type = { | |||||
| dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f), | |||||
| dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f)}; | |||||
| printf("Benchmark WINOGRAD_INT8_NCHW44_MK4_COMP_F32 algo\n"); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||||
| {4, {4, 5, 6, 7}}, {1, {4}}, data_type); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, | |||||
| {4, {4, 5, 6, 7}}, {1, {7}}, data_type); | |||||
| benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, | |||||
| {1, {4}}, data_type); | |||||
| } | |||||
| TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) { | TEST_F(ARM_COMMON_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_FP32) { | ||||
| constexpr size_t RUNS = 50; | constexpr size_t RUNS = 50; | ||||
| @@ -9,6 +9,7 @@ | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| */ | */ | ||||
| #include "test/common/conv_bias.h" | #include "test/common/conv_bias.h" | ||||
| #include "megdnn/opr_param_defs.h" | |||||
| #include "src/common/utils.h" | #include "src/common/utils.h" | ||||
| #include "test/common/benchmarker.h" | #include "test/common/benchmarker.h" | ||||
| namespace megdnn { | namespace megdnn { | ||||
| @@ -242,7 +243,8 @@ std::vector<TestArg> get_winograd_mk_packed_args(size_t pack_size) { | |||||
| return args; | return args; | ||||
| } | } | ||||
| std::vector<TestArg> get_quantized_winograd_mk_packed_args(size_t pack_size) { | |||||
| std::vector<TestArg> get_quantized_winograd_mk_packed_args( | |||||
| size_t pack_size, bool compute_float32) { | |||||
| std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
| param::ConvBias cur_param; | param::ConvBias cur_param; | ||||
| @@ -260,13 +262,16 @@ std::vector<TestArg> get_quantized_winograd_mk_packed_args(size_t pack_size) { | |||||
| cur_param.sparse = param::ConvBias::Sparse::DENSE; | cur_param.sparse = param::ConvBias::Sparse::DENSE; | ||||
| cur_param.pad_h = cur_param.pad_w = 1; | cur_param.pad_h = cur_param.pad_w = 1; | ||||
| if (compute_float32) { |||||
| cur_param.compute_mode = param::ConvBias::ComputeMode::FLOAT32; | |||||
| } | |||||
| args.emplace_back(cur_param, TensorShape{1, pack_size, 3, 3}, | args.emplace_back(cur_param, TensorShape{1, pack_size, 3, 3}, | ||||
| TensorShape{pack_size, pack_size, 3, 3}, | TensorShape{pack_size, pack_size, 3, 3}, | ||||
| TensorShape{1, pack_size, 1, 1}); | TensorShape{1, pack_size, 1, 1}); | ||||
| //! no bias | //! no bias | ||||
| args.emplace_back(cur_param, TensorShape{2, ic, i, i}, | args.emplace_back(cur_param, TensorShape{2, ic, i, i}, | ||||
| TensorShape{oc, ic, 3, 3}, TensorShape{}); | TensorShape{oc, ic, 3, 3}, TensorShape{}); | ||||
| //! bias | //! bias | ||||
| args.emplace_back(cur_param, TensorShape{2, ic, i, i}, | args.emplace_back(cur_param, TensorShape{2, ic, i, i}, | ||||
| TensorShape{oc, ic, 3, 3}, TensorShape{2, oc, i, i}); | TensorShape{oc, ic, 3, 3}, TensorShape{2, oc, i, i}); | ||||
| @@ -372,7 +377,7 @@ std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size) { | |||||
| for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { | for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { | ||||
| for (size_t b : {64, 16}) { | for (size_t b : {64, 16}) { | ||||
| for (size_t ic : {16, 32}) { | for (size_t ic : {16, 32}) { | ||||
| for (size_t oc : {64, 32}) { | |||||
| for (size_t oc : {16, 32}) { | |||||
| for (size_t h : {8}) { | for (size_t h : {8}) { | ||||
| for (size_t w : {8, 11}) { | for (size_t w : {8, 11}) { | ||||
| for (int p : {0, static_cast<int>(kernel_size / 2)}) { | for (int p : {0, static_cast<int>(kernel_size / 2)}) { | ||||
| @@ -399,6 +404,95 @@ std::vector<TestArg> get_int8_nchw4_args(size_t kernel_size) { | |||||
| return args; | return args; | ||||
| } | } | ||||
| std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size, | |||||
| bool compute_float32, | |||||
| bool group_mode) { | |||||
| std::vector<TestArg> args; | |||||
| param::ConvBias cur_param; | |||||
| megdnn_assert(pack_size > 0, "not support pack_size"); | |||||
| megdnn_assert(kernel_size > 0, "not support kernel_size"); | |||||
| using NLMode = param::ConvBias::NonlineMode; | |||||
| // clang-format off |||||
| for (auto nlmode : {NLMode::IDENTITY, NLMode::RELU}) { | |||||
| for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { | |||||
| for (size_t b : {1, 2}) { |||||
| for (size_t ic : {8, 16}) { |||||
| for (size_t oc : {8, 16}) { |||||
| for (size_t h : {9, 23}) { |||||
| for (size_t w : {9, 23}) { |||||
| for (int p : {0, static_cast<int>(kernel_size / 2)}) { | |||||
| for (size_t s : {1}) { | |||||
| if (kernel_size == 7) { | |||||
| b = std::min(b, 32_z); | |||||
| } | |||||
| size_t f = kernel_size; | |||||
| cur_param.mode = mode; | |||||
| cur_param.nonlineMode = nlmode; | |||||
| if (pack_size == 4) { |||||
| cur_param.format = param::ConvBias::Format::NCHW44; |||||
| } else if (pack_size == 8) { |||||
| cur_param.format = param::ConvBias::Format::NCHW88; |||||
| } |||||
| if (compute_float32) { |||||
| cur_param.compute_mode = | |||||
| param::ConvBias::ComputeMode::FLOAT32; | |||||
| } | |||||
| cur_param.sparse = param::ConvBias::Sparse::DENSE; | |||||
| cur_param.pad_h = cur_param.pad_w = p; | |||||
| cur_param.stride_h = cur_param.stride_w = s; | |||||
| if (!group_mode) { | |||||
| //! no bias | |||||
| args.emplace_back(cur_param, | |||||
| TensorShape{b, ic / pack_size, h, w, pack_size}, | |||||
| TensorShape{oc / pack_size, ic / pack_size, f, f, | |||||
| pack_size, pack_size}, | |||||
| TensorShape{}); | |||||
| //! bias channel | |||||
| args.emplace_back(cur_param, | |||||
| TensorShape{b, ic / pack_size, h, w, pack_size}, | |||||
| TensorShape{oc / pack_size, ic / pack_size, f, f, | |||||
| pack_size, pack_size}, | |||||
| TensorShape{1, oc / pack_size, 1, 1, pack_size}); | |||||
| //! bias | |||||
| args.emplace_back( | |||||
| cur_param, TensorShape{b, ic / pack_size, h, w, pack_size}, | |||||
| TensorShape{oc / pack_size, ic / pack_size, f, f, pack_size, | |||||
| pack_size}, | |||||
| TensorShape{b, oc / pack_size, (h - f + 2 * p) / s + 1, | |||||
| (w - f + 2 * p) / s + 1, pack_size}); | |||||
| } else { | |||||
| cur_param.sparse = param::ConvBias::Sparse::GROUP; | |||||
| args.emplace_back( | |||||
| cur_param, | |||||
| TensorShape{2, 2 * ic / pack_size, h, w, pack_size}, | |||||
| TensorShape{2, oc / pack_size, ic / pack_size, f, f, |||||
| pack_size, pack_size}, | |||||
| TensorShape{2, 2 * oc / pack_size, (h - f + 2 * p) / s + 1, | |||||
| (w - f + 2 * p) / s + 1, pack_size}); | |||||
| args.emplace_back( | |||||
| cur_param, | |||||
| TensorShape{2, 2 * ic / pack_size, h, w, pack_size}, | |||||
| TensorShape{2, oc / pack_size, ic / pack_size, f, f, | |||||
| pack_size, pack_size}, | |||||
| TensorShape{1, 2 * oc / pack_size, 1, 1, pack_size}); | |||||
| args.emplace_back( | |||||
| cur_param, | |||||
| TensorShape{2, 2 * ic / pack_size, h, w, pack_size}, | |||||
| TensorShape{2, oc / pack_size, ic / pack_size, f, f, | |||||
| pack_size, pack_size}, | |||||
| TensorShape{}); | |||||
| } | |||||
| } } } } } } } } } | |||||
| // clang-format on | |||||
| return args; | |||||
| } | |||||
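Usage as exercised by the tests above (compute_float32 selects ComputeMode::FLOAT32, group_mode emits Sparse::GROUP shapes):

    auto dense_int16 = get_int8_nchw44_args(3, 4);              //! DEFAULT compute, MK8
    auto dense_f32   = get_int8_nchw44_args(3, 4, true);        //! FLOAT32 compute, MK4
    auto group_f32   = get_int8_nchw44_args(3, 4, true, true);  //! group conv, FLOAT32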
| std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) { | std::vector<TestArg> get_int8_nchw4_args_check_bounds(size_t kernel_size) { | ||||
| std::vector<TestArg> args; | std::vector<TestArg> args; | ||||
| param::ConvBias cur_param; | param::ConvBias cur_param; | ||||
| @@ -990,11 +1084,14 @@ void checker_conv_bias_int8x8x16(std::vector<conv_bias::TestArg> args, | |||||
| void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | ||||
| param::ConvBias param, Handle* handle, | param::ConvBias param, Handle* handle, | ||||
| param::MatrixMul::Format format) { | param::MatrixMul::Format format) { | ||||
| megdnn_assert(param.format == param::ConvBias::Format::NCHW); | |||||
| megdnn_assert(param.format == param::ConvBias::Format::NCHW || | |||||
| param.format == param::ConvBias::Format::NCHW44); | |||||
| auto winograd_preprocess_opr = | auto winograd_preprocess_opr = | ||||
| handle->create_operator<WinogradFilterPreprocess>(); | handle->create_operator<WinogradFilterPreprocess>(); | ||||
| winograd_preprocess_opr->param().output_block_size = m; | winograd_preprocess_opr->param().output_block_size = m; | ||||
| winograd_preprocess_opr->param().format = format; | winograd_preprocess_opr->param().format = format; | ||||
| winograd_preprocess_opr->param().compute_mode = param.compute_mode; |||||
| TensorLayout filter_transform_layout; | TensorLayout filter_transform_layout; | ||||
| winograd_preprocess_opr->deduce_layout(tensors[1].layout, | winograd_preprocess_opr->deduce_layout(tensors[1].layout, | ||||
| filter_transform_layout); | filter_transform_layout); | ||||
| @@ -1004,7 +1101,12 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||||
| auto conv_bias_opr = handle->create_operator<ConvBias>(); | auto conv_bias_opr = handle->create_operator<ConvBias>(); | ||||
| conv_bias_opr->param() = param; | conv_bias_opr->param() = param; | ||||
| conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD; | |||||
| if (param.format == param::ConvBias::Format::NCHW) { | |||||
| conv_bias_opr->param().format = param::ConvBias::Format::NCHW_WINOGRAD; | |||||
| } else { | |||||
| conv_bias_opr->param().format = | |||||
| param::ConvBias::Format::NCHW44_WINOGRAD; | |||||
| } | |||||
| conv_bias_opr->param().output_block_size = m; | conv_bias_opr->param().output_block_size = m; | ||||
| size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes( | size_t conv_bias_workspace_in_bytes = conv_bias_opr->get_workspace_in_bytes( | ||||
| tensors[0].layout, filter_transform_layout, tensors[2].layout, | tensors[0].layout, filter_transform_layout, tensors[2].layout, | ||||
| @@ -1021,7 +1123,6 @@ void winograd_algo_extra_impl(const TensorNDArray& tensors, uint32_t m, | |||||
| wb.get_workspace(2)); | wb.get_workspace(2)); | ||||
| conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2], | ||||
| tensors[3], tensors[4], nullptr, wb.get_workspace(1)); | tensors[3], tensors[4], nullptr, wb.get_workspace(1)); | ||||
| free(wb.ptr()); | free(wb.ptr()); | ||||
| }; | }; | ||||
| @@ -36,7 +36,7 @@ std::vector<TestArg> get_chanwise_args(); | |||||
| std::vector<TestArg> get_winograd_args(size_t kernel_size); | std::vector<TestArg> get_winograd_args(size_t kernel_size); | ||||
| std::vector<TestArg> get_winograd_mk_packed_args(size_t pack_size = 4); | std::vector<TestArg> get_winograd_mk_packed_args(size_t pack_size = 4); | ||||
| std::vector<TestArg> get_quantized_winograd_mk_packed_args( | std::vector<TestArg> get_quantized_winograd_mk_packed_args( | ||||
| size_t pack_size = 4); | |||||
| size_t pack_size = 4, bool compute_float32 = false); | |||||
| std::vector<TestArg> get_quantized_args_with_nlmode( | std::vector<TestArg> get_quantized_args_with_nlmode( | ||||
| param::ConvBias::NonlineMode nlmode); | param::ConvBias::NonlineMode nlmode); | ||||
| std::vector<TestArg> get_quantized_args(); | std::vector<TestArg> get_quantized_args(); | ||||
| @@ -55,6 +55,10 @@ std::vector<TestArg> get_int8_chwn4_args_small_batch(size_t kernel_size); | |||||
| std::vector<TestArg> get_int8_nchw4_tensorcore_args(size_t kernel_size); | std::vector<TestArg> get_int8_nchw4_tensorcore_args(size_t kernel_size); | ||||
| std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size); | std::vector<TestArg> get_int8_chwn4_tensorcore_args(size_t kernel_size); | ||||
| std::vector<TestArg> get_int8_nchw44_args(size_t kernel_size, size_t pack_size, | |||||
| bool compute_float32 = false, | |||||
| bool group_mode = false); | |||||
| template <typename Opr> | template <typename Opr> | ||||
| using ConvBiasAlgoChecker = AlgoChecker<Opr>; | using ConvBiasAlgoChecker = AlgoChecker<Opr>; | ||||
| @@ -102,12 +102,28 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||||
| opr::ConvBiasForward::get_matmul_format(winograd_param); | opr::ConvBiasForward::get_matmul_format(winograd_param); | ||||
| winograd_preprocess_param.output_block_size = | winograd_preprocess_param.output_block_size = | ||||
| winograd_param.output_block_size; | winograd_param.output_block_size; | ||||
| size_t pack_c_size = 1; | |||||
| if (new_inp[0]->shape().ndim == 5) { | |||||
| pack_c_size = new_inp[0]->layout().shape[4]; | |||||
| } | |||||
| if (conv_bias_opr.input(0)->dtype().enumv() == | |||||
| DTypeEnum::QuantizedS8 && | |||||
| pack_c_size == 4 && | |||||
| winograd_preprocess_param.format == | |||||
| megdnn::param::MatrixMul::Format::MK4) { | |||||
| winograd_preprocess_param.compute_mode = | |||||
| megdnn::param::ConvBias::ComputeMode::FLOAT32; | |||||
| } | |||||
| auto winograd_preprocess_opr = opr::WinogradFilterPreprocess::make( | auto winograd_preprocess_opr = opr::WinogradFilterPreprocess::make( | ||||
| new_inp[1], winograd_preprocess_param); | new_inp[1], winograd_preprocess_param); | ||||
| mgb_assert(inputs.size() == 2 || inputs.size() == 3, | mgb_assert(inputs.size() == 2 || inputs.size() == 3, | ||||
| "input size need to be 2/3, but got: %zu", | "input size need to be 2/3, but got: %zu", | ||||
| inputs.size()); | inputs.size()); | ||||
| SymbolVar new_conv_bias_opr; | SymbolVar new_conv_bias_opr; | ||||
| auto conv_bias_param = conv_bias_opr.param(); | auto conv_bias_param = conv_bias_opr.param(); | ||||
| if (new_inp[0]->shape().ndim == 4) { | if (new_inp[0]->shape().ndim == 4) { | ||||
| conv_bias_param.format = | conv_bias_param.format = | ||||
| @@ -126,6 +142,7 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||||
| algo_name.c_str()); | algo_name.c_str()); | ||||
| } | } | ||||
| } | } | ||||
| conv_bias_param.output_block_size = | conv_bias_param.output_block_size = | ||||
| winograd_param.output_block_size; | winograd_param.output_block_size; | ||||
| if (inputs.size() == 2) { | if (inputs.size() == 2) { | ||||
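The graph-pass condition above reduces to one predicate; a sketch (hypothetical helper) of when the pass forces the float compute mode:

    static bool force_f32_compute(DTypeEnum src_dtype, size_t pack_c_size,
                                  megdnn::param::MatrixMul::Format fmt) {
        //! int8 NCHW44 winograd lowered through an MK4 matmul computes in float
        return src_dtype == DTypeEnum::QuantizedS8 && pack_c_size == 4 &&
               fmt == megdnn::param::MatrixMul::Format::MK4;
    }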
| @@ -1541,8 +1541,9 @@ void ConvBiasForward::check_winograd_param_valid( | |||||
| dtype.enumv() == DTypeEnum::QuantizedS8 || | dtype.enumv() == DTypeEnum::QuantizedS8 || | ||||
| dtype.enumv() == DTypeEnum::Quantized8Asymm) && | dtype.enumv() == DTypeEnum::Quantized8Asymm) && | ||||
| (param.channel_block_size == 1 || | (param.channel_block_size == 1 || | ||||
| param.channel_block_size == 4 || | |||||
| param.channel_block_size == 8), | param.channel_block_size == 8), | ||||
| "only support 1/8 for the channel_block_size of " | |||||
| "only support 1/4/8 for the channel_block_size of " | |||||
| "winograd param, got %u", | "winograd param, got %u", | ||||
| param.channel_block_size); | param.channel_block_size); | ||||
| } | } | ||||