| @@ -1,4 +1,3 @@ | |||
| #pragma once | |||
| /** | |||
| * \file dnn/src/arm_common/conv_bias/intrinsic_helper.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| @@ -10,7 +9,9 @@ | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "src/arm_common/conv_bias/neon_struct.h" | |||
| #pragma once | |||
| #include "src/arm_common/intrinsic_helper.h" | |||
| #include "src/arm_common/neon_struct.h" | |||
| #include "src/arm_common/simd_macro/marm_neon.h" | |||
| #include "src/common/unroll_macro.h" | |||
| #include "src/fallback/conv_bias/common.h" | |||
| @@ -689,185 +690,8 @@ inline void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) { | |||
| InitOcxOw4<c_dim, bias_mode, T>::impl(c, bias_ptr, oc_step); | |||
| } | |||
| /////////////////////////////////////// | |||
| template <int weight_number, int base_offset, int ptr_step, int oc_block, | |||
| typename Func, typename T, typename T2, typename... XT> | |||
| struct LoadHelper { | |||
| static void impl(T& weight, T2 ptr, int oc_offset, XT... args); | |||
| }; | |||
| #define WEIGHT_CB(step) \ | |||
| src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...); | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<1, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(1, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<2, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(2, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<3, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(3, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<4, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(4, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<5, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(5, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<6, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(6, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<7, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(7, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2, | |||
| typename... XT> | |||
| struct LoadHelper<8, base_offset, ptr_step, 0, Func, T, T2, XT...> { | |||
| static void impl(T& src, T2 ptr, int, XT... args) { | |||
| UNROLL_CALL_RAW(8, WEIGHT_CB); | |||
| } | |||
| }; | |||
| #undef WEIGHT_CB | |||
| #define WEIGHT_CB(step) \ | |||
| src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<1, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(1, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<2, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(2, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<3, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(3, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<4, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(4, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<5, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(5, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<6, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(6, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<7, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(7, WEIGHT_CB); } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<8, base_offset, ptr_step, 1, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int) { UNROLL_CALL_RAW(8, WEIGHT_CB); } | |||
| }; | |||
| #undef WEIGHT_CB | |||
| #define WEIGHT_CB(step) \ | |||
| src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \ | |||
| src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset); | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<1, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(1, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<2, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(2, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<3, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(3, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<4, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(4, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<5, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(5, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<6, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(6, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<7, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(7, WEIGHT_CB); | |||
| } | |||
| }; | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> | |||
| struct LoadHelper<8, base_offset, ptr_step, 2, Func, T, T2> { | |||
| static void impl(T& src, T2 ptr, int oc_offset) { | |||
| UNROLL_CALL_RAW(8, WEIGHT_CB); | |||
| } | |||
| }; | |||
| #undef WEIGHT_CB | |||
| template <int weight_number, int base_offset, int ptr_step, int c_dim, | |||
| typename Func, typename T, typename T2> | |||
| inline void load_helper(T& weight, T2 ptr, int oc_offset) { | |||
| LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl( | |||
| weight, ptr, oc_offset); | |||
| } | |||
| template <int weight_number, int base_offset, int ptr_step, int c_dim, | |||
| typename Func, typename T, typename T2, typename... XT> | |||
| inline void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) { | |||
| LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2, | |||
| XT...>::impl(weight, ptr, oc_offset, args...); | |||
| } | |||
| } // namespace | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,126 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/intrinsic_helper.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "src/arm_common/neon_struct.h" | |||
| #include "src/arm_common/simd_macro/marm_neon.h" | |||
| #include "src/common/unroll_macro.h" | |||
| namespace megdnn { | |||
| namespace { | |||
| template <int weight_number, int base_offset, int ptr_step, int oc_block, | |||
| typename Func, typename T, typename T2, typename... XT> | |||
| struct LoadHelper { | |||
| static void impl(T& weight, T2 ptr, int oc_offset, XT... args); | |||
| }; | |||
| #define WEIGHT_CB(step) \ | |||
| src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...); | |||
| #define LOAD_HELPER(step) \ | |||
| template <int base_offset, int ptr_step, typename Func, typename T, \ | |||
| typename T2, typename... XT> \ | |||
| struct LoadHelper<step, base_offset, ptr_step, 0, Func, T, T2, XT...> { \ | |||
| static void impl(T& src, T2 ptr, int, XT... args) { \ | |||
| UNROLL_CALL_RAW(step, WEIGHT_CB); \ | |||
| } \ | |||
| } | |||
| LOAD_HELPER(1); | |||
| LOAD_HELPER(2); | |||
| LOAD_HELPER(3); | |||
| LOAD_HELPER(4); | |||
| LOAD_HELPER(5); | |||
| LOAD_HELPER(6); | |||
| LOAD_HELPER(7); | |||
| LOAD_HELPER(8); | |||
| LOAD_HELPER(9); | |||
| LOAD_HELPER(10); | |||
| LOAD_HELPER(11); | |||
| LOAD_HELPER(12); | |||
| LOAD_HELPER(13); | |||
| LOAD_HELPER(14); | |||
| LOAD_HELPER(15); | |||
| LOAD_HELPER(16); | |||
| #undef LOAD_HELPER | |||
| #undef WEIGHT_CB | |||
| ///////////////////////////c_dim = 1///////////////////////// | |||
| #define WEIGHT_CB(step) \ | |||
| src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); | |||
| #define LOAD_HELPER(step) \ | |||
| template <int base_offset, int ptr_step, typename Func, typename T, \ | |||
| typename T2> \ | |||
| struct LoadHelper<step, base_offset, ptr_step, 1, Func, T, T2> { \ | |||
| static void impl(T& src, T2 ptr, int) { \ | |||
| UNROLL_CALL_RAW(step, WEIGHT_CB); \ | |||
| } \ | |||
| } | |||
| LOAD_HELPER(1); | |||
| LOAD_HELPER(2); | |||
| LOAD_HELPER(3); | |||
| LOAD_HELPER(4); | |||
| LOAD_HELPER(5); | |||
| LOAD_HELPER(6); | |||
| LOAD_HELPER(7); | |||
| LOAD_HELPER(8); | |||
| LOAD_HELPER(9); | |||
| #undef LOAD_HELPER | |||
| #undef WEIGHT_CB | |||
| /////////////////////////c_dim = 2/////////////////////////////// | |||
| #define WEIGHT_CB(step) \ | |||
| src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \ | |||
| src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset); | |||
| #define LOAD_HELPER(step) \ | |||
| template <int base_offset, int ptr_step, typename Func, typename T, \ | |||
| typename T2> \ | |||
| struct LoadHelper<step, base_offset, ptr_step, 2, Func, T, T2> { \ | |||
| static void impl(T& src, T2 ptr, int oc_offset) { \ | |||
| UNROLL_CALL_RAW(step, WEIGHT_CB); \ | |||
| } \ | |||
| } | |||
| LOAD_HELPER(1); | |||
| LOAD_HELPER(2); | |||
| LOAD_HELPER(3); | |||
| LOAD_HELPER(4); | |||
| LOAD_HELPER(5); | |||
| LOAD_HELPER(6); | |||
| LOAD_HELPER(7); | |||
| LOAD_HELPER(8); | |||
| #undef LOAD_HELPER | |||
| #undef WEIGHT_CB | |||
| template <int weight_number, int base_offset, int ptr_step, int c_dim, | |||
| typename Func, typename T, typename T2> | |||
| inline void load_helper(T& weight, T2 ptr, int oc_offset) { | |||
| LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>::impl( | |||
| weight, ptr, oc_offset); | |||
| } | |||
| template <int weight_number, int base_offset, int ptr_step, int c_dim, | |||
| typename Func, typename T, typename T2, typename... XT> | |||
| inline void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) { | |||
| LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2, | |||
| XT...>::impl(weight, ptr, oc_offset, args...); | |||
| } | |||
| } // namespace | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -1,6 +1,5 @@ | |||
| #pragma once | |||
| /** | |||
| * \file dnn/src/arm_common/conv_bias/neon_struct.h | |||
| * \file dnn/src/arm_common/neon_struct.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| @@ -10,6 +9,7 @@ | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "src/arm_common/simd_macro/marm_neon.h" | |||
| namespace megdnn { | |||
| namespace { | |||
| @@ -62,4 +62,6 @@ struct Vfmaq_laneq_f32 { | |||
| }; | |||
| } // namespace | |||
| } // namespace megdnn | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -114,7 +114,13 @@ public: | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| void exec(const PoolingKernParam& param) const override; | |||
| }; | |||
| class PoolingImpl::AlgoFp32ModexStridexNCHW44 final : public AlgoBase { | |||
| public: | |||
| bool is_reproducible() const override { return true; } | |||
| const char* name() const override { return "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44"; } | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| void exec(const PoolingKernParam& param) const override; | |||
| }; | |||
| WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param); | |||
| WorkspaceBundle get_bundle_nchw44( | |||
| @@ -0,0 +1,126 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/arm_common/pooling/algo.h" | |||
| #include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_arm_common_fp32_pooling_nchw44) | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| uint32_t sh = param.stride[0]; | |||
| uint32_t sw = param.stride[1]; | |||
| uint32_t fh = param.filter[0]; | |||
| uint32_t fw = param.filter[1]; | |||
| bool avaible = param.src_type.enumv() == DTypeEnum::Float32 && | |||
| param.format == Param::Format::NCHW44 && | |||
| (param.mode == Mode::MAX || param.mode == Mode::AVERAGE) && | |||
| fh == fw && sh == sw && | |||
| (fh == 2 || fh == 3 || fh == 4 || fh == 5) && | |||
| (sh == 1 || sh == 2); | |||
| return avaible; | |||
| } | |||
| void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( | |||
| const PoolingKernParam& param) const { | |||
| int ih = param.isz[0]; | |||
| int iw = param.isz[1]; | |||
| int oh = param.osz[0]; | |||
| int ow = param.osz[1]; | |||
| int n = param.n; | |||
| int ic = param.ic; | |||
| int ph = param.padding[0]; | |||
| int pw = param.padding[1]; | |||
| int sh = param.stride[0]; | |||
| int fh = param.filter[0]; | |||
| void* src_ptr = param.src_ptr; | |||
| void* dst_ptr = param.dst_ptr; | |||
| #define DISPATCH_FUNC(filter, stride, mode) \ | |||
| MIDOUT_BEGIN(megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \ | |||
| midout_iv(#filter #stride #mode##_hash)) { \ | |||
| auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, \ | |||
| size_t) { \ | |||
| const int c_idx = index; \ | |||
| pooling_fp32_nchw44<filter, stride, mode>( \ | |||
| static_cast<const float*>(src_ptr) + c_idx * ih * iw * 4, \ | |||
| static_cast<float*>(dst_ptr) + c_idx * oh * ow * 4, ih, \ | |||
| iw, oh, ow, ph, pw); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), \ | |||
| n* ic, run); \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| #define DISPATCH_MODE(filter, stride) \ | |||
| switch (param.mode) { \ | |||
| case PoolingBase::Mode::MAX: \ | |||
| DISPATCH_FUNC(filter, stride, PoolingBase::Mode::MAX); \ | |||
| break; \ | |||
| case PoolingBase::Mode::AVERAGE: \ | |||
| DISPATCH_FUNC(filter, stride, PoolingBase::Mode::AVERAGE); \ | |||
| break; \ | |||
| default: \ | |||
| megdnn_assert(0, "invalid mode %u", \ | |||
| static_cast<uint32_t>(param.mode)); \ | |||
| } | |||
| #define DISPATCH_STRIDE(filter) \ | |||
| switch (sh) { \ | |||
| case 1: \ | |||
| DISPATCH_MODE(filter, 1); \ | |||
| break; \ | |||
| case 2: \ | |||
| DISPATCH_MODE(filter, 2); \ | |||
| break; \ | |||
| default: \ | |||
| megdnn_assert(0, "invalid stride %d", sh); \ | |||
| } | |||
| #define DISPATCH_FILTER() \ | |||
| switch (fh) { \ | |||
| case 2: \ | |||
| DISPATCH_STRIDE(2); \ | |||
| break; \ | |||
| case 3: \ | |||
| DISPATCH_STRIDE(3); \ | |||
| break; \ | |||
| case 4: \ | |||
| DISPATCH_STRIDE(4); \ | |||
| break; \ | |||
| case 5: \ | |||
| DISPATCH_STRIDE(5); \ | |||
| break; \ | |||
| default: \ | |||
| megdnn_assert(0, "invalid filter %d", fh); \ | |||
| } | |||
| DISPATCH_FILTER() | |||
| #undef DISPATCH_FILTER | |||
| #undef DISPATCH_STRIDE | |||
| #undef DISPATCH_MODE | |||
| #undef DISPATCH_FUNC | |||
| } | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,308 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/pooling/kern_fp32_pooling_nchw44.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include <limits> | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/arm_common/intrinsic_helper.h" | |||
| #include "src/arm_common/neon_struct.h" | |||
| #include "src/arm_common/simd_macro/marm_neon.h" | |||
| #include "src/common/unroll_macro.h" | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| namespace { | |||
| template <int filter, int stride, int ow_step, PoolingBase::Mode mode, | |||
| typename T1, typename T2> | |||
| struct CalXsXNchw44 { | |||
| static void impl(T1 result, T2 src); | |||
| }; | |||
| template <int filter, int stride, int ow_step, PoolingBase::Mode mode, | |||
| typename T1, typename T2> | |||
| void calculate_xsx_nchw44(T1 result, T2 src) { | |||
| CalXsXNchw44<filter, stride, ow_step, mode, T1, T2>::impl(result, src); | |||
| }; | |||
| #define CALCULATE_MAX_CB(step) \ | |||
| result[0] = vmaxq_f32(result[0], src[0 * stride + step]); \ | |||
| result[1] = vmaxq_f32(result[1], src[1 * stride + step]); \ | |||
| result[2] = vmaxq_f32(result[2], src[2 * stride + step]); \ | |||
| result[3] = vmaxq_f32(result[3], src[3 * stride + step]); | |||
| #define CALCULATE_AVG_CB(step) \ | |||
| result[0] = vaddq_f32(result[0], src[0 * stride + step]); \ | |||
| result[1] = vaddq_f32(result[1], src[1 * stride + step]); \ | |||
| result[2] = vaddq_f32(result[2], src[2 * stride + step]); \ | |||
| result[3] = vaddq_f32(result[3], src[3 * stride + step]); | |||
| #define INSTANCE_CAL(filter) \ | |||
| template <int stride, typename T1, typename T2> \ | |||
| struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::MAX, T1, T2> { \ | |||
| static void impl(T1 result, T2 src) { \ | |||
| UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB); \ | |||
| } \ | |||
| }; \ | |||
| template <int stride, typename T1, typename T2> \ | |||
| struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::AVERAGE, T1, \ | |||
| T2> { \ | |||
| static void impl(T1 result, T2 src) { \ | |||
| UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB); \ | |||
| } \ | |||
| }; | |||
| INSTANCE_CAL(2) | |||
| INSTANCE_CAL(3) | |||
| INSTANCE_CAL(4) | |||
| INSTANCE_CAL(5) | |||
| #undef INSTANCE_CAL | |||
| #undef CALCULATE_AVG_CB | |||
| #undef CALCULATE_MAX_CB | |||
| template <int filter, int stride, int ow_step, PoolingBase::Mode mode> | |||
| struct KerPoolingFilterXStrideXNchw44 { | |||
| static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw); | |||
| }; | |||
| template <int filter, int stride, int ow_step> | |||
| struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, | |||
| PoolingBase::Mode::MAX> { | |||
| static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) { | |||
| constexpr int src_reg_size = ow_step * stride + filter - stride; | |||
| constexpr int packed_ic = 4; | |||
| constexpr int simd_len = 4; | |||
| constexpr float default_float = std::numeric_limits<float>::lowest(); | |||
| float32x4_t result[ow_step]; | |||
| float32x4_t src[src_reg_size]; | |||
| result[0] = vdupq_n_f32(default_float); | |||
| result[1] = vdupq_n_f32(default_float); | |||
| result[2] = vdupq_n_f32(default_float); | |||
| result[3] = vdupq_n_f32(default_float); | |||
| for (int fh_idx = 0; fh_idx < filter; ++fh_idx) { | |||
| load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>( | |||
| src, src_ptr + fh_idx * iw * packed_ic, 0); | |||
| calculate_xsx_nchw44<filter, stride, ow_step, | |||
| PoolingBase::Mode::MAX>(result, src); | |||
| } | |||
| vst1q_f32(dst_ptr + 0 * packed_ic, result[0]); | |||
| vst1q_f32(dst_ptr + 1 * packed_ic, result[1]); | |||
| vst1q_f32(dst_ptr + 2 * packed_ic, result[2]); | |||
| vst1q_f32(dst_ptr + 3 * packed_ic, result[3]); | |||
| } | |||
| }; | |||
| template <int filter, int stride, int ow_step> | |||
| struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, | |||
| PoolingBase::Mode::AVERAGE> { | |||
| static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) { | |||
| constexpr int src_reg_size = ow_step * stride + filter - stride; | |||
| constexpr int packed_ic = 4; | |||
| constexpr int simd_len = 4; | |||
| constexpr float default_float = 0; | |||
| constexpr float div_filter_size = 1.f / (filter * filter); | |||
| const float32x4_t div_filter_size_vec = vdupq_n_f32(div_filter_size); | |||
| float32x4_t result[ow_step]; | |||
| float32x4_t src[src_reg_size]; | |||
| result[0] = vdupq_n_f32(default_float); | |||
| result[1] = vdupq_n_f32(default_float); | |||
| result[2] = vdupq_n_f32(default_float); | |||
| result[3] = vdupq_n_f32(default_float); | |||
| for (int fh_idx = 0; fh_idx < filter; ++fh_idx) { | |||
| load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>( | |||
| src, src_ptr + fh_idx * iw * packed_ic, 0); | |||
| calculate_xsx_nchw44<filter, stride, ow_step, | |||
| PoolingBase::Mode::AVERAGE>(result, src); | |||
| } | |||
| result[0] = vmulq_f32(result[0], div_filter_size_vec); | |||
| result[1] = vmulq_f32(result[1], div_filter_size_vec); | |||
| result[2] = vmulq_f32(result[2], div_filter_size_vec); | |||
| result[3] = vmulq_f32(result[3], div_filter_size_vec); | |||
| vst1q_f32(dst_ptr + 0 * packed_ic, result[0]); | |||
| vst1q_f32(dst_ptr + 1 * packed_ic, result[1]); | |||
| vst1q_f32(dst_ptr + 2 * packed_ic, result[2]); | |||
| vst1q_f32(dst_ptr + 3 * packed_ic, result[3]); | |||
| } | |||
| }; | |||
| template <PoolingBase::Mode mode> | |||
| void ker_pooling_nchw44_remain_pad(const float32_t* src_ptr, float32_t* dst_ptr, | |||
| const int iw, const int pad_top, | |||
| const int pad_bottom, const int pad_left, | |||
| const int pad_right, const int filter); | |||
| template <> | |||
| void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::MAX>( | |||
| const float32_t* src_ptr, float32_t* dst_ptr, const int iw, | |||
| const int pad_top, const int pad_bottom, const int pad_left, | |||
| const int pad_right, const int filter) { | |||
| constexpr int ic_step = 4; | |||
| const int ih_end = filter - pad_bottom; | |||
| const int iw_end = filter - pad_right; | |||
| float32x4_t result = vdupq_n_f32(std::numeric_limits<float>::lowest()); | |||
| for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) { | |||
| for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) { | |||
| float32x4_t src = | |||
| vld1q_f32(src_ptr + (iw_idx - pad_left) * ic_step); | |||
| result = vmaxq_f32(result, src); | |||
| } | |||
| src_ptr += iw * ic_step; | |||
| } | |||
| vst1q_f32(dst_ptr, result); | |||
| } | |||
| template <> | |||
| void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::AVERAGE>( | |||
| const float32_t* src_ptr, float32_t* dst_ptr, const int iw, | |||
| const int pad_top, const int pad_bottom, const int pad_left, | |||
| const int pad_right, const int filter) { | |||
| constexpr int ic_step = 4; | |||
| const int ih_end = filter - pad_bottom; | |||
| const int iw_end = filter - pad_right; | |||
| const float div_filter_size = 1.f / (filter * filter); | |||
| const float32x4_t div_filter_size_vec = vdupq_n_f32(div_filter_size); | |||
| float32x4_t result = vdupq_n_f32(0.f); | |||
| for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) { | |||
| for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) { | |||
| float32x4_t src = | |||
| vld1q_f32(src_ptr + (iw_idx - pad_left) * ic_step); | |||
| result = vaddq_f32(result, src); | |||
| } | |||
| src_ptr += iw * ic_step; | |||
| } | |||
| result = vmulq_f32(result, div_filter_size_vec); | |||
| vst1q_f32(dst_ptr, result); | |||
| } | |||
| template <PoolingBase::Mode mode> | |||
| static inline void kern_pooling_with_pad_nchw44( | |||
| const float32_t* src, float32_t* dst, const int filter, | |||
| const int ow_start, const int ow_end, const int iw, const int ow, | |||
| const int stride_w, const int pw, const int real_ih_idx, | |||
| const int oh_idx, const int pad_top, const int pad_bottom) { | |||
| constexpr int ic_step = 4; | |||
| constexpr int oc_step = 4; | |||
| for (int ow_idx = ow_start; ow_idx < ow_end; ++ow_idx) { | |||
| const int iw_idx = ow_idx * stride_w; | |||
| const int real_iw_idx = std::max(iw_idx - pw, 0); | |||
| const int pad_left = std::max(0, pw - iw_idx); | |||
| const int pad_right = std::max(0, iw_idx - pw + filter - iw); | |||
| const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step; | |||
| const int dst_offset = (oh_idx * ow + ow_idx) * oc_step; | |||
| ker_pooling_nchw44_remain_pad<mode>(src + src_offset, dst + dst_offset, | |||
| iw, pad_top, pad_bottom, pad_left, | |||
| pad_right, filter); | |||
| } | |||
| } | |||
| template <int filter, int stride, PoolingBase::Mode mode> | |||
| static inline void pooling_fp32_nchw44_pad(const float32_t* src, float32_t* dst, | |||
| int ih, int iw, int oh, int ow, | |||
| int ph, int pw) { | |||
| constexpr int stride_h = stride; | |||
| constexpr int stride_w = stride; | |||
| constexpr int ic_step = 4; | |||
| constexpr int oc_step = 4; | |||
| constexpr int ow_step = 4; | |||
| const int ow_pad_left_end = div_ceil(pw, stride_w); | |||
| const int ow_pad_right_end = (iw - filter + pw - 1) / stride_w; | |||
| const int ow_pad_right_step_end = | |||
| (ow_pad_right_end - ow_pad_left_end) / ow_step * ow_step + | |||
| ow_pad_left_end; | |||
| rep(oh_idx, oh) { | |||
| const int ih_idx = oh_idx * stride_h; | |||
| const int real_ih_idx = std::max(ih_idx - ph, 0); | |||
| const int pad_top = std::max(0, ph - ih_idx); | |||
| const int pad_bottom = std::max(0, ih_idx - ph + filter - ih); | |||
| if (pad_top > 0 || pad_bottom > 0) { | |||
| kern_pooling_with_pad_nchw44<mode>(src, dst, filter, 0, ow, iw, ow, | |||
| stride_w, pw, real_ih_idx, | |||
| oh_idx, pad_top, pad_bottom); | |||
| } else { | |||
| kern_pooling_with_pad_nchw44<mode>( | |||
| src, dst, filter, 0, ow_pad_left_end, iw, ow, stride_w, pw, | |||
| real_ih_idx, oh_idx, pad_top, pad_bottom); | |||
| for (int ow_idx = ow_pad_left_end; ow_idx < ow_pad_right_step_end; | |||
| ow_idx += ow_step) { | |||
| const int iw_idx = ow_idx * stride_w; | |||
| const int real_iw_idx = std::max(iw_idx - pw, 0); | |||
| const int src_offset = | |||
| (real_ih_idx * iw + real_iw_idx) * ic_step; | |||
| const int dst_offset = (oh_idx * ow + ow_idx) * oc_step; | |||
| KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, | |||
| mode>::impl(src + src_offset, | |||
| dst + dst_offset, | |||
| iw); | |||
| } | |||
| kern_pooling_with_pad_nchw44<mode>( | |||
| src, dst, filter, ow_pad_right_step_end, ow, iw, ow, | |||
| stride_w, pw, real_ih_idx, oh_idx, pad_top, pad_bottom); | |||
| } | |||
| } | |||
| } | |||
| template <int filter, int stride, PoolingBase::Mode mode> | |||
| static inline void pooling_fp32_nchw44_no_pad(const float32_t* src, | |||
| float32_t* dst, int, int iw, | |||
| int oh, int ow) { | |||
| constexpr int stride_h = stride; | |||
| constexpr int stride_w = stride; | |||
| constexpr int ic_step = 4; | |||
| constexpr int oc_step = 4; | |||
| constexpr int ow_step = 4; | |||
| const int ow_end = ow / ow_step * ow_step; | |||
| const int ow_remain = ow - ow_end; | |||
| rep(oh_idx, oh) { | |||
| const int ih_idx = oh_idx * stride_h; | |||
| const int src_ih_offset = ih_idx * iw; | |||
| const int dst_oh_offset = oh_idx * ow; | |||
| for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { | |||
| const int iw_idx = ow_idx * stride_w; | |||
| const int src_offset = (src_ih_offset + iw_idx) * ic_step; | |||
| const int dst_offset = (dst_oh_offset + ow_idx) * oc_step; | |||
| KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl( | |||
| src + src_offset, dst + dst_offset, iw); | |||
| } | |||
| if (ow_remain > 0) { | |||
| kern_pooling_with_pad_nchw44<mode>(src, dst, filter, ow_end, ow, iw, | |||
| ow, stride_w, 0, ih_idx, oh_idx, | |||
| 0, 0); | |||
| } | |||
| } | |||
| } | |||
| template <int filter, int stride, PoolingBase::Mode mode> | |||
| static inline void pooling_fp32_nchw44(const float32_t* src, float32_t* dst, | |||
| int ih, int iw, int oh, int ow, int ph, | |||
| int pw) { | |||
| if (ph > 0 || pw > 0) { | |||
| pooling_fp32_nchw44_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow, | |||
| ph, pw); | |||
| } else { | |||
| pooling_fp32_nchw44_no_pad<filter, stride, mode>(src, dst, ih, iw, oh, | |||
| ow); | |||
| } | |||
| } | |||
| } // namespace | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -29,6 +29,7 @@ class PoolingImpl::AlgoPack : NonCopyableObj { | |||
| AlgoFilter3ModexStridexNCHW44 algo_filter3_modex_stridex_nchw4; | |||
| AlgoFilter4ModexStridexNCHW44 algo_filter4_modex_stridex_nchw4; | |||
| AlgoFilter5ModexStridexNCHW44 algo_filter5_modex_stridex_nchw4; | |||
| AlgoFp32ModexStridexNCHW44 algo_fp32_modex_stridex_nchw44; | |||
| public: | |||
| AlgoPack() { | |||
| @@ -44,6 +45,7 @@ public: | |||
| all_algos.emplace_back(&algo_filter2_modex_stridex_nchw4); | |||
| all_algos.emplace_back(&algo_filter4_modex_stridex_nchw4); | |||
| all_algos.emplace_back(&algo_filter5_modex_stridex_nchw4); | |||
| all_algos.emplace_back(&algo_fp32_modex_stridex_nchw44); | |||
| } | |||
| SmallVector<AlgoBase*> all_algos; | |||
| }; | |||
| @@ -87,10 +87,10 @@ private: | |||
| class AlgoFilter3ModexStridexNCHW44; | |||
| class AlgoFilter4ModexStridexNCHW44; | |||
| class AlgoFilter5ModexStridexNCHW44; | |||
| class AlgoFp32ModexStridexNCHW44; | |||
| class AlgoPack; | |||
| }; | |||
| } // namespace arm_common | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -6,7 +6,8 @@ | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| @@ -32,10 +33,33 @@ | |||
| #define UNROLL_RAW9(cb, v0, a...) \ | |||
| UNROLL_RAW8(cb, v0, ##a) \ | |||
| cb(8, ##a) | |||
| #define UNROLL_RAW10(cb, v0, a...) \ | |||
| UNROLL_RAW9(cb, v0, ##a) \ | |||
| cb(9, ##a) | |||
| #define UNROLL_RAW11(cb, v0, a...) \ | |||
| UNROLL_RAW10(cb, v0, ##a) \ | |||
| cb(10, ##a) | |||
| #define UNROLL_RAW12(cb, v0, a...) \ | |||
| UNROLL_RAW11(cb, v0, ##a) \ | |||
| cb(11, ##a) | |||
| #define UNROLL_RAW13(cb, v0, a...) \ | |||
| UNROLL_RAW12(cb, v0, ##a) \ | |||
| cb(12, ##a) | |||
| #define UNROLL_RAW14(cb, v0, a...) \ | |||
| UNROLL_RAW13(cb, v0, ##a) \ | |||
| cb(13, ##a) | |||
| #define UNROLL_RAW15(cb, v0, a...) \ | |||
| UNROLL_RAW14(cb, v0, ##a) \ | |||
| cb(14, ##a) | |||
| // clang-format off | |||
| #define UNROLL_RAW16(cb, v0, a...) \ | |||
| UNROLL_RAW8(cb, v0, ##a) \ | |||
| cb(8, ##a) cb(9, ##a) cb(10, ##a) cb(11, ##a) cb(12, ##a) cb(13, ##a) \ | |||
| cb(14, ##a) cb(15, ##a) | |||
| #define UNROLL_RAW17(cb, v0, a...) \ | |||
| UNROLL_RAW16(cb, v0, ##a) \ | |||
| cb(16, ##a) | |||
| #define UNROLL_RAW24(cb, v0, a...) \ | |||
| UNROLL_RAW16(cb, v0, ##a) \ | |||
| cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \ | |||
| @@ -130,4 +154,6 @@ | |||
| #define UNROLL_CALL_NOWRAPPER_D2(step, step2, cb) \ | |||
| UNROLL_CALL_RAW_D2(step, step2, cb) | |||
| // clang-format on | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -256,6 +256,73 @@ TEST_F(ARM_COMMON, POOLING_QUANTIZED) { | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| void benchmark_nchw44_fp32(Handle *handle) { | |||
| using Param = param::Pooling; | |||
| auto run = [&](size_t n, size_t c, size_t h, size_t w, size_t filter, | |||
| size_t stride, size_t pad, Param::Mode mode) { | |||
| Param param; | |||
| param.window_h = param.window_w = filter; | |||
| param.stride_h = param.stride_w = stride; | |||
| param.pad_h = param.pad_w = pad; | |||
| param.format = Param::Format::NCHW; | |||
| param.mode = mode; | |||
| TensorShape nchw_shape = {n, c, h, w}; | |||
| TensorShape nchw44_shape = {n, c / 4, h, w, 4}; | |||
| TensorLayout dst_layout; | |||
| auto opr = handle->create_operator<Pooling>(); | |||
| opr->param() = param; | |||
| opr->deduce_layout({nchw_shape, dtype::Float32()}, dst_layout); | |||
| float calc_amount = | |||
| dst_layout.total_nr_elems() * param.window_h * param.window_w; | |||
| Benchmarker<Pooling> benchmarker_float_nchw(handle); | |||
| Benchmarker<Pooling> benchmarker_float_nchw44(handle); | |||
| Benchmarker<Pooling> benchmarker_int_nchw44(handle); | |||
| size_t RUN = 500; | |||
| auto t1 = benchmarker_float_nchw.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec({nchw_shape, {}}); | |||
| param.format = Param::Format::NCHW44; | |||
| auto t2 = benchmarker_int_nchw44.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .execl({{nchw44_shape, dtype::QuantizedS8(1.0)}, | |||
| {{}, dtype::QuantizedS8(1.0)}}); | |||
| auto t3 = benchmarker_float_nchw44.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec({nchw44_shape, {}}); | |||
| printf("{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu\n" | |||
| "nchw_fp32={%.3f ms, %.3f Mflops}, " | |||
| "nchw44_int={%.3f ms, %.3f Mflops}, " | |||
| "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n", | |||
| n, c, h, w, filter, stride, pad, t1 / RUN, | |||
| calc_amount / (t1 / RUN * 1000), t2 / RUN, | |||
| calc_amount / (t2 / RUN * 1000), t3 / RUN, | |||
| calc_amount / (t3 / RUN * 1000), t1 / t3); | |||
| }; | |||
| // Resnet50 | |||
| run(1, 64, 112, 112, 3, 2, 1, param::Pooling::Mode::MAX); | |||
| run(1, 2048, 7, 7, 7, 1, 0, param::Pooling::Mode::AVERAGE); | |||
| // VGG16 | |||
| run(1, 64, 224, 224, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 128, 112, 112, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 256, 56, 56, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 512, 28, 28, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 512, 14, 14, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| } | |||
| TEST_F(ARM_COMMON, BENCHMARK_POOLING_NCHW44_FP32) { benchmark_nchw44_fp32(handle()); } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_POOLING_NCHW44_FP32) { | |||
| benchmark_nchw44_fp32(handle()); | |||
| } | |||
| TEST_F(ARM_COMMON, BENCHMARK_POOLING_INT8_W3x3_S2x2) | |||
| { | |||
| using Param = param::Pooling; | |||
| @@ -57,6 +57,65 @@ TEST_F(ARM_COMMON_MULTI_THREADS, POOLING) { | |||
| } | |||
| } | |||
| std::vector<std::pair<param::Pooling, TensorShapeArray>> get_nchw44_pool_args( | |||
| size_t filter, size_t stride) { | |||
| constexpr size_t ic_step = 4; | |||
| std::vector<std::pair<param::Pooling, TensorShapeArray>> args; | |||
| for (size_t n : {1, 2}) | |||
| for (size_t c : {4, 8}) | |||
| for (size_t ih : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}) | |||
| for (size_t iw : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}) | |||
| for (size_t ph : {0, 1, 2}) | |||
| for (size_t pw : {0, 1, 2}) | |||
| for (auto mode : {param::Pooling::Mode::MAX, | |||
| param::Pooling::Mode::AVERAGE}) | |||
| if (ih + 2 * ph >= filter && | |||
| iw + 2 * pw >= filter && filter > ph && | |||
| filter > pw) { | |||
| param::Pooling param; | |||
| param.mode = mode; | |||
| param.format = | |||
| param::Pooling::Format::NCHW44; | |||
| param.pad_h = ph; | |||
| param.pad_w = pw; | |||
| param.stride_h = param.stride_w = stride; | |||
| param.window_h = param.window_w = filter; | |||
| args.emplace_back(std::make_pair( | |||
| param, | |||
| TensorShapeArray{{n, c / ic_step, | |||
| ih, iw, ic_step}, | |||
| {}})); | |||
| } | |||
| return args; | |||
| } | |||
| void run_pooling_check( | |||
| Handle* handle, | |||
| std::vector<std::pair<param::Pooling, TensorShapeArray>> args, | |||
| bool is_int8) { | |||
| Checker<Pooling> checker(handle); | |||
| UniformIntRNG rng_int8{INT8_MIN >> 1, INT8_MAX >> 1}; | |||
| UniformIntRNG rng_fp32{-10, 10}; | |||
| if (is_int8) { | |||
| checker.set_dtype(0, dtype::QuantizedS8(1.1f)); | |||
| checker.set_rng(0, &rng_int8); | |||
| } else { | |||
| checker.set_rng(0, &rng_fp32); | |||
| } | |||
| for (auto arg : args) { | |||
| checker.set_param(arg.first).exec(arg.second); | |||
| } | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, POOLING_NCHW44_FP32) { | |||
| for (auto filter : {2, 3, 4, 5}) | |||
| for (auto stride : {1, 2}) { | |||
| run_pooling_check(handle(), get_nchw44_pool_args(filter, stride), | |||
| false); | |||
| } | |||
| } | |||
| TEST_F(ARM_COMMON_MULTI_THREADS, POOLING_W3x3_NCHW44) | |||
| { | |||
| // clang-format off | |||