GitOrigin-RevId: 1bddd6dc2c
tags/v1.10.0
| @@ -12,7 +12,7 @@ | |||
| #pragma once | |||
| #include "src/arm_common/pooling/opr_impl.h" | |||
| #include "src/arm_common/pooling/pooling_helper.h" | |||
| #include "src/common//utils.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/naive/handle.h" | |||
| namespace megdnn { | |||
| @@ -134,22 +134,15 @@ public: | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(ARM_Filter5ModexStridexNCHW44) | |||
| }; | |||
| //! Pooling algorithm registered as "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44". | |||
| //! It reports the REPRODUCIBLE attribute; usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoFp32ModexStridexNCHW44 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { return "ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44"; } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(ARM_Fp32ModexStridexNCHW44) | |||
| }; | |||
| //! Catch-all algorithm "FALLBACK_POOLING": usable() accepts every parameter set, | |||
| //! while exec() fails an assertion instead of silently doing nothing. | |||
| class PoolingImpl::AlgoFallback final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||
| const char* name() const override { return "FALLBACK_POOLING"; } | |||
| bool usable(const PoolingKernSizeParam&) const override { return true; } | |||
| //! A member function may only be defined once in a class; the former empty | |||
| //! body is dropped and the asserting definition is the one that remains. | |||
| void exec(const PoolingKernParam&) const override { | |||
| megdnn_assert(false, "code issue happened!!"); | |||
| } | |||
| MEGDNN_DECL_ALGO_TYPE(ARM_Fallback) | |||
| }; | |||
| WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param); | |||
| @@ -32,7 +32,6 @@ private: | |||
| AlgoFilter3ModexStridexNCHW44 algo_filter3_modex_stridex_nchw4; | |||
| AlgoFilter4ModexStridexNCHW44 algo_filter4_modex_stridex_nchw4; | |||
| AlgoFilter5ModexStridexNCHW44 algo_filter5_modex_stridex_nchw4; | |||
| AlgoFp32ModexStridexNCHW44 algo_fp32_modex_stridex_nchw44; | |||
| AlgoFallback algo_fallback; | |||
| public: | |||
| @@ -49,7 +48,6 @@ public: | |||
| all_algos.emplace_back(&algo_filter2_modex_stridex_nchw4); | |||
| all_algos.emplace_back(&algo_filter4_modex_stridex_nchw4); | |||
| all_algos.emplace_back(&algo_filter5_modex_stridex_nchw4); | |||
| all_algos.emplace_back(&algo_fp32_modex_stridex_nchw44); | |||
| all_algos.emplace_back(&algo_fallback); | |||
| for (auto&& algo : all_algos) { | |||
| @@ -62,40 +60,6 @@ public: | |||
| PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack; | |||
| //! Gathers the size-related pooling parameters of \p opr together with the src/dst | |||
| //! layouts into a PoolingKernSizeParam. Every extent is asserted to fit in uint32_t. | |||
| PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param( | |||
| fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) { | |||
| //! range-checked size_t -> uint32_t conversion | |||
| auto checked_u32 = [](size_t v) -> uint32_t { | |||
| megdnn_assert( | |||
| v <= std::numeric_limits<uint32_t>::max(), "value too large: %zu", v); | |||
| return static_cast<uint32_t>(v); | |||
| }; | |||
| const auto& pooling = opr->param(); | |||
| //! member order: n, ic, isz, osz, padding, filter, stride, dtypes, handle, format, mode | |||
| PoolingKernSizeParam size_param{ | |||
| checked_u32(src.shape[0]), | |||
| checked_u32(src.shape[1]), | |||
| {{checked_u32(src.shape[2]), checked_u32(src.shape[3])}}, | |||
| {{checked_u32(dst.shape[2]), checked_u32(dst.shape[3])}}, | |||
| {{checked_u32(pooling.pad_h), checked_u32(pooling.pad_w)}}, | |||
| {{checked_u32(pooling.window_h), checked_u32(pooling.window_w)}}, | |||
| {{checked_u32(pooling.stride_h), checked_u32(pooling.stride_w)}}, | |||
| src.dtype, | |||
| dst.dtype, | |||
| opr->handle(), | |||
| pooling.format, | |||
| pooling.mode}; | |||
| return size_param; | |||
| } | |||
| //! Builds the full kernel parameter: the size part is derived from the tensor | |||
| //! layouts, then the src/dst storage references and the workspace are attached. | |||
| PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( | |||
| fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) { | |||
| PoolingKernParam kern; | |||
| //! assign through a reference to the PoolingKernSizeParam base sub-object | |||
| PoolingKernSizeParam& size_part = kern; | |||
| size_part = make_pooling_kern_szie_param(opr, src.layout, dst.layout); | |||
| kern.src_ptr = src.get_ref_ptr(); | |||
| kern.dst_ptr = dst.get_ref_ptr(); | |||
| kern.workspace_size = workspace.size; | |||
| kern.workspace_ptr = workspace.raw_ptr; | |||
| return kern; | |||
| } | |||
| size_t PoolingImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| TensorLayoutArray layouts{src, dst}; | |||
| @@ -19,6 +19,10 @@ namespace arm_common { | |||
| class PoolingImpl final : public fallback::PoolingImpl { | |||
| private: | |||
| //! TODO: remove | |||
| //! AlgoFilterxModexStride1/AlgoFilter2ModexStride2 | |||
| //! AlgoFilter3AverageStride2/AlgoFilter4MaxStride2/AlgoFilter5MaxStride2 | |||
| //! once GI is implemented with float16 and int8 support in dnn/src/fallback/pooling/opr_impl.h | |||
| class AlgoFilterxModexStride1; | |||
| class AlgoFilter2ModexStride2; | |||
| class AlgoFilter3MaxStride2; | |||
| @@ -31,7 +35,6 @@ private: | |||
| class AlgoFilter3ModexStridexNCHW44; | |||
| class AlgoFilter4ModexStridexNCHW44; | |||
| class AlgoFilter5ModexStridexNCHW44; | |||
| class AlgoFp32ModexStridexNCHW44; | |||
| class AlgoFallback; | |||
| class AlgoPack; | |||
| static AlgoPack sm_algo_pack; | |||
| @@ -45,47 +48,10 @@ public: | |||
| static size_t constexpr MAX_SPATIAL_DIM = 2; | |||
| //! Size/type description of one pooling call. The member order matters: | |||
| //! make_pooling_kern_szie_param fills this struct with a braced initializer list. | |||
| struct PoolingKernSizeParam { | |||
| //! src.shape[0] and src.shape[1] | |||
| uint32_t n; | |||
| uint32_t ic; | |||
| //! src.shape[2..3] and dst.shape[2..3] | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> isz; | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> osz; | |||
| //! (pad_h, pad_w), (window_h, window_w), (stride_h, stride_w) | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> padding; | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> filter; | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> stride; | |||
| DType src_type; | |||
| DType dst_type; | |||
| Handle* handle; | |||
| Param::Format format; | |||
| Mode mode; | |||
| }; | |||
| //! Size parameters plus the src/dst storage references and the workspace that | |||
| //! are handed to a pooling kernel. | |||
| struct PoolingKernParam : public PoolingKernSizeParam { | |||
| RefPtr src_ptr; | |||
| RefPtr dst_ptr; | |||
| //! The raw members get default initializers so that a default-constructed | |||
| //! param never carries an indeterminate pointer or size. | |||
| void* workspace_ptr = nullptr; | |||
| size_t workspace_size = 0; | |||
| //! src storage as const T*, after asserting that T is compatible with src_type | |||
| template <typename T> | |||
| const T* src() const { | |||
| src_type.assert_is_compatible_ctype<T>(); | |||
| return static_cast<const T*>(src_ptr.get_ptr()); | |||
| } | |||
| //! dst storage as T*, after asserting that T is compatible with dst_type | |||
| template <typename T> | |||
| T* dst() const { | |||
| dst_type.assert_is_compatible_ctype<T>(); | |||
| return static_cast<T*>(dst_ptr.get_ptr()); | |||
| } | |||
| //! workspace pointer reinterpreted as T*; no type check is performed | |||
| template <typename T> | |||
| T* workspace() const { | |||
| return static_cast<T*>(workspace_ptr); | |||
| } | |||
| }; | |||
| using PoolingKernSizeParam = fallback::PoolingImpl::PoolingKernSizeParam; | |||
| PoolingKernSizeParam make_pooling_kern_szie_param( | |||
| fallback::PoolingImpl* opr, const TensorLayout& src, | |||
| const TensorLayout& dst); | |||
| using PoolingKernParam = fallback::PoolingImpl::PoolingKernParam; | |||
| PoolingKernParam make_pooling_kern_param( | |||
| fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace); | |||
| class AlgoBase : public detail::Algorithm { | |||
| public: | |||
| enum class AlgoType : uint32_t { | |||
| @@ -1325,3 +1325,35 @@ GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) { | |||
| return ___gi_vget_high_f32(a); | |||
| #endif | |||
| } | |||
| //! Pairwise add: lane 0 receives a[0] + a[1], lane 1 receives b[0] + b[1]. | |||
| //! NEON builds use vpadd_f32; the other builds compute the two lanes by hand. | |||
| GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vpadd_f32(a, b); | |||
| #else | |||
| float32x2_t sum; | |||
| #if defined(GI_SSE2_INTRINSICS) | |||
| sum.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1]; | |||
| sum.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1]; | |||
| #else | |||
| sum[0] = a[0] + a[1]; | |||
| sum[1] = b[0] + b[1]; | |||
| #endif | |||
| return sum; | |||
| #endif | |||
| } | |||
| //! Pairwise maximum: lane 0 is derived from the two lanes of a, lane 1 from the | |||
| //! two lanes of b. NEON builds use vpmax_f32; the other builds apply MAX_NAN. | |||
| GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| return vpmax_f32(a, b); | |||
| #else | |||
| float32x2_t larger; | |||
| #if defined(GI_SSE2_INTRINSICS) | |||
| larger.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]); | |||
| larger.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]); | |||
| #else | |||
| larger[0] = MAX_NAN(a[0], a[1]); | |||
| larger[1] = MAX_NAN(b[0], b[1]); | |||
| #endif | |||
| return larger; | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,126 @@ | |||
| /** | |||
| * \file dnn/src/fallback/gi_intrinsic_helper.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "src/common/unroll_macro.h" | |||
| #include "src/fallback/general_intrinsic/gi_float.h" | |||
| namespace megdnn { | |||
| namespace { | |||
| //! Primary template: only declares impl(). The definitions come from the partial | |||
| //! specializations on (weight_number, c_dim) generated by LOAD_HELPER. | |||
| template < | |||
| int weight_number, int base_offset, int ptr_step, int c_dim, typename Func, | |||
| typename T, typename T2, typename... XT> | |||
| struct LoadHelper { | |||
| static GI_FORCEINLINE void impl(T& weight, T2 ptr, int oc_offset, XT... args); | |||
| }; | |||
| //! Specializations with c_dim == 0 for 1..16 elements. Each element is loaded via | |||
| //! Func::impl(ptr + base_offset + idx * ptr_step, args...) into src[idx]; the int | |||
| //! offset argument of impl() is unused. UNROLL_CALL_RAW presumably invokes | |||
| //! WEIGHT_CB once per index below count - confirm in unroll_macro.h. | |||
| #define WEIGHT_CB(idx) \ | |||
| src[idx] = Func::impl(ptr + base_offset + idx * ptr_step, args...); | |||
| #define LOAD_HELPER(count) \ | |||
| template < \ | |||
| int base_offset, int ptr_step, typename Func, typename T, typename T2, \ | |||
| typename... XT> \ | |||
| struct LoadHelper<count, base_offset, ptr_step, 0, Func, T, T2, XT...> { \ | |||
| static GI_FORCEINLINE void impl(T& src, T2 ptr, int, XT... args) { \ | |||
| UNROLL_CALL_RAW(count, WEIGHT_CB); \ | |||
| } \ | |||
| } | |||
| LOAD_HELPER(1); | |||
| LOAD_HELPER(2); | |||
| LOAD_HELPER(3); | |||
| LOAD_HELPER(4); | |||
| LOAD_HELPER(5); | |||
| LOAD_HELPER(6); | |||
| LOAD_HELPER(7); | |||
| LOAD_HELPER(8); | |||
| LOAD_HELPER(9); | |||
| LOAD_HELPER(10); | |||
| LOAD_HELPER(11); | |||
| LOAD_HELPER(12); | |||
| LOAD_HELPER(13); | |||
| LOAD_HELPER(14); | |||
| LOAD_HELPER(15); | |||
| LOAD_HELPER(16); | |||
| #undef LOAD_HELPER | |||
| #undef WEIGHT_CB | |||
| ///////////////////////////c_dim = 1///////////////////////// | |||
| //! Specializations with c_dim == 1 for 1..9 elements: results go to src[0][idx] | |||
| //! and no extra arguments are forwarded to Func::impl. | |||
| #define WEIGHT_CB(idx) src[0][idx] = Func::impl(ptr + base_offset + idx * ptr_step); | |||
| #define LOAD_HELPER(count) \ | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> \ | |||
| struct LoadHelper<count, base_offset, ptr_step, 1, Func, T, T2> { \ | |||
| static GI_FORCEINLINE void impl(T& src, T2 ptr, int) { \ | |||
| UNROLL_CALL_RAW(count, WEIGHT_CB); \ | |||
| } \ | |||
| } | |||
| LOAD_HELPER(1); | |||
| LOAD_HELPER(2); | |||
| LOAD_HELPER(3); | |||
| LOAD_HELPER(4); | |||
| LOAD_HELPER(5); | |||
| LOAD_HELPER(6); | |||
| LOAD_HELPER(7); | |||
| LOAD_HELPER(8); | |||
| LOAD_HELPER(9); | |||
| #undef LOAD_HELPER | |||
| #undef WEIGHT_CB | |||
| /////////////////////////c_dim = 2/////////////////////////////// | |||
| //! Specializations with c_dim == 2 for 1..8 elements: src[0][idx] is loaded from | |||
| //! ptr + base_offset + idx * ptr_step, src[1][idx] from the same address shifted | |||
| //! by oc_offset. | |||
| #define WEIGHT_CB(idx) \ | |||
| src[0][idx] = Func::impl(ptr + base_offset + idx * ptr_step); \ | |||
| src[1][idx] = Func::impl(ptr + base_offset + idx * ptr_step + oc_offset); | |||
| #define LOAD_HELPER(count) \ | |||
| template <int base_offset, int ptr_step, typename Func, typename T, typename T2> \ | |||
| struct LoadHelper<count, base_offset, ptr_step, 2, Func, T, T2> { \ | |||
| static GI_FORCEINLINE void impl(T& src, T2 ptr, int oc_offset) { \ | |||
| UNROLL_CALL_RAW(count, WEIGHT_CB); \ | |||
| } \ | |||
| } | |||
| LOAD_HELPER(1); | |||
| LOAD_HELPER(2); | |||
| LOAD_HELPER(3); | |||
| LOAD_HELPER(4); | |||
| LOAD_HELPER(5); | |||
| LOAD_HELPER(6); | |||
| LOAD_HELPER(7); | |||
| LOAD_HELPER(8); | |||
| #undef LOAD_HELPER | |||
| #undef WEIGHT_CB | |||
| //! Convenience wrapper: selects the LoadHelper specialization without extra | |||
| //! arguments and forwards weight, ptr and oc_offset to its impl(). | |||
| template < | |||
| int weight_number, int base_offset, int ptr_step, int c_dim, typename Func, | |||
| typename T, typename T2> | |||
| GI_FORCEINLINE void load_helper(T& weight, T2 ptr, int oc_offset) { | |||
| using Loader = LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2>; | |||
| Loader::impl(weight, ptr, oc_offset); | |||
| } | |||
| //! Variant of load_helper that additionally forwards the trailing arguments | |||
| //! \p args to the selected LoadHelper specialization. | |||
| template < | |||
| int weight_number, int base_offset, int ptr_step, int c_dim, typename Func, | |||
| typename T, typename T2, typename... XT> | |||
| GI_FORCEINLINE void load_helper_x(T& weight, T2 ptr, int oc_offset, XT... args) { | |||
| using Loader = | |||
| LoadHelper<weight_number, base_offset, ptr_step, c_dim, Func, T, T2, XT...>; | |||
| Loader::impl(weight, ptr, oc_offset, args...); | |||
| } | |||
| } // namespace | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,403 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/algo.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "algo.h" | |||
| #include "do_max_pooling_w4x4_s2x2.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_fallback_gi_pooling) | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| //! Describes the scratch buffers of the stride-2 kernels that use a workspace: | |||
| //! filter[0] buffers of one output row each, followed by two buffers of | |||
| //! (IW + 1) / 2 elements. The bundle is created with a null base pointer and the | |||
| //! value 16 as the last constructor argument. | |||
| WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam& param) { | |||
| megdnn_assert( | |||
| param.src_type.category() == DTypeCategory::FLOAT && | |||
| param.format == param::Pooling::Format::NCHW && | |||
| (param.mode == param::Pooling::Mode::MAX || | |||
| (param.mode == param::Pooling::Mode::AVERAGE && param.filter[0] == 3)) && | |||
| param.filter[0] == param.filter[1] && | |||
| (param.filter[0] == 3 || param.filter[1] == 5) && param.stride[0] == 2 && | |||
| param.stride[1] == 2 && param.isz[0] >= 2 && param.isz[1] >= 2); | |||
| //! pooling nxn with stride 2 | |||
| const auto elem_bytes = param.src_type.size(); | |||
| const auto in_width = param.isz[1]; | |||
| const auto out_width = param.osz[1]; | |||
| // To handle an odd filter size, one input row is first stored separately as its | |||
| // odd-indexed and even-indexed elements; these are then processed into one row | |||
| // of outputs, and n such result rows have to be kept. | |||
| SmallVector<size_t> buffer_sizes; | |||
| for (size_t row = 0; row < param.filter[0]; ++row) { | |||
| buffer_sizes.push_back(out_width * elem_bytes); | |||
| } | |||
| //! two half-row buffers | |||
| for (int half = 0; half < 2; ++half) { | |||
| buffer_sizes.push_back((in_width + 1) / 2 * elem_bytes); | |||
| } | |||
| return WorkspaceBundle(nullptr, buffer_sizes, 16); | |||
| } | |||
| //! Accepts Float32 NCHW pooling with stride 1x1 and a square 2x2 or 3x3 window, | |||
| //! in MAX or AVERAGE mode. | |||
| bool PoolingImpl::AlgoGiFilterxModexStride1::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| auto SH = param.stride[0]; | |||
| auto SW = param.stride[1]; | |||
| auto FH = param.filter[0]; | |||
| auto FW = param.filter[1]; | |||
| //! exec() only dispatches dtype::Float32 and returns without writing dst for | |||
| //! any other dtype, so the other FLOAT-category dtypes are rejected here. | |||
| bool available = param.src_type == dtype::Float32{} && | |||
| param.format == Param::Format::NCHW && SH == 1 && SW == 1 && | |||
| FH == FW && (FH == 2 || FH == 3); | |||
| bool is_mode_ok = (param.mode == Mode::MAX || param.mode == Mode::AVERAGE); | |||
| return available && is_mode_ok; | |||
| } | |||
| //! Runs stride-1 pooling: one do_pooling_compact call per (n, c) plane, scheduled | |||
| //! through MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN over N * C indices. The pooler | |||
| //! types are chosen from param.mode and the filter height (2 or 3). | |||
| void PoolingImpl::AlgoGiFilterxModexStride1::exec(const PoolingKernParam& param) const { | |||
| auto IH = param.isz[0], IW = param.isz[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| auto N = param.n, C = param.ic; | |||
| auto PH = param.padding[0]; | |||
| auto PW = param.padding[1]; | |||
| auto FH = param.filter[0]; | |||
| auto src_ptr = param.src_ptr; | |||
| auto dst_ptr = param.dst_ptr; | |||
| //! innermost level: builds the per-plane task and dispatches it | |||
| #define DISPATCH_FUNC(Pooler, GiPooler, window, midout_type_id) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_fallback_gi_pooling, midout_iv(0), midout_iv(midout_type_id), \ | |||
| Pooler::MIDOUT_CASE_NUM, GiPooler::MIDOUT_CASE_NUM, window) { \ | |||
| auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ | |||
| src_dtype = param.src_type](size_t index, size_t) { \ | |||
| size_t n = index / C; \ | |||
| size_t c = index % C; \ | |||
| do_pooling_compact<Pooler MEGDNN_COMMA GiPooler MEGDNN_COMMA window>( \ | |||
| static_cast<const typename Pooler::ctype*>(src_ptr.get_ptr()) + \ | |||
| n * C * IH * IW + c * IH * IW, \ | |||
| static_cast<typename Pooler::ctype*>(dst_ptr.get_ptr()) + \ | |||
| n * C * OH * OW + c * OH * OW, \ | |||
| src_dtype, IH, IW, OH, OW, PH, PW); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ | |||
| } \ | |||
| MIDOUT_END() | |||
| //! middle level: selects the pooler instantiation from the filter height | |||
| #define DISPATCH_WINDOW(Pooler, GiPooler, dtype, ctype, comp_type, midout_type_id) \ | |||
| switch (FH) { \ | |||
| case 2: { \ | |||
| using _Pooler = Pooler<4, dtype, ctype, comp_type>; \ | |||
| using _GiPooler = GiPooler<4, dtype, ctype, comp_type>; \ | |||
| DISPATCH_FUNC(_Pooler, _GiPooler, 2, midout_type_id); \ | |||
| break; \ | |||
| } \ | |||
| case 3: { \ | |||
| using _Pooler = Pooler<9, dtype, ctype, comp_type>; \ | |||
| using _GiPooler = GiPooler<9, dtype, ctype, comp_type>; \ | |||
| DISPATCH_FUNC(_Pooler, _GiPooler, 3, midout_type_id); \ | |||
| break; \ | |||
| } \ | |||
| default: \ | |||
| megdnn_assert(0, "unsupport pooling filter size"); \ | |||
| break; \ | |||
| } | |||
| //! outer level: selects the pooler family from the pooling mode | |||
| #define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id) \ | |||
| switch (param.mode) { \ | |||
| case Mode::MAX: \ | |||
| DISPATCH_WINDOW( \ | |||
| MaxPooler, GiMaxPooler, dtype, ctype, comp_type, midout_type_id); \ | |||
| break; \ | |||
| case Mode::AVERAGE: \ | |||
| DISPATCH_WINDOW( \ | |||
| MeanInPooler, GiMeanPooler, dtype, ctype, comp_type, \ | |||
| midout_type_id); \ | |||
| break; \ | |||
| default: \ | |||
| megdnn_assert(0, "unsupport pooling mode"); \ | |||
| break; \ | |||
| } | |||
| if (param.src_type == dtype::Float32{}) { | |||
| DISPATCH_MODE(dt_float32, float, float, 0); | |||
| } else { | |||
| //! fail loudly instead of returning without having written dst | |||
| megdnn_assert(0, "unsupported pooling dtype"); | |||
| } | |||
| #undef DISPATCH_FUNC | |||
| #undef DISPATCH_WINDOW | |||
| #undef DISPATCH_MODE | |||
| } | |||
| //! Accepts Float32 NCHW pooling with a 2x2 window and stride 2x2, in MAX or | |||
| //! AVERAGE mode. | |||
| bool PoolingImpl::AlgoGiFilter2ModexStride2::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| auto SH = param.stride[0]; | |||
| auto SW = param.stride[1]; | |||
| auto FH = param.filter[0]; | |||
| auto FW = param.filter[1]; | |||
| //! exec() only dispatches dtype::Float32 and returns without writing dst for | |||
| //! any other dtype, so the other FLOAT-category dtypes are rejected here. | |||
| bool available = param.src_type == dtype::Float32{} && | |||
| param.format == Param::Format::NCHW && FH == FW && SH == SW && | |||
| FH == 2 && SH == 2; | |||
| bool is_mode_ok = (param.mode == Mode::MAX || param.mode == Mode::AVERAGE); | |||
| return available && is_mode_ok; | |||
| } | |||
| //! Runs 2x2 stride-2 pooling: one do_pooling_2x2 call per (n, c) plane, scheduled | |||
| //! through MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN over N * C indices. The pooler | |||
| //! type is chosen from param.mode. | |||
| void PoolingImpl::AlgoGiFilter2ModexStride2::exec(const PoolingKernParam& param) const { | |||
| auto IH = param.isz[0], IW = param.isz[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| auto N = param.n, C = param.ic; | |||
| auto PH = param.padding[0]; | |||
| auto PW = param.padding[1]; | |||
| auto src_ptr = param.src_ptr; | |||
| auto dst_ptr = param.dst_ptr; | |||
| //! builds the per-plane task and dispatches it | |||
| #define DISPATCH_FUNC(Pooler, mode, midout_type_id) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_fallback_gi_pooling, midout_iv(1), midout_iv(midout_type_id), \ | |||
| Pooler::MIDOUT_CASE_NUM) { \ | |||
| auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ | |||
| src_dtype = param.src_type](size_t index, size_t) { \ | |||
| size_t n = index / C; \ | |||
| size_t c = index % C; \ | |||
| do_pooling_2x2<Pooler MEGDNN_COMMA mode>( \ | |||
| static_cast<const typename Pooler::ctype*>(src_ptr.get_ptr()) + \ | |||
| n * C * IH * IW + c * IH * IW, \ | |||
| static_cast<typename Pooler::ctype*>(dst_ptr.get_ptr()) + \ | |||
| n * C * OH * OW + c * OH * OW, \ | |||
| src_dtype, IH, IW, OH, OW, PH, PW); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ | |||
| } \ | |||
| MIDOUT_END() | |||
| //! selects the pooler type from the pooling mode | |||
| #define DISPATCH_MODE(dtype, ctype, comp_type, midout_type_id) \ | |||
| switch (param.mode) { \ | |||
| case Mode::MAX: { \ | |||
| using _Pooler = MaxPooler<4, dtype, ctype, comp_type>; \ | |||
| DISPATCH_FUNC(_Pooler, Mode::MAX, midout_type_id); \ | |||
| break; \ | |||
| } \ | |||
| case Mode::AVERAGE: { \ | |||
| using _Pooler = MeanInPooler<4, dtype, ctype, comp_type>; \ | |||
| DISPATCH_FUNC(_Pooler, Mode::AVERAGE, midout_type_id); \ | |||
| break; \ | |||
| } \ | |||
| default: \ | |||
| megdnn_assert(0, "unsupport pooling mode"); \ | |||
| break; \ | |||
| } | |||
| if (param.src_type == dtype::Float32{}) { | |||
| DISPATCH_MODE(dt_float32, float, float, 0); | |||
| } else { | |||
| //! fail loudly instead of returning without having written dst | |||
| megdnn_assert(0, "unsupported pooling dtype"); | |||
| } | |||
| //! only the two macros defined above are removed again | |||
| #undef DISPATCH_FUNC | |||
| #undef DISPATCH_MODE | |||
| } | |||
| //! Accepts Float32 NCHW MAX pooling with a 3x3 window, stride 2x2 and an input | |||
| //! of at least 2x2. | |||
| bool PoolingImpl::AlgoGiFilter3MaxStride2::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| //! exec() only dispatches dtype::Float32 and returns without writing dst for | |||
| //! any other dtype, so the other FLOAT-category dtypes are rejected here. | |||
| bool available = param.src_type == dtype::Float32{} && | |||
| param.format == Param::Format::NCHW && param.mode == Mode::MAX && | |||
| param.filter[0] == 3 && param.filter[1] == 3 && | |||
| param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 && | |||
| param.isz[1] >= 2; | |||
| return available; | |||
| } | |||
| //! Runs 3x3 stride-2 max pooling: one do_max_pooling_3x3_s2x2_float_gi call per | |||
| //! (n, c) plane. Each task uses the workspace slice selected by its thread id; | |||
| //! the slice layout comes from get_bundle(). | |||
| void PoolingImpl::AlgoGiFilter3MaxStride2::exec(const PoolingKernParam& param) const { | |||
| auto IH = param.isz[0], IW = param.isz[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| auto N = param.n, C = param.ic; | |||
| auto PH = param.padding[0]; | |||
| auto PW = param.padding[1]; | |||
| auto src_ptr = param.src_ptr; | |||
| auto dst_ptr = param.dst_ptr; | |||
| #define DISPATCH_FUNC(type, func, midout_type_id) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_fallback_gi_pooling, midout_iv(2), midout_iv(midout_type_id)) { \ | |||
| WorkspaceBundle wbundle = get_bundle(param); \ | |||
| auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \ | |||
| workspace_ptr = param.workspace<dt_byte>()]( \ | |||
| size_t index, size_t thread_id) { \ | |||
| auto ws = wbundle; \ | |||
| ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id); \ | |||
| size_t n = index / C; \ | |||
| size_t c = index % C; \ | |||
| do_max_pooling_3x3_s2x2_float_gi( \ | |||
| static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \ | |||
| c * IH * IW, \ | |||
| static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \ | |||
| c * OH * OW, \ | |||
| IH, IW, OH, OW, PH, PW, ws); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| if (param.src_type == dtype::Float32{}) { | |||
| DISPATCH_FUNC(float, float, 0); | |||
| } else { | |||
| //! fail loudly instead of returning without having written dst | |||
| megdnn_assert(0, "unsupported pooling dtype"); | |||
| } | |||
| #undef DISPATCH_FUNC | |||
| } | |||
| //! Accepts Float32 NCHW AVERAGE pooling with a 3x3 window, stride 2x2 and an | |||
| //! input of at least 2x2. | |||
| bool PoolingImpl::AlgoGiFilter3AverageStride2::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| //! exec() only dispatches dtype::Float32 and returns without writing dst for | |||
| //! any other dtype, so the other FLOAT-category dtypes are rejected here. | |||
| bool available = param.src_type == dtype::Float32{} && | |||
| param.format == Param::Format::NCHW && param.mode == Mode::AVERAGE && | |||
| param.filter[0] == 3 && param.filter[1] == 3 && | |||
| param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 && | |||
| param.isz[1] >= 2; | |||
| return available; | |||
| } | |||
| //! Runs 3x3 stride-2 average pooling: one do_average_pooling_3x3_s2x2_gi call per | |||
| //! (n, c) plane. Each task uses the workspace slice selected by its thread id; | |||
| //! the slice layout comes from get_bundle(). | |||
| void PoolingImpl::AlgoGiFilter3AverageStride2::exec( | |||
| const PoolingKernParam& param) const { | |||
| auto IH = param.isz[0], IW = param.isz[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| auto N = param.n, C = param.ic; | |||
| auto PH = param.padding[0]; | |||
| auto PW = param.padding[1]; | |||
| auto src_ptr = param.src_ptr; | |||
| auto dst_ptr = param.dst_ptr; | |||
| #define DISPATCH_FUNC(type, MEGDNN_SIMD_WIDTH, midout_type_id) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_fallback_gi_pooling, midout_iv(3), midout_iv(midout_type_id)) { \ | |||
| WorkspaceBundle wbundle = get_bundle(param); \ | |||
| auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \ | |||
| workspace_ptr = param.workspace<dt_byte>()]( \ | |||
| size_t index, size_t thread_id) { \ | |||
| auto ws = wbundle; \ | |||
| ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id); \ | |||
| size_t n = index / C; \ | |||
| size_t c = index % C; \ | |||
| do_average_pooling_3x3_s2x2_gi( \ | |||
| static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \ | |||
| c * IH * IW, \ | |||
| static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \ | |||
| c * OH * OW, \ | |||
| IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| if (param.src_type == dtype::Float32{}) { | |||
| DISPATCH_FUNC(dt_float32, 4, 0); | |||
| } else { | |||
| //! fail loudly instead of returning without having written dst | |||
| megdnn_assert(0, "unsupported pooling dtype"); | |||
| } | |||
| #undef DISPATCH_FUNC | |||
| } | |||
| //! Accepts Float32 NCHW MAX pooling with a 4x4 window, stride 2x2 and an output | |||
| //! of at least 2x2. | |||
| bool PoolingImpl::AlgoGiFilter4MaxStride2::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| auto SH = param.stride[0]; | |||
| auto SW = param.stride[1]; | |||
| auto FH = param.filter[0]; | |||
| auto FW = param.filter[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| //! exec() only dispatches dtype::Float32 and returns without writing dst for | |||
| //! any other dtype, so the other FLOAT-category dtypes are rejected here. | |||
| bool available = param.src_type == dtype::Float32{} && | |||
| param.format == Param::Format::NCHW && param.mode == Mode::MAX && | |||
| FH == 4 && FW == 4 && SH == 2 && SW == 2 && OH >= 2 && OW >= 2; | |||
| return available; | |||
| } | |||
| //! Runs 4x4 stride-2 max pooling: one do_max_pooling_w4x4_s2x2_<func>_gi call per | |||
| //! (n, c) plane, scheduled through MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN over | |||
| //! N * C indices. No workspace is used. | |||
| void PoolingImpl::AlgoGiFilter4MaxStride2::exec(const PoolingKernParam& param) const { | |||
| auto IH = param.isz[0], IW = param.isz[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| auto N = param.n, C = param.ic; | |||
| auto PH = param.padding[0]; | |||
| auto PW = param.padding[1]; | |||
| auto src_ptr = param.src_ptr; | |||
| auto dst_ptr = param.dst_ptr; | |||
| //! \p func is pasted into the kernel name, \p type is the element type | |||
| #define DISPATCH_FUNC(type, func, midout_type_id) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_fallback_gi_pooling, midout_iv(4), midout_iv(midout_type_id)) { \ | |||
| auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, \ | |||
| src_dtype = param.src_type](size_t index, size_t) { \ | |||
| size_t n = index / C; \ | |||
| size_t c = index % C; \ | |||
| do_max_pooling_w4x4_s2x2_##func##_gi( \ | |||
| static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \ | |||
| c * IH * IW, \ | |||
| static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \ | |||
| c * OH * OW, \ | |||
| src_dtype, IH, IW, OH, OW, PH, PW); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| if (param.src_type == dtype::Float32{}) { | |||
| DISPATCH_FUNC(float, float, 0); | |||
| } else { | |||
| //! fail loudly instead of returning without having written dst | |||
| megdnn_assert(0, "unsupported pooling dtype"); | |||
| } | |||
| #undef DISPATCH_FUNC | |||
| } | |||
| //! Accepts Float32 NCHW MAX pooling with a 5x5 window, stride 2x2 and an output | |||
| //! of at least 2x2. | |||
| bool PoolingImpl::AlgoGiFilter5MaxStride2::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| auto SH = param.stride[0]; | |||
| auto SW = param.stride[1]; | |||
| auto FH = param.filter[0]; | |||
| auto FW = param.filter[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| //! exec() only dispatches dtype::Float32 and returns without writing dst for | |||
| //! any other dtype, so the other FLOAT-category dtypes are rejected here. | |||
| bool available = param.src_type == dtype::Float32{} && | |||
| param.format == Param::Format::NCHW && param.mode == Mode::MAX && | |||
| FH == 5 && FW == 5 && SH == 2 && SW == 2 && OH >= 2 && OW >= 2; | |||
| return available; | |||
| } | |||
| //! Runs 5x5 stride-2 max pooling: one do_max_pooling_w5x5_s2x2_gi call per (n, c) | |||
| //! plane. Each task uses the workspace slice selected by its thread id; the slice | |||
| //! layout comes from get_bundle(). | |||
| void PoolingImpl::AlgoGiFilter5MaxStride2::exec(const PoolingKernParam& param) const { | |||
| auto IH = param.isz[0], IW = param.isz[1]; | |||
| auto OH = param.osz[0], OW = param.osz[1]; | |||
| auto N = param.n, C = param.ic; | |||
| auto PH = param.padding[0]; | |||
| auto PW = param.padding[1]; | |||
| auto src_ptr = param.src_ptr; | |||
| auto dst_ptr = param.dst_ptr; | |||
| #define DISPATCH_FUNC(dtype, type, midout_type_id, MEGDNN_SIMD_WIDTH) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_fallback_gi_pooling, midout_iv(5), midout_iv(midout_type_id)) { \ | |||
| WorkspaceBundle wbundle = get_bundle(param); \ | |||
| auto run = [C, IH, IW, OH, OW, PH, PW, src_ptr, dst_ptr, wbundle = wbundle, \ | |||
| workspace_ptr = param.workspace<dt_byte>()]( \ | |||
| size_t index, size_t thread_id) { \ | |||
| auto ws = wbundle; \ | |||
| ws.set(workspace_ptr + ws.total_size_in_bytes() * thread_id); \ | |||
| size_t n = index / C; \ | |||
| size_t c = index % C; \ | |||
| do_max_pooling_w5x5_s2x2_gi<dtype>( \ | |||
| static_cast<const type*>(src_ptr.get_ptr()) + n * C * IH * IW + \ | |||
| c * IH * IW, \ | |||
| static_cast<type*>(dst_ptr.get_ptr()) + n * C * OH * OW + \ | |||
| c * OH * OW, \ | |||
| IH, IW, OH, OW, PH, PW, ws, MEGDNN_SIMD_WIDTH); \ | |||
| }; \ | |||
| MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ | |||
| static_cast<::megdnn::naive::HandleImpl*>(param.handle), N* C, run); \ | |||
| } \ | |||
| MIDOUT_END(); | |||
| if (param.src_type == dtype::Float32{}) { | |||
| DISPATCH_FUNC(dt_float32, float, 0, 4); | |||
| } else { | |||
| //! fail loudly instead of returning without having written dst | |||
| megdnn_assert(0, "unsupported pooling dtype"); | |||
| } | |||
| #undef DISPATCH_FUNC | |||
| } | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,103 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/algo.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "src/common/utils.h" | |||
| #include "src/fallback/pooling/opr_impl.h" | |||
| #include "pooling_helper.h" | |||
| #include "src/naive/handle.h" | |||
| #include "src/naive/pooling/opr_impl.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| using AlgoBase = PoolingImpl::AlgoBase; | |||
| //! GI pooling algorithm registered as "GI_POOLING_STRIDE1"; it reports the | |||
| //! REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFilterxModexStride1 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_STRIDE1"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_FilterxModexStride1) | |||
| }; | |||
| //! GI pooling algorithm registered as "GI_POOLING_STRIDE2"; it reports the | |||
| //! REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFilter2ModexStride2 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_STRIDE2"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_Filter2ModexStride2) | |||
| }; | |||
| //! GI pooling algorithm registered as "GI_POOLING_FILTER3_MAX"; it reports the | |||
| //! REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFilter3MaxStride2 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_FILTER3_MAX"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_Filter3MaxStride2) | |||
| }; | |||
| //! GI pooling algorithm registered as "GI_POOLING_FILTER3_AVERAGE"; it reports | |||
| //! the REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFilter3AverageStride2 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_FILTER3_AVERAGE"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_Filter3AverageStride2) | |||
| }; | |||
| //! GI pooling algorithm registered as "GI_POOLING_FILTER4_MAX"; it reports the | |||
| //! REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFilter4MaxStride2 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_FILTER4_MAX"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_Filter4MaxStride2) | |||
| }; | |||
| //! GI pooling algorithm registered as "GI_POOLING_FILTER5_MAX"; it reports the | |||
| //! REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFilter5MaxStride2 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_FILTER5_MAX"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_Filter5MaxStride2) | |||
| }; | |||
| //! GI pooling algorithm registered as "GI_POOLING_FP32_MODEX_STRIDEX_NCHW44"; it | |||
| //! reports the REPRODUCIBLE attribute. usable() and exec() are defined out of line. | |||
| class PoolingImpl::AlgoGiFp32ModexStridexNCHW44 final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { | |||
| return AlgoAttribute::REPRODUCIBLE; | |||
| } | |||
| const char* name() const override { | |||
| return "GI_POOLING_FP32_MODEX_STRIDEX_NCHW44"; | |||
| } | |||
| //! whether this algorithm can handle \p param | |||
| bool usable(const PoolingKernSizeParam& param) const override; | |||
| //! run the kernel described by \p param | |||
| void exec(const PoolingKernParam& param) const override; | |||
| MEGDNN_DECL_ALGO_TYPE(GI_Fp32ModexStridexNCHW44) | |||
| }; | |||
| class PoolingImpl::AlgoFallback final : public AlgoBase { | |||
| public: | |||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; | |||
| const char* name() const override { return "FALLBACK_NOT_GI_POOLING"; } | |||
| bool usable(const PoolingKernSizeParam&) const override { return true; } | |||
| void exec(const PoolingKernParam& /*param*/) const override { | |||
| megdnn_assert(false, "code issue happened!!"); | |||
| } | |||
| MEGDNN_DECL_ALGO_TYPE(FallbackNotGI) | |||
| }; | |||
| WorkspaceBundle get_bundle(const PoolingImpl::PoolingKernSizeParam&); | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp | |||
| * \file dnn/src/fallback/pooling/gi/algo_fp32_pooling_nchw44.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| @@ -10,17 +10,17 @@ | |||
| * implied. | |||
| */ | |||
| #include "algo.h" | |||
| #include "kern_fp32_pooling_nchw44.h" | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/arm_common/pooling/algo.h" | |||
| #include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_arm_common_fp32_pooling_nchw44) | |||
| MIDOUT_DECL(megdnn_fallback_fp32_pooling_nchw44) | |||
| namespace megdnn { | |||
| namespace arm_common { | |||
| bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable( | |||
| namespace fallback { | |||
| bool PoolingImpl::AlgoGiFp32ModexStridexNCHW44::usable( | |||
| const PoolingKernSizeParam& param) const { | |||
| uint32_t sh = param.stride[0]; | |||
| uint32_t sw = param.stride[1]; | |||
| @@ -37,7 +37,7 @@ bool PoolingImpl::AlgoFp32ModexStridexNCHW44::usable( | |||
| return avaible && size_ok; | |||
| } | |||
| void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( | |||
| void PoolingImpl::AlgoGiFp32ModexStridexNCHW44::exec( | |||
| const PoolingKernParam& param) const { | |||
| int ih = param.isz[0]; | |||
| int iw = param.isz[1]; | |||
| @@ -55,7 +55,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( | |||
| #define DISPATCH_FUNC(filter, stride, mode) \ | |||
| MIDOUT_BEGIN( \ | |||
| megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \ | |||
| megdnn_fallback_fp32_pooling_nchw44, midout_iv(0), \ | |||
| midout_iv(#filter #stride #mode##_hash)) { \ | |||
| auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, size_t) { \ | |||
| const int c_idx = index; \ | |||
| @@ -135,7 +135,7 @@ void PoolingImpl::AlgoFp32ModexStridexNCHW44::exec( | |||
| #undef DISPATCH_FUNC | |||
| } | |||
| } // namespace arm_common | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,157 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "src/common/utils.h" | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "do_max_pooling_3x3_s2x2_float.h" | |||
| #include "src/common/macro_helper.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
//! Calls GiUzpqFloat32(s0, s1) and copies its two result vectors out:
//! val[0] goes to d0 and val[1] goes to d1.
#define GI_UZP(s0, s1, d0, d1)                   \
    do {                                         \
        auto uzp_pair__ = GiUzpqFloat32(s0, s1); \
        d0 = uzp_pair__.val[0];                  \
        d1 = uzp_pair__.val[1];                  \
    } while (0)
| void do_max_pooling_3x3_s2x2_float_gi( | |||
| const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_, | |||
| size_t PH_, size_t PW_, const WorkspaceBundle& ws) { | |||
| int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_; | |||
| // cache[i] stores the answer of the i-th line after | |||
| // pooling along the W dimension. | |||
| float* cache[3] = { | |||
| static_cast<float*>(ws.get(0)), static_cast<float*>(ws.get(1)), | |||
| static_cast<float*>(ws.get(2))}; | |||
| float* odd = static_cast<float*>(ws.get(3)); | |||
| float* even = static_cast<float*>(ws.get(4)); | |||
| int ih_next = 0; | |||
| // "good" area means we can use SIMD to accelerate. | |||
| auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) { | |||
| // x*2 - P >= 0; 2x >= P; x >= P/2 | |||
| O_from = (P + 1) / 2; | |||
| // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2 | |||
| O_to = (I + P - 3) / 2 + 1; | |||
| // we must have I >= 2 to ensure O_from <= O_to | |||
| }; | |||
| int OW_from, OW_to; | |||
| get_good_area(IW, OW, PW, OW_from, OW_to); | |||
| auto process_cache = [&](int ih) { | |||
| const float* __restrict sptr = src + ih * IW; | |||
| auto tmp = cache[2]; | |||
| cache[2] = cache[1]; | |||
| cache[1] = cache[0]; | |||
| cache[0] = tmp; | |||
| // cache 0 is used to store the current answer. | |||
| auto run_single = [&](int ow) { | |||
| int iw = ow * 2 - PW; | |||
| float res = std::numeric_limits<float>::lowest(); | |||
| if (iw + 0 >= 0 && iw + 0 < IW) { | |||
| res = std::max(res, sptr[iw + 0]); | |||
| } | |||
| if (iw + 1 >= 0 && iw + 1 < IW) { | |||
| res = std::max(res, sptr[iw + 1]); | |||
| } | |||
| if (iw + 2 >= 0 && iw + 2 < IW) { | |||
| res = std::max(res, sptr[iw + 2]); | |||
| } | |||
| cache[0][ow] = res; | |||
| }; | |||
| // build odd/even | |||
| int iw = 0; | |||
| int odd_offset = 0, even_offset = 0; | |||
| for (; iw + 2 * 4 <= IW; iw += 2 * 4) { | |||
| GI_FLOAT32_t s0, s1, d0, d1; | |||
| s0 = GiLoadFloat32(sptr + iw); | |||
| s1 = GiLoadFloat32(sptr + iw + 4); | |||
| GI_UZP(s0, s1, d0, d1); | |||
| GiStoreFloat32(even + even_offset, d0); | |||
| GiStoreFloat32(odd + odd_offset, d1); | |||
| even_offset += 4; | |||
| odd_offset += 4; | |||
| } | |||
| for (; iw < IW; ++iw) { | |||
| if (iw & 1) | |||
| odd[odd_offset++] = sptr[iw]; | |||
| else | |||
| even[even_offset++] = sptr[iw]; | |||
| } | |||
| int ow = 0; | |||
| for (; ow < OW_from; ++ow) | |||
| run_single(ow); | |||
| if (PW & 1) { | |||
| for (; ow + 4 <= OW_to; ow += 4) { | |||
| GI_FLOAT32_t d, s0, s1, s2; | |||
| s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1); | |||
| s1 = GiLoadFloat32(even + ow - (PW >> 1)); | |||
| s2 = GiLoadFloat32(odd + ow - (PW >> 1)); | |||
| d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2); | |||
| GiStoreFloat32(cache[0] + ow, d); | |||
| } | |||
| } else { | |||
| for (; ow + 4 <= OW_to; ow += 4) { | |||
| GI_FLOAT32_t d, s0, s1, s2; | |||
| s0 = GiLoadFloat32(even + ow - (PW >> 1)); | |||
| s1 = GiLoadFloat32(odd + ow - (PW >> 1)); | |||
| s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1); | |||
| d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2); | |||
| GiStoreFloat32(cache[0] + ow, d); | |||
| } | |||
| } | |||
| for (; ow < OW; ++ow) | |||
| run_single(ow); | |||
| }; | |||
| for (int oh = 0; oh < OH; ++oh) { | |||
| float* __restrict dptr = dst + oh * OW; | |||
| int ih_from = std::min(IH, std::max(0, oh * 2 - PH)); | |||
| int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3)); | |||
| while (ih_next < ih_to) { | |||
| process_cache(ih_next++); | |||
| } | |||
| if (ih_to - ih_from == 3) { | |||
| int ow = 0; | |||
| for (; ow + 4 <= OW; ow += 4) { | |||
| GI_FLOAT32_t d, s0, s1, s2; | |||
| s0 = GiLoadFloat32(cache[0] + ow); | |||
| s1 = GiLoadFloat32(cache[1] + ow); | |||
| s2 = GiLoadFloat32(cache[2] + ow); | |||
| d = GiMaximumFloat32(GiMaximumFloat32(s0, s1), s2); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| for (; ow < OW; ++ow) { | |||
| dptr[ow] = std::max(std::max(cache[0][ow], cache[1][ow]), cache[2][ow]); | |||
| } | |||
| } else { | |||
| std::memcpy(dptr, cache[0], sizeof(float) * OW); | |||
| for (int i = 1; i < ih_to - ih_from; ++i) { | |||
| int ow = 0; | |||
| for (; ow + 4 <= OW; ow += 4) { | |||
| GI_FLOAT32_t d, s; | |||
| s = GiLoadFloat32(cache[i] + ow); | |||
| d = GiLoadFloat32(dptr + ow); | |||
| d = GiMaximumFloat32(d, s); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| for (; ow < OW; ++ow) { | |||
| dptr[ow] = std::max(dptr[ow], cache[i][ow]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,26 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/do_max_pooling_3x3_s2x2_float.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "src/common/utils.h" | |||
| #include "megdnn/arch.h" | |||
| #include "src/fallback/general_intrinsic/gi_float.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| void do_max_pooling_3x3_s2x2_float_gi( | |||
| const float* src, float* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_, | |||
| size_t PH_, size_t PW_, const WorkspaceBundle& ws); | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,89 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "do_max_pooling_w4x4_s2x2.h" | |||
| #include "pooling_helper.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| void do_max_pooling_w4x4_s2x2_float_gi( | |||
| const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH, | |||
| const int IW, const int OH, const int OW, const int PH, const int PW) { | |||
| const int window = 4; | |||
| const int stride = 2; | |||
| using Pooler = MaxPooler<16, dt_float32, float, float>; | |||
| int oh = 0; | |||
| for (; oh < OH && -PH + stride * oh < 0; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW && -PW + stride * ow < 0; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| dt_float32 last_hf_res = -std::numeric_limits<dt_float32>::infinity(); | |||
| int ih = -PH + stride * oh, iw = -PW + stride * ow; | |||
| if (-PW + stride * ow + window <= IW) { | |||
| GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw), | |||
| i1 = GiLoadFloat32(src + (ih + 1) * IW + iw), | |||
| i2 = GiLoadFloat32(src + (ih + 2) * IW + iw), | |||
| i3 = GiLoadFloat32(src + (ih + 3) * IW + iw); | |||
| GI_FLOAT32_t sum0 = GiMaximumFloat32( | |||
| GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3)); | |||
| float32x2_t t = | |||
| GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); | |||
| dst[oh * OW + ow] = | |||
| std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1)); | |||
| last_hf_res = GiGetLaneFloat32(t, 1); | |||
| ow += 1; | |||
| } | |||
| for (; ow + 1 < OW && -PW + stride * (ow + 1) + window <= IW; ow += 2) { | |||
| iw = -PW + stride * (ow + 1); | |||
| GI_FLOAT32_t i0 = GiLoadFloat32(src + (ih + 0) * IW + iw), | |||
| i1 = GiLoadFloat32(src + (ih + 1) * IW + iw), | |||
| i2 = GiLoadFloat32(src + (ih + 2) * IW + iw), | |||
| i3 = GiLoadFloat32(src + (ih + 3) * IW + iw); | |||
| GI_FLOAT32_t sum0 = GiMaximumFloat32( | |||
| GiMaximumFloat32(i0, i1), GiMaximumFloat32(i2, i3)); | |||
| float32x2_t t = | |||
| GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); | |||
| dst[oh * OW + ow + 0] = std::max(GiGetLaneFloat32(t, 0), last_hf_res); | |||
| dst[oh * OW + ow + 1] = | |||
| std::max(GiGetLaneFloat32(t, 0), GiGetLaneFloat32(t, 1)); | |||
| last_hf_res = GiGetLaneFloat32(t, 1); | |||
| } | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| for (; oh < OH; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| } | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,24 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/do_max_pooling_w4x4_s2x2.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include "src/fallback/pooling/opr_impl.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| void do_max_pooling_w4x4_s2x2_float_gi( | |||
| const dt_float32* src, dt_float32* dst, DType src_dtype, const int IH, | |||
| const int IW, const int OH, const int OW, const int PH, const int PW); | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,306 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/kern_fp32_pooling_nchw44.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include <limits> | |||
| #include "megdnn/opr_param_defs.h" | |||
| #include "src/common/unroll_macro.h" | |||
| #include "src/fallback/general_intrinsic/gi_float.h" | |||
| #include "src/fallback/gi_intrinsic_helper.h" | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| namespace { | |||
| template < | |||
| int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1, | |||
| typename T2> | |||
| struct CalXsXNchw44 { | |||
| static void impl(T1 result, T2 src); | |||
| }; | |||
| struct GiD1Qf32 { | |||
| static GI_FORCEINLINE GI_FLOAT32_t impl(const float32_t* ptr) { | |||
| return GiLoadFloat32(ptr); | |||
| } | |||
| }; | |||
| template < | |||
| int filter, int stride, int ow_step, PoolingBase::Mode mode, typename T1, | |||
| typename T2> | |||
| void calculate_xsx_nchw44(T1 result, T2 src) { | |||
| CalXsXNchw44<filter, stride, ow_step, mode, T1, T2>::impl(result, src); | |||
| }; | |||
| #define CALCULATE_MAX_CB(step) \ | |||
| result[0] = GiMaximumFloat32(result[0], src[0 * stride + step]); \ | |||
| result[1] = GiMaximumFloat32(result[1], src[1 * stride + step]); \ | |||
| result[2] = GiMaximumFloat32(result[2], src[2 * stride + step]); \ | |||
| result[3] = GiMaximumFloat32(result[3], src[3 * stride + step]); | |||
| #define CALCULATE_AVG_CB(step) \ | |||
| result[0] = GiAddFloat32(result[0], src[0 * stride + step]); \ | |||
| result[1] = GiAddFloat32(result[1], src[1 * stride + step]); \ | |||
| result[2] = GiAddFloat32(result[2], src[2 * stride + step]); \ | |||
| result[3] = GiAddFloat32(result[3], src[3 * stride + step]); | |||
| #define INSTANCE_CAL(filter) \ | |||
| template <int stride, typename T1, typename T2> \ | |||
| struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::MAX, T1, T2> { \ | |||
| static void impl(T1 result, T2 src) { \ | |||
| UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB); \ | |||
| } \ | |||
| }; \ | |||
| template <int stride, typename T1, typename T2> \ | |||
| struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::AVERAGE, T1, T2> { \ | |||
| static void impl(T1 result, T2 src) { \ | |||
| UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB); \ | |||
| } \ | |||
| }; | |||
| INSTANCE_CAL(2) | |||
| INSTANCE_CAL(3) | |||
| INSTANCE_CAL(4) | |||
| INSTANCE_CAL(5) | |||
| INSTANCE_CAL(9) | |||
| INSTANCE_CAL(13) | |||
| #undef INSTANCE_CAL | |||
| #undef CALCULATE_AVG_CB | |||
| #undef CALCULATE_MAX_CB | |||
| template <int filter, int stride, int ow_step, PoolingBase::Mode mode> | |||
| struct KerPoolingFilterXStrideXNchw44 { | |||
| static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw); | |||
| }; | |||
| template <int filter, int stride, int ow_step> | |||
| struct KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, PoolingBase::Mode::MAX> { | |||
| static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) { | |||
| constexpr int src_reg_size = ow_step * stride + filter - stride; | |||
| constexpr int packed_ic = 4; | |||
| constexpr int simd_len = 4; | |||
| constexpr float default_float = std::numeric_limits<float>::lowest(); | |||
| GI_FLOAT32_t result[ow_step]; | |||
| GI_FLOAT32_t src[src_reg_size]; | |||
| result[0] = GiBroadcastFloat32(default_float); | |||
| result[1] = GiBroadcastFloat32(default_float); | |||
| result[2] = GiBroadcastFloat32(default_float); | |||
| result[3] = GiBroadcastFloat32(default_float); | |||
| for (int fh_idx = 0; fh_idx < filter; ++fh_idx) { | |||
| load_helper<src_reg_size, 0, simd_len, 0, GiD1Qf32>( | |||
| src, src_ptr + fh_idx * iw * packed_ic, 0); | |||
| calculate_xsx_nchw44<filter, stride, ow_step, PoolingBase::Mode::MAX>( | |||
| result, src); | |||
| } | |||
| GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]); | |||
| GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]); | |||
| GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]); | |||
| GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]); | |||
| } | |||
| }; | |||
| template <int filter, int stride, int ow_step> | |||
| struct KerPoolingFilterXStrideXNchw44< | |||
| filter, stride, ow_step, PoolingBase::Mode::AVERAGE> { | |||
| static void impl(const float32_t* src_ptr, float32_t* dst_ptr, size_t iw) { | |||
| constexpr int src_reg_size = ow_step * stride + filter - stride; | |||
| constexpr int packed_ic = 4; | |||
| constexpr int simd_len = 4; | |||
| constexpr float default_float = 0; | |||
| constexpr float div_filter_size = 1.f / (filter * filter); | |||
| const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size); | |||
| GI_FLOAT32_t result[ow_step]; | |||
| GI_FLOAT32_t src[src_reg_size]; | |||
| result[0] = GiBroadcastFloat32(default_float); | |||
| result[1] = GiBroadcastFloat32(default_float); | |||
| result[2] = GiBroadcastFloat32(default_float); | |||
| result[3] = GiBroadcastFloat32(default_float); | |||
| for (int fh_idx = 0; fh_idx < filter; ++fh_idx) { | |||
| load_helper<src_reg_size, 0, simd_len, 0, GiD1Qf32>( | |||
| src, src_ptr + fh_idx * iw * packed_ic, 0); | |||
| calculate_xsx_nchw44<filter, stride, ow_step, PoolingBase::Mode::AVERAGE>( | |||
| result, src); | |||
| } | |||
| result[0] = GiMultiplyFloat32(result[0], div_filter_size_vec); | |||
| result[1] = GiMultiplyFloat32(result[1], div_filter_size_vec); | |||
| result[2] = GiMultiplyFloat32(result[2], div_filter_size_vec); | |||
| result[3] = GiMultiplyFloat32(result[3], div_filter_size_vec); | |||
| GiStoreFloat32(dst_ptr + 0 * packed_ic, result[0]); | |||
| GiStoreFloat32(dst_ptr + 1 * packed_ic, result[1]); | |||
| GiStoreFloat32(dst_ptr + 2 * packed_ic, result[2]); | |||
| GiStoreFloat32(dst_ptr + 3 * packed_ic, result[3]); | |||
| } | |||
| }; | |||
| template <PoolingBase::Mode mode> | |||
| void ker_pooling_nchw44_remain_pad( | |||
| const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top, | |||
| const int pad_bottom, const int pad_left, const int pad_right, | |||
| const int filter); | |||
| template <> | |||
| void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::MAX>( | |||
| const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top, | |||
| const int pad_bottom, const int pad_left, const int pad_right, | |||
| const int filter) { | |||
| constexpr int ic_step = 4; | |||
| const int ih_end = filter - pad_bottom; | |||
| const int iw_end = filter - pad_right; | |||
| GI_FLOAT32_t result = GiBroadcastFloat32(std::numeric_limits<float>::lowest()); | |||
| for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) { | |||
| for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) { | |||
| GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step); | |||
| result = GiMaximumFloat32(result, src); | |||
| } | |||
| src_ptr += iw * ic_step; | |||
| } | |||
| GiStoreFloat32(dst_ptr, result); | |||
| } | |||
| template <> | |||
| void ker_pooling_nchw44_remain_pad<PoolingBase::Mode::AVERAGE>( | |||
| const float32_t* src_ptr, float32_t* dst_ptr, const int iw, const int pad_top, | |||
| const int pad_bottom, const int pad_left, const int pad_right, | |||
| const int filter) { | |||
| constexpr int ic_step = 4; | |||
| const int ih_end = filter - pad_bottom; | |||
| const int iw_end = filter - pad_right; | |||
| const float div_filter_size = 1.f / (filter * filter); | |||
| const GI_FLOAT32_t div_filter_size_vec = GiBroadcastFloat32(div_filter_size); | |||
| GI_FLOAT32_t result = GiBroadcastFloat32(0.f); | |||
| for (int ih_idx = pad_top; ih_idx < ih_end; ++ih_idx) { | |||
| for (int iw_idx = pad_left; iw_idx < iw_end; ++iw_idx) { | |||
| GI_FLOAT32_t src = GiLoadFloat32(src_ptr + (iw_idx - pad_left) * ic_step); | |||
| result = GiAddFloat32(result, src); | |||
| } | |||
| src_ptr += iw * ic_step; | |||
| } | |||
| result = GiMultiplyFloat32(result, div_filter_size_vec); | |||
| GiStoreFloat32(dst_ptr, result); | |||
| } | |||
| template <PoolingBase::Mode mode> | |||
| static inline void kern_pooling_with_pad_nchw44( | |||
| const float32_t* src, float32_t* dst, const int filter, const int ow_start, | |||
| const int ow_end, const int iw, const int ow, const int stride_w, const int pw, | |||
| const int real_ih_idx, const int oh_idx, const int pad_top, | |||
| const int pad_bottom) { | |||
| constexpr int ic_step = 4; | |||
| constexpr int oc_step = 4; | |||
| for (int ow_idx = ow_start; ow_idx < ow_end; ++ow_idx) { | |||
| const int iw_idx = ow_idx * stride_w; | |||
| const int real_iw_idx = std::max(iw_idx - pw, 0); | |||
| const int pad_left = std::max(0, pw - iw_idx); | |||
| const int pad_right = std::max(0, iw_idx - pw + filter - iw); | |||
| const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step; | |||
| const int dst_offset = (oh_idx * ow + ow_idx) * oc_step; | |||
| ker_pooling_nchw44_remain_pad<mode>( | |||
| src + src_offset, dst + dst_offset, iw, pad_top, pad_bottom, pad_left, | |||
| pad_right, filter); | |||
| } | |||
| } | |||
| template <int filter, int stride, PoolingBase::Mode mode> | |||
| static inline void pooling_fp32_nchw44_pad( | |||
| const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph, | |||
| int pw) { | |||
| constexpr int stride_h = stride; | |||
| constexpr int stride_w = stride; | |||
| constexpr int ic_step = 4; | |||
| constexpr int oc_step = 4; | |||
| constexpr int ow_step = 4; | |||
| const int ow_pad_left_end = div_ceil(pw, stride_w); | |||
| const int ow_pad_right_end = (iw - filter + pw - 1) / stride_w; | |||
| const int ow_pad_right_step_end = | |||
| (ow_pad_right_end - ow_pad_left_end) / ow_step * ow_step + ow_pad_left_end; | |||
| rep(oh_idx, oh) { | |||
| const int ih_idx = oh_idx * stride_h; | |||
| const int real_ih_idx = std::max(ih_idx - ph, 0); | |||
| const int pad_top = std::max(0, ph - ih_idx); | |||
| const int pad_bottom = std::max(0, ih_idx - ph + filter - ih); | |||
| if (pad_top > 0 || pad_bottom > 0) { | |||
| kern_pooling_with_pad_nchw44<mode>( | |||
| src, dst, filter, 0, ow, iw, ow, stride_w, pw, real_ih_idx, oh_idx, | |||
| pad_top, pad_bottom); | |||
| } else { | |||
| kern_pooling_with_pad_nchw44<mode>( | |||
| src, dst, filter, 0, ow_pad_left_end, iw, ow, stride_w, pw, | |||
| real_ih_idx, oh_idx, pad_top, pad_bottom); | |||
| for (int ow_idx = ow_pad_left_end; ow_idx < ow_pad_right_step_end; | |||
| ow_idx += ow_step) { | |||
| const int iw_idx = ow_idx * stride_w; | |||
| const int real_iw_idx = std::max(iw_idx - pw, 0); | |||
| const int src_offset = (real_ih_idx * iw + real_iw_idx) * ic_step; | |||
| const int dst_offset = (oh_idx * ow + ow_idx) * oc_step; | |||
| KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl( | |||
| src + src_offset, dst + dst_offset, iw); | |||
| } | |||
| kern_pooling_with_pad_nchw44<mode>( | |||
| src, dst, filter, ow_pad_right_step_end, ow, iw, ow, stride_w, pw, | |||
| real_ih_idx, oh_idx, pad_top, pad_bottom); | |||
| } | |||
| } | |||
| } | |||
| template <int filter, int stride, PoolingBase::Mode mode> | |||
| static inline void pooling_fp32_nchw44_no_pad( | |||
| const float32_t* src, float32_t* dst, int, int iw, int oh, int ow) { | |||
| constexpr int stride_h = stride; | |||
| constexpr int stride_w = stride; | |||
| constexpr int ic_step = 4; | |||
| constexpr int oc_step = 4; | |||
| constexpr int ow_step = 4; | |||
| const int ow_end = ow / ow_step * ow_step; | |||
| const int ow_remain = ow - ow_end; | |||
| rep(oh_idx, oh) { | |||
| const int ih_idx = oh_idx * stride_h; | |||
| const int src_ih_offset = ih_idx * iw; | |||
| const int dst_oh_offset = oh_idx * ow; | |||
| for (int ow_idx = 0; ow_idx < ow_end; ow_idx += ow_step) { | |||
| const int iw_idx = ow_idx * stride_w; | |||
| const int src_offset = (src_ih_offset + iw_idx) * ic_step; | |||
| const int dst_offset = (dst_oh_offset + ow_idx) * oc_step; | |||
| KerPoolingFilterXStrideXNchw44<filter, stride, ow_step, mode>::impl( | |||
| src + src_offset, dst + dst_offset, iw); | |||
| } | |||
| if (ow_remain > 0) { | |||
| kern_pooling_with_pad_nchw44<mode>( | |||
| src, dst, filter, ow_end, ow, iw, ow, stride_w, 0, ih_idx, oh_idx, | |||
| 0, 0); | |||
| } | |||
| } | |||
| } | |||
| template <int filter, int stride, PoolingBase::Mode mode> | |||
| static inline void pooling_fp32_nchw44( | |||
| const float32_t* src, float32_t* dst, int ih, int iw, int oh, int ow, int ph, | |||
| int pw) { | |||
| if (ph > 0 || pw > 0) { | |||
| pooling_fp32_nchw44_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow, ph, pw); | |||
| } else { | |||
| pooling_fp32_nchw44_no_pad<filter, stride, mode>(src, dst, ih, iw, oh, ow); | |||
| } | |||
| } | |||
| } // namespace | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -0,0 +1,572 @@ | |||
| /** | |||
| * \file dnn/src/fallback/pooling/gi/pooling_helper.h | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "do_max_pooling_3x3_s2x2_float.h" | |||
| #include "megdnn/dtype.h" | |||
| #include "src/common/unroll_macro.h" | |||
| #include "src/common/utils.h" | |||
| namespace { | |||
| /* ======================= MeanPooler ======================== */ | |||
| using namespace megdnn; | |||
/**
 * \brief Shared state of the scalar mean poolers: a running sum plus the
 * factor 1 / area used to turn it into a mean.
 * \tparam area the pooling area size, FH * FW
 * \tparam dtype the input type
 * \tparam ctype the inner raw type
 * \tparam comp_type compute type
 */
template <int area, typename dtype, typename ctype, typename comp_type>
struct MeanPoolerCommon {
    //! number of ctype elements in one 16-byte (128-bit) gi register
    static constexpr int SIMD_WIDTH = 16 / sizeof(ctype);
    //! reciprocal of the pooling area
    static constexpr comp_type coef = static_cast<comp_type>(1.0f) / area;

    //! running sum of all values fed so far
    comp_type res;

    MeanPoolerCommon() : res(0) {}

    //! adds the element \p val points to onto the running sum
    void feed(const ctype* val) {
        const ctype& sample = *val;
        res += sample;
    }
};

// out-of-class definition of the static data member
template <int area, typename dtype, typename ctype, typename comp_type>
constexpr comp_type MeanPoolerCommon<area, dtype, ctype, comp_type>::coef;
| template <int area, typename dtype, typename _ctype, typename comp_type> | |||
| struct MeanInPooler : MeanPoolerCommon<area, dtype, _ctype, comp_type> { | |||
| using ctype = _ctype; | |||
| //! `MIDOUT_CASE_NUM` is a unique int id | |||
| static constexpr int MIDOUT_CASE_NUM = 1; | |||
| MeanInPooler(DType) : MeanPoolerCommon<area, dtype, _ctype, comp_type>() {} | |||
| void post(ctype* dst) { | |||
| this->res *= this->coef; | |||
| *dst = this->res; | |||
| } | |||
| }; | |||
| template <int area, typename dtype, typename _ctype> | |||
| struct MeanInRoundPooler : MeanPoolerCommon<area, dtype, _ctype, float> { | |||
| using ctype = _ctype; | |||
| void post(ctype* dst) { | |||
| this->res *= this->coef; | |||
| *dst = std::round(this->res); | |||
| } | |||
| }; | |||
| template <int area, typename dtype, typename ctype, typename comp_type> | |||
| struct GiMeanPooler; | |||
| template <int area> | |||
| struct GiMeanPooler<area, dt_float32, float, float> { | |||
| using ctype = float; | |||
| static constexpr int MIDOUT_CASE_NUM = 1; | |||
| static constexpr int SIMD_WIDTH = 4; | |||
| static const GI_FLOAT32_t coef; | |||
| GI_FLOAT32_t res; | |||
| GiMeanPooler(DType) : res(GiBroadcastFloat32(0.0f)) {} | |||
| void feed(const float* val) { res = GiAddFloat32(res, GiLoadFloat32(val)); } | |||
| void post(float* dst) { | |||
| res = GiMultiplyFloat32(res, coef); | |||
| GiStoreFloat32(dst, res); | |||
| } | |||
| }; | |||
| template <int area> | |||
| const GI_FLOAT32_t GiMeanPooler<area, dt_float32, float, float>::coef = | |||
| GiBroadcastFloat32(1.0f / area); | |||
| /* ======================= MaxPooler ======================== */ | |||
| template <int area, typename dtype, typename _ctype, typename comp_type> | |||
| struct MaxPooler { | |||
| using ctype = _ctype; | |||
| static constexpr int MIDOUT_CASE_NUM = 11; | |||
| static constexpr int SIMD_WIDTH = 16 / sizeof(ctype); | |||
| static const ctype outsider; | |||
| ctype res; | |||
| MaxPooler(DType) : res(DTypeTrait<dtype>::min()) {} | |||
| void feed(const ctype* val) { res = std::max(res, *val); } | |||
| void post(ctype* dst) { *dst = res; } | |||
| }; | |||
| template <int area, typename dtype, typename ctype, typename comp_type> | |||
| const ctype MaxPooler<area, dtype, ctype, comp_type>::outsider = | |||
| DTypeTrait<dtype>::min(); | |||
| template <int area, typename dtype, typename ctype, typename comp_type> | |||
| struct GiMaxPooler; | |||
| template <int area> | |||
| struct GiMaxPooler<area, dt_float32, float, float> { | |||
| using ctype = float; | |||
| static constexpr int MIDOUT_CASE_NUM = 11; | |||
| static constexpr int SIMD_WIDTH = 4; | |||
| GI_FLOAT32_t res; | |||
| GiMaxPooler(DType) : res(GiBroadcastFloat32(DTypeTrait<dt_float32>::min())) {} | |||
| void feed(const float* val) { res = GiMaximumFloat32(res, GiLoadFloat32(val)); } | |||
| void post(float* dst) { GiStoreFloat32(dst, res); } | |||
| }; | |||
| template <typename Pooler, int window> | |||
| void do_pxl_naive( | |||
| int oh, int ow, const typename Pooler::ctype* src, typename Pooler::ctype* dst, | |||
| DType src_dtype, const int IH, const int IW, const int OH, const int OW, | |||
| const int PH, const int PW, const int SH, const int SW) { | |||
| MEGDNN_MARK_USED_VAR(OH); | |||
| Pooler pooler(src_dtype); | |||
| rep(wh, window) rep(ww, window) { | |||
| int ih = -PH + oh * SH + wh; | |||
| int iw = -PW + ow * SW + ww; | |||
| if (ih >= 0 && iw >= 0 && ih < IH && iw < IW) { | |||
| pooler.feed(src + ih * IW + iw); | |||
| } | |||
| } | |||
| pooler.post(dst + oh * OW + ow); | |||
| } | |||
| namespace detail { | |||
| template <typename Pooler, Pooling::Mode mode> | |||
| struct do_pxl_2x2_pack_proxy { | |||
| static void gao( | |||
| int oh, int ow, const typename Pooler::ctype* src, | |||
| typename Pooler::ctype* dst, DType, const int IH, const int IW, | |||
| const int OH, const int OW, const int PH, const int PW); | |||
| }; | |||
| template <> | |||
| struct do_pxl_2x2_pack_proxy< | |||
| MeanInPooler<4, dt_float32, float, float>, Pooling::Mode::AVERAGE> { | |||
| static void gao( | |||
| int oh, int ow, const dt_float32* src, dt_float32* dst, DType, const int IH, | |||
| const int IW, const int OH, const int OW, const int PH, const int PW) { | |||
| MEGDNN_MARK_USED_VAR(IH); | |||
| MEGDNN_MARK_USED_VAR(OH); | |||
| static const auto avg_coef = GiBroadcastFloat32(0.25f); | |||
| int ih = -PH + 2 * oh; | |||
| int iw = -PW + 2 * ow; | |||
| auto i00 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 0)), | |||
| i01 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 4)), | |||
| i10 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 0)), | |||
| i11 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 4)); | |||
| auto sum0 = GiAddFloat32(i00, i10), sum1 = GiAddFloat32(i01, i11); | |||
| auto vlow = GiPaddFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); | |||
| auto vhigh = GiPaddFloat32(GiGetLowFloat32(sum1), GiGetHighFloat32(sum1)); | |||
| auto comb = GiCombineFloat32(vlow, vhigh); | |||
| auto result = GiMultiplyFloat32(comb, avg_coef); | |||
| GiStoreFloat32(dst + oh * OW + ow, result); | |||
| } | |||
| }; | |||
| template <> | |||
| struct do_pxl_2x2_pack_proxy< | |||
| MaxPooler<4, dt_float32, float, float>, Pooling::Mode::MAX> { | |||
| static void gao( | |||
| int oh, int ow, const dt_float32* src, dt_float32* dst, DType, const int IH, | |||
| const int IW, const int OH, const int OW, const int PH, const int PW) { | |||
| MEGDNN_MARK_USED_VAR(IH); | |||
| MEGDNN_MARK_USED_VAR(OH); | |||
| int ih = -PH + 2 * oh; | |||
| int iw = -PW + 2 * ow; | |||
| auto i00 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 0)), | |||
| i01 = GiLoadFloat32(src + (ih + 0) * IW + (iw + 4)), | |||
| i10 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 0)), | |||
| i11 = GiLoadFloat32(src + (ih + 1) * IW + (iw + 4)); | |||
| auto sum0 = GiMaximumFloat32(i00, i10), sum1 = GiMaximumFloat32(i01, i11); | |||
| auto vlow = GiPmaxFloat32(GiGetLowFloat32(sum0), GiGetHighFloat32(sum0)); | |||
| auto vhigh = GiPmaxFloat32(GiGetLowFloat32(sum1), GiGetHighFloat32(sum1)); | |||
| auto comb = GiCombineFloat32(vlow, vhigh); | |||
| GiStoreFloat32(dst + oh * OW + ow, comb); | |||
| } | |||
| }; | |||
| } // namespace detail | |||
| template <typename Pooler, Pooling::Mode mode> | |||
| void do_pxl_2x2_pack( | |||
| int oh, int ow, const typename Pooler::ctype* src, typename Pooler::ctype* dst, | |||
| DType src_dtype, const int IH, const int IW, const int OH, const int OW, | |||
| const int PH, const int PW) { | |||
| detail::do_pxl_2x2_pack_proxy<Pooler, mode>::gao( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW); | |||
| } | |||
| template <typename GiPooler, int window> | |||
| void do_pxl_compact_packed( | |||
| int oh, int ow, const typename GiPooler::ctype* src, | |||
| typename GiPooler::ctype* dst, DType src_dtype, const int IH, const int IW, | |||
| const int OH, const int OW, const int PH, const int PW) { | |||
| MEGDNN_MARK_USED_VAR(IH); | |||
| MEGDNN_MARK_USED_VAR(OH); | |||
| GiPooler pooler(src_dtype); | |||
| rep(wh, window) rep(ww, window) { | |||
| int ih = -PH + oh + wh; | |||
| int iw = -PW + ow + ww; | |||
| pooler.feed(src + ih * IW + iw); | |||
| } | |||
| pooler.post(dst + oh * OW + ow); | |||
| } | |||
| template <typename Pooler, typename GiPooler, int window> | |||
| void do_pooling_compact( | |||
| const typename Pooler::ctype* src, typename Pooler::ctype* dst, DType src_dtype, | |||
| const int IH, const int IW, const int OH, const int OW, const int PH, | |||
| const int PW) { | |||
| static_assert( | |||
| std::is_same<typename Pooler::ctype, typename GiPooler::ctype>::value, | |||
| "ctype of Pooler and GiPooler is not the same"); | |||
| const int stride = 1; | |||
| int oh = 0; | |||
| for (; oh < OH && oh - PH < 0; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| for (; oh < OH && oh - PH + window <= IH; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW && ow - PW < 0; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| for (; ow + GiPooler::SIMD_WIDTH <= OW && | |||
| ow + GiPooler::SIMD_WIDTH - 1 - PW + window <= IW; | |||
| ow += GiPooler::SIMD_WIDTH) { | |||
| do_pxl_compact_packed<GiPooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW); | |||
| } | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| for (; oh < OH; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| } | |||
| template <typename Pooler, Pooling::Mode mode> | |||
| void do_pooling_2x2( | |||
| const typename Pooler::ctype* src, typename Pooler::ctype* dst, DType src_dtype, | |||
| const int IH, const int IW, const int OH, const int OW, const int PH, | |||
| const int PW) { | |||
| const int window = 2; | |||
| const int stride = 2; | |||
| int oh = 0; | |||
| for (; oh < OH && -PH + stride * oh < 0; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| for (; oh < OH && -PH + stride * oh + window <= IH; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW && -PW + stride * ow < 0; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| for (; ow + Pooler::SIMD_WIDTH <= OW && | |||
| -PW + stride * (ow + Pooler::SIMD_WIDTH - 1) + window <= IW; | |||
| ow += Pooler::SIMD_WIDTH) { | |||
| do_pxl_2x2_pack<Pooler, mode>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW); | |||
| } | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| for (; oh < OH; ++oh) { | |||
| int ow = 0; | |||
| for (; ow < OW; ++ow) { | |||
| do_pxl_naive<Pooler, window>( | |||
| oh, ow, src, dst, src_dtype, IH, IW, OH, OW, PH, PW, stride, | |||
| stride); | |||
| } | |||
| } | |||
| } | |||
| template <typename dtype, typename ctype> | |||
| void do_max_pooling_w5x5_s2x2_gi( | |||
| const ctype* src, ctype* dst, const int IH, const int IW, const int OH, | |||
| const int OW, const int PH, const int PW, const WorkspaceBundle& ws, | |||
| const int MEGDNN_SIMD_WIDTH) { | |||
| ctype* cache[5] = { | |||
| static_cast<ctype*>(ws.get(0)), static_cast<ctype*>(ws.get(1)), | |||
| static_cast<ctype*>(ws.get(2)), static_cast<ctype*>(ws.get(3)), | |||
| static_cast<ctype*>(ws.get(4))}; | |||
| ctype* odd = static_cast<ctype*>(ws.get(5)); | |||
| ctype* even = static_cast<ctype*>(ws.get(6)); | |||
| int ih_next = 0; | |||
| int OW_from = (PW + 1) / 2, OW_to = (IW + PW - 5) / 2 + 1; | |||
| auto process_cache = [&](int ih) { | |||
| const ctype* __restrict sptr = src + ih * IW; | |||
| auto tmp = cache[4]; | |||
| for (auto i = 4; i >= 1; --i) | |||
| cache[i] = cache[i - 1]; | |||
| cache[0] = tmp; | |||
| auto run_single = [&](int ow) { | |||
| int iw = ow * 2 - PW; | |||
| ctype res = std::numeric_limits<dtype>::lowest(); | |||
| for (auto i = 0; i < 5; ++i) | |||
| if (iw + i >= 0 && iw + i < IW) | |||
| res = std::max(res, sptr[iw + i]); | |||
| cache[0][ow] = res; | |||
| }; | |||
| int iw = 0; | |||
| int odd_offset = 0, even_offset = 0; | |||
| for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(sptr + iw + 0); | |||
| auto s1 = GiLoadFloat32(sptr + iw + MEGDNN_SIMD_WIDTH); | |||
| auto d = GiUzpqFloat32(s0, s1); | |||
| GiStoreFloat32(even + even_offset, d.val[0]); | |||
| GiStoreFloat32(odd + odd_offset, d.val[1]); | |||
| even_offset += MEGDNN_SIMD_WIDTH; | |||
| odd_offset += MEGDNN_SIMD_WIDTH; | |||
| } | |||
| for (; iw < IW; ++iw) { | |||
| if (iw & 1) | |||
| odd[odd_offset++] = sptr[iw]; | |||
| else | |||
| even[even_offset++] = sptr[iw]; | |||
| } | |||
| int ow = 0; | |||
| for (; ow < OW_from; ++ow) | |||
| run_single(ow); | |||
| if (PW & 1) { | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1); | |||
| auto s1 = GiLoadFloat32(even + ow - (PW >> 1)); | |||
| auto s2 = GiLoadFloat32(odd + ow - (PW >> 1)); | |||
| auto s3 = GiLoadFloat32(even + ow - (PW >> 1) + 1); | |||
| auto s4 = GiLoadFloat32(odd + ow - (PW >> 1) + 1); | |||
| auto d = GiMaximumFloat32( | |||
| s0, | |||
| GiMaximumFloat32( | |||
| GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4))); | |||
| GiStoreFloat32(cache[0] + ow, d); | |||
| } | |||
| } else { | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(even + ow - (PW >> 1)); | |||
| auto s1 = GiLoadFloat32(odd + ow - (PW >> 1)); | |||
| auto s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1); | |||
| auto s3 = GiLoadFloat32(odd + ow - (PW >> 1) + 1); | |||
| auto s4 = GiLoadFloat32(even + ow - (PW >> 1) + 2); | |||
| auto d = GiMaximumFloat32( | |||
| s0, | |||
| GiMaximumFloat32( | |||
| GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4))); | |||
| GiStoreFloat32(cache[0] + ow, d); | |||
| } | |||
| } | |||
| for (; ow < OW; ++ow) | |||
| run_single(ow); | |||
| }; | |||
| for (int oh = 0; oh < OH; ++oh) { | |||
| ctype* __restrict dptr = dst + oh * OW; | |||
| int ih_from = std::min(IH, std::max(0, oh * 2 - PH)); | |||
| int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 5)); | |||
| while (ih_next < ih_to) | |||
| process_cache(ih_next++); | |||
| if (ih_to - ih_from == 5) { | |||
| int ow = 0; | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(cache[0] + ow); | |||
| auto s1 = GiLoadFloat32(cache[1] + ow); | |||
| auto s2 = GiLoadFloat32(cache[2] + ow); | |||
| auto s3 = GiLoadFloat32(cache[3] + ow); | |||
| auto s4 = GiLoadFloat32(cache[4] + ow); | |||
| auto d = GiMaximumFloat32( | |||
| s0, | |||
| GiMaximumFloat32( | |||
| GiMaximumFloat32(s1, s2), GiMaximumFloat32(s3, s4))); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| for (; ow < OW; ++ow) | |||
| dptr[ow] = std::max( | |||
| {cache[0][ow], cache[1][ow], cache[2][ow], cache[3][ow], | |||
| cache[4][ow]}); | |||
| } else { | |||
| std::memcpy(dptr, cache[0], sizeof(ctype) * OW); | |||
| for (int i = 1; i < ih_to - ih_from; ++i) { | |||
| int ow = 0; | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s = GiLoadFloat32(cache[i] + ow); | |||
| auto d = GiLoadFloat32(dptr + ow); | |||
| d = GiMaximumFloat32(d, s); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| for (; ow < OW; ++ow) | |||
| dptr[ow] = std::max(dptr[ow], cache[i][ow]); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| template <typename ctype> | |||
| void do_average_pooling_3x3_s2x2_gi( | |||
| const ctype* src, ctype* dst, size_t IH_, size_t IW_, size_t OH_, size_t OW_, | |||
| size_t PH_, size_t PW_, const WorkspaceBundle& ws, | |||
| const int MEGDNN_SIMD_WIDTH) { | |||
| int IH = IH_, IW = IW_, OH = OH_, OW = OW_, PH = PH_, PW = PW_; | |||
| // cache[i] stores the answer of the i-th line after | |||
| // pooling along the W dimension. | |||
| ctype* cache[3] = { | |||
| static_cast<ctype*>(ws.get(0)), static_cast<ctype*>(ws.get(1)), | |||
| static_cast<ctype*>(ws.get(2))}; | |||
| ctype* odd = static_cast<ctype*>(ws.get(3)); | |||
| ctype* even = static_cast<ctype*>(ws.get(4)); | |||
| int ih_next = 0; | |||
| // "good" area means we can use SIMD to accelerate. | |||
| auto get_good_area = [](int I, int /* O */, int P, int& O_from, int& O_to) { | |||
| // x*2 - P >= 0; 2x >= P; x >= P/2 | |||
| O_from = (P + 1) / 2; | |||
| // x*2 - P + 3 <= I; x*2 <= I+P-3; x <= (I+P-3)/2 | |||
| O_to = (I + P - 3) / 2 + 1; | |||
| // we must have I >= 2 to ensure O_from <= O_to | |||
| }; | |||
| int OW_from, OW_to; | |||
| get_good_area(IW, OW, PW, OW_from, OW_to); | |||
| auto process_cache = [&](int ih) { | |||
| const ctype* __restrict sptr = src + ih * IW; | |||
| auto tmp = cache[2]; | |||
| cache[2] = cache[1]; | |||
| cache[1] = cache[0]; | |||
| cache[0] = tmp; | |||
| // cache 0 is used to store the current answer. | |||
| auto run_single = [&](int ow) { | |||
| int iw = ow * 2 - PW; | |||
| ctype res = 0; | |||
| if (iw + 0 >= 0 && iw + 0 < IW) { | |||
| res += sptr[iw + 0]; | |||
| } | |||
| if (iw + 1 >= 0 && iw + 1 < IW) { | |||
| res += sptr[iw + 1]; | |||
| } | |||
| if (iw + 2 >= 0 && iw + 2 < IW) { | |||
| res += sptr[iw + 2]; | |||
| } | |||
| cache[0][ow] = res; | |||
| }; | |||
| // build odd/even | |||
| int iw = 0; | |||
| int odd_offset = 0, even_offset = 0; | |||
| for (; iw + 2 * MEGDNN_SIMD_WIDTH <= IW; iw += 2 * MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLd2qFloat32(sptr + iw); | |||
| GiStoreFloat32(even + even_offset, s0.val[0]); | |||
| GiStoreFloat32(odd + odd_offset, s0.val[1]); | |||
| even_offset += MEGDNN_SIMD_WIDTH; | |||
| odd_offset += MEGDNN_SIMD_WIDTH; | |||
| } | |||
| for (; iw < IW; ++iw) { | |||
| if (iw & 1) | |||
| odd[odd_offset++] = sptr[iw]; | |||
| else | |||
| even[even_offset++] = sptr[iw]; | |||
| } | |||
| int ow = 0; | |||
| for (; ow < OW_from; ++ow) | |||
| run_single(ow); | |||
| if (PW & 1) { | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(odd + ow - (PW >> 1) - 1); | |||
| auto s1 = GiLoadFloat32(even + ow - (PW >> 1)); | |||
| auto s2 = GiLoadFloat32(odd + ow - (PW >> 1)); | |||
| auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2); | |||
| GiStoreFloat32(cache[0] + ow, d); | |||
| } | |||
| } else { | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW_to; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(even + ow - (PW >> 1)); | |||
| auto s1 = GiLoadFloat32(odd + ow - (PW >> 1)); | |||
| auto s2 = GiLoadFloat32(even + ow - (PW >> 1) + 1); | |||
| auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2); | |||
| GiStoreFloat32(cache[0] + ow, d); | |||
| } | |||
| } | |||
| for (; ow < OW; ++ow) | |||
| run_single(ow); | |||
| }; | |||
| for (int oh = 0; oh < OH; ++oh) { | |||
| ctype* __restrict dptr = dst + oh * OW; | |||
| int ih_from = std::min(IH, std::max(0, oh * 2 - PH)); | |||
| int ih_to = std::min(IH, std::max(0, oh * 2 - PH + 3)); | |||
| while (ih_next < ih_to) { | |||
| process_cache(ih_next++); | |||
| } | |||
| ctype factor = (1.0f / 9); | |||
| auto coef = GiBroadcastFloat32(factor); | |||
| if (ih_to - ih_from == 3) { | |||
| int ow = 0; | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s0 = GiLoadFloat32(cache[0] + ow); | |||
| auto s1 = GiLoadFloat32(cache[1] + ow); | |||
| auto s2 = GiLoadFloat32(cache[2] + ow); | |||
| auto d = GiAddFloat32(GiAddFloat32(s0, s1), s2); | |||
| d = GiMultiplyFloat32(d, coef); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| #if MEGDNN_FIX_AARCH32_BUG | |||
| // FIXME: as llvm may cause cannot select error if enable vectorize | |||
| #pragma clang loop vectorize(disable) | |||
| #endif | |||
| for (; ow < OW; ++ow) { | |||
| dptr[ow] = (cache[0][ow] + cache[1][ow] + cache[2][ow]) * factor; | |||
| } | |||
| } else { | |||
| std::memcpy(dptr, cache[0], sizeof(ctype) * OW); | |||
| int i = 1; | |||
| for (; i < ih_to - ih_from; ++i) { | |||
| int ow = 0; | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto s = GiLoadFloat32(cache[i] + ow); | |||
| auto d = GiLoadFloat32(dptr + ow); | |||
| d = GiAddFloat32(d, s); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| for (; ow < OW; ++ow) { | |||
| dptr[ow] = (dptr[ow] + cache[i][ow]); | |||
| } | |||
| } | |||
| int ow = 0; | |||
| for (; ow + MEGDNN_SIMD_WIDTH <= OW; ow += MEGDNN_SIMD_WIDTH) { | |||
| auto d = GiLoadFloat32(dptr + ow); | |||
| d = GiMultiplyFloat32(d, coef); | |||
| GiStoreFloat32(dptr + ow, d); | |||
| } | |||
| #if MEGDNN_FIX_AARCH32_BUG | |||
| // FIXME: as llvm may cause cannot select error if enable vectorize | |||
| #pragma clang loop vectorize(disable) | |||
| #endif | |||
| for (; ow < OW; ++ow) { | |||
| dptr[ow] *= factor; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } // anonymous namespace | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -6,18 +6,186 @@ | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
| * implied. | |||
| */ | |||
| #include "src/fallback/pooling/opr_impl.h" | |||
| #include <cstring> | |||
| #include "src/common/utils.h" | |||
| #include "src/naive/handle.h" | |||
| #include "src/common/algo_chooser.h" | |||
| #include "src/common/metahelper.h" | |||
| #include "src/fallback/pooling/gi/algo.h" | |||
| #include "midout.h" | |||
| MIDOUT_DECL(megdnn_fallback_pooling) | |||
| using namespace megdnn; | |||
| using namespace fallback; | |||
| class PoolingImpl::AlgoPack : NonCopyableObj { | |||
| private: | |||
| AlgoBase::Mapper m_all_algos_map; | |||
| AlgoGiFilterxModexStride1 algo_gi_filterx_modex_stride1; | |||
| AlgoGiFilter2ModexStride2 algo_gi_filter2_modex_stride2; | |||
| AlgoGiFilter3MaxStride2 algo_gi_filter3_max_stride2; | |||
| AlgoGiFilter3AverageStride2 algo_gi_filter3_average_stride2; | |||
| AlgoGiFilter4MaxStride2 algo_gi_filter4_max_stride2; | |||
| AlgoGiFilter5MaxStride2 algo_gi_filter5_max_stride2; | |||
| AlgoGiFp32ModexStridexNCHW44 algo_gi_fp32_modex_stridex_nchw44; | |||
| AlgoFallback algo_fallback; | |||
| public: | |||
| AlgoPack() { | |||
| all_algos.emplace_back(&algo_gi_filterx_modex_stride1); | |||
| all_algos.emplace_back(&algo_gi_filter2_modex_stride2); | |||
| all_algos.emplace_back(&algo_gi_filter3_max_stride2); | |||
| all_algos.emplace_back(&algo_gi_filter3_average_stride2); | |||
| all_algos.emplace_back(&algo_gi_filter4_max_stride2); | |||
| all_algos.emplace_back(&algo_gi_filter5_max_stride2); | |||
| all_algos.emplace_back(&algo_gi_fp32_modex_stridex_nchw44); | |||
| all_algos.emplace_back(&algo_fallback); | |||
| for (auto&& algo : all_algos) { | |||
| m_all_algos_map.emplace(algo->info().desc, algo); | |||
| } | |||
| } | |||
| SmallVector<AlgoBase*> all_algos; | |||
| const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } | |||
| }; | |||
| PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack; | |||
| PoolingImpl::PoolingKernSizeParam PoolingImpl::make_pooling_kern_szie_param( | |||
| fallback::PoolingImpl* opr, const TensorLayout& src, const TensorLayout& dst) { | |||
| auto safe_u32 = [](size_t v) -> uint32_t { | |||
| megdnn_assert( | |||
| v <= std::numeric_limits<uint32_t>::max(), "value too large: %zu", v); | |||
| return v; | |||
| }; | |||
| return {safe_u32(src.shape[0]), | |||
| safe_u32(src.shape[1]), | |||
| {{safe_u32(src.shape[2]), safe_u32(src.shape[3])}}, | |||
| {{safe_u32(dst.shape[2]), safe_u32(dst.shape[3])}}, | |||
| {{safe_u32(opr->param().pad_h), safe_u32(opr->param().pad_w)}}, | |||
| {{safe_u32(opr->param().window_h), safe_u32(opr->param().window_w)}}, | |||
| {{safe_u32(opr->param().stride_h), safe_u32(opr->param().stride_w)}}, | |||
| src.dtype, | |||
| dst.dtype, | |||
| opr->handle(), | |||
| opr->param().format, | |||
| opr->param().mode}; | |||
| }; | |||
| PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( | |||
| fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) { | |||
| PoolingKernParam ret; | |||
| static_cast<PoolingKernSizeParam&>(ret) = | |||
| make_pooling_kern_szie_param(opr, src.layout, dst.layout); | |||
| ret.src_ptr = src.get_ref_ptr(); | |||
| ret.dst_ptr = dst.get_ref_ptr(); | |||
| ret.workspace_ptr = workspace.raw_ptr; | |||
| ret.workspace_size = workspace.size; | |||
| return ret; | |||
| }; | |||
| MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl); | |||
| std::vector<Algorithm*> PoolingImpl::get_all_algorithms( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| auto param = make_pooling_kern_szie_param(this, src, dst); | |||
| std::vector<Algorithm*> ret; | |||
| ret.reserve(algo_pack().all_algos.size()); | |||
| for (auto i : algo_pack().all_algos) { | |||
| if (i->usable(param)) { | |||
| ret.push_back(i); | |||
| } | |||
| } | |||
| return ret; | |||
| } | |||
| size_t PoolingImpl::get_workspace_in_bytes( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| TensorLayoutArray layouts{src, dst}; | |||
| AlgorithmCache::Key key{this->handle(), this->get_opr_type(), | |||
| layouts.data(), layouts.size(), | |||
| &this->param(), sizeof(this->param())}; | |||
| auto rst = AlgorithmCache::instance().get(key); | |||
| if (rst.policy.algo.valid()) { | |||
| return rst.workspace; | |||
| } | |||
| auto param = make_pooling_kern_szie_param(this, src, dst); | |||
| auto algo = static_cast<AlgoBase*>(fallback::PoolingImpl::get_algorithm_heuristic( | |||
| src, dst, std::numeric_limits<size_t>::max(), AlgoAttribute::DEFAULT, | |||
| AlgoAttribute::DEFAULT)); | |||
| if (!is_fallback_non_gi_algo(algo)) { | |||
| size_t fallback_gi_workspace = 0; | |||
| //! When multi-thread, every thread has its own workspace | |||
| size_t nr_threads = static_cast<naive::HandleImpl*>(handle()) | |||
| ->megcore_dispatcher() | |||
| ->nr_threads(); | |||
| if (param.src_type.category() == DTypeCategory::FLOAT && | |||
| param.filter[0] == param.filter[1] && | |||
| (param.filter[0] == 3 || param.filter[0] == 5) && | |||
| param.format == Param::Format::NCHW && | |||
| (param.mode == Mode::MAX || | |||
| (param.mode == Mode::AVERAGE && param.filter[0] == 3)) && | |||
| param.stride[0] == 2 && param.stride[1] == 2 && param.isz[0] >= 2 && | |||
| param.isz[1] >= 2) { | |||
| WorkspaceBundle ws = get_bundle(param); | |||
| fallback_gi_workspace = ws.total_size_in_bytes() * nr_threads; | |||
| } | |||
| return fallback_gi_workspace; | |||
| } else { | |||
| auto naive_worksapce = | |||
| naive::PoolingForwardImpl::get_workspace_in_bytes(src, dst); | |||
| return naive_worksapce; | |||
| } | |||
| } | |||
| void PoolingImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| auto param = make_pooling_kern_param(this, src, dst, workspace); | |||
| auto algo = static_cast<AlgoBase*>(fallback::PoolingImpl::get_algorithm_heuristic( | |||
| src.layout, dst.layout, std::numeric_limits<size_t>::max(), | |||
| AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT)); | |||
| if (!is_fallback_non_gi_algo(algo)) { | |||
| algo->exec(param); | |||
| } else { | |||
| exec_fallback(src, dst, workspace); | |||
| } | |||
| } | |||
| std::vector<Algorithm*> PoolingImpl::get_all_algorithms_safe( | |||
| const TensorLayout& src, const TensorLayout& dst) { | |||
| auto ret_safe = get_all_algorithms(src, dst); | |||
| megdnn_assert(!ret_safe.empty(), "no usable pooling fwd algorithm"); | |||
| return ret_safe; | |||
| } | |||
| Algorithm* PoolingImpl::get_algorithm_heuristic( | |||
| const TensorLayout& src, const TensorLayout& dst, | |||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||
| const AlgoAttribute& negative_attr) { | |||
| MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); | |||
| auto param = make_pooling_kern_szie_param(this, src, dst); | |||
| for (auto&& iter : sm_algo_pack.all_algos) { | |||
| if (iter->is_available_attribute(param, positive_attr, negative_attr)) { | |||
| return iter; | |||
| } | |||
| } | |||
| megdnn_throw(ssprintf( | |||
| "require algorithm with attribute(%s) and without " | |||
| "attribute(%s), but can't get suitable algo.\n", | |||
| Algorithm::attribute_str(positive_attr).c_str(), | |||
| Algorithm::attribute_str(negative_attr).c_str())); | |||
| return nullptr; | |||
| } | |||
| //! fallback implementation that does not use GI | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| namespace pooling { | |||
| @@ -140,9 +308,6 @@ void w2x2_s2x2_avg_int8( | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| namespace megdnn { | |||
| namespace fallback { | |||
| void PoolingImpl::exec_w3x3_s1x1( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param) { | |||
| auto N = src.layout.shape[0], C = src.layout.shape[1]; | |||
| @@ -179,7 +344,7 @@ void PoolingImpl::exec_w2x2_s2x2_avg_int8( | |||
| } | |||
| } | |||
| void PoolingImpl::exec( | |||
| void PoolingImpl::exec_fallback( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) { | |||
| Param param = this->param(); | |||
| check_exec(src.layout, dst.layout, workspace.size); | |||
| @@ -219,7 +384,4 @@ void PoolingImpl::exec( | |||
| naive::PoolingForwardImpl::exec(src, dst, workspace); | |||
| } | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -10,6 +10,7 @@ | |||
| * implied. | |||
| */ | |||
| #pragma once | |||
| #include <unordered_map> | |||
| #include "megdnn/oprs/base.h" | |||
| #include "src/naive/pooling/opr_impl.h" | |||
| @@ -17,19 +18,143 @@ namespace megdnn { | |||
| namespace fallback { | |||
| class PoolingImpl : public naive::PoolingForwardImpl { | |||
| private: | |||
| class AlgoGiFilterxModexStride1; | |||
| class AlgoGiFilter2ModexStride2; | |||
| class AlgoGiFilter3MaxStride2; | |||
| class AlgoGiFilter3AverageStride2; | |||
| class AlgoGiFilter4MaxStride2; | |||
| class AlgoGiFilter5MaxStride2; | |||
| class AlgoGiFp32ModexStridexNCHW44; | |||
| class AlgoFallback; | |||
| class AlgoPack; | |||
| static AlgoPack sm_algo_pack; | |||
| void exec_w3x3_s1x1( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param); | |||
| void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); | |||
| void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); | |||
| public: | |||
| using naive::PoolingForwardImpl::PoolingForwardImpl; | |||
| using Param = param::Pooling; | |||
| void exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace) override; | |||
| private: | |||
| void exec_w3x3_s1x1( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, const Param& param); | |||
| void exec_w2x2_s2x2_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); | |||
| void exec_w2x2_s2x2_avg_int8(_megdnn_tensor_in src, _megdnn_tensor_out dst); | |||
| void exec_fallback( | |||
| _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace); | |||
| size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override; | |||
| static size_t constexpr MAX_SPATIAL_DIM = 2; | |||
| struct PoolingKernSizeParam { | |||
| uint32_t n, ic; | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz; | |||
| std::array<uint32_t, MAX_SPATIAL_DIM> padding, filter, stride; | |||
| DType src_type, dst_type; | |||
| Handle* handle; | |||
| Param::Format format; | |||
| Mode mode; | |||
| }; | |||
| struct PoolingKernParam : public PoolingKernSizeParam { | |||
| RefPtr src_ptr; | |||
| RefPtr dst_ptr; | |||
| void* workspace_ptr; | |||
| size_t workspace_size; | |||
| template <typename T> | |||
| const T* src() const { | |||
| src_type.assert_is_compatible_ctype<T>(); | |||
| return static_cast<const T*>(src_ptr.get_ptr()); | |||
| } | |||
| template <typename T> | |||
| T* dst() const { | |||
| dst_type.assert_is_compatible_ctype<T>(); | |||
| return static_cast<T*>(dst_ptr.get_ptr()); | |||
| } | |||
| template <typename T> | |||
| T* workspace() const { | |||
| return static_cast<T*>(workspace_ptr); | |||
| } | |||
| }; | |||
| PoolingKernSizeParam make_pooling_kern_szie_param( | |||
| fallback::PoolingImpl* opr, const TensorLayout& src, | |||
| const TensorLayout& dst); | |||
| PoolingKernParam make_pooling_kern_param( | |||
| fallback::PoolingImpl* opr, _megdnn_tensor_in src, _megdnn_tensor_out dst, | |||
| _megdnn_workspace workspace); | |||
| class AlgoBase : public detail::Algorithm { | |||
| public: | |||
| enum class AlgoType : uint32_t { | |||
| GI_FilterxModexStride1, | |||
| GI_Filter2ModexStride2, | |||
| GI_Filter3MaxStride2, | |||
| GI_Filter3AverageStride2, | |||
| GI_Filter4MaxStride2, | |||
| GI_Filter5MaxStride2, | |||
| GI_Filter2ModexStridexNCHW44, | |||
| GI_Filter3ModexStridexNCHW44, | |||
| GI_Filter4ModexStridexNCHW44, | |||
| GI_Filter5ModexStridexNCHW44, | |||
| GI_Fp32ModexStridexNCHW44, | |||
| FallbackNotGI | |||
| }; | |||
| using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | |||
| AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::FALLBACK; } | |||
| virtual ~AlgoBase() = default; | |||
| virtual bool usable(const PoolingKernSizeParam& param) const = 0; | |||
| virtual void exec(const PoolingKernParam& param) const = 0; | |||
| uint32_t type() const override { return INVALID_ALGO_TYPE; }; | |||
| bool is_available_attribute( | |||
| const PoolingKernSizeParam& param, | |||
| const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE, | |||
| const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { | |||
| return contain_attribute_all(positive_attr) && | |||
| !contain_attribute_any(negative_attr) && usable(param); | |||
| } | |||
| }; | |||
| const char* get_algorithm_set_name() const override { | |||
| return "FALLBACK_POOLING_FORWARD"; | |||
| } | |||
| Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; | |||
| std::vector<Algorithm*> get_all_algorithms( | |||
| const TensorLayout& src, const TensorLayout& dst) override; | |||
| std::vector<Algorithm*> get_all_algorithms_safe( | |||
| const TensorLayout& src, const TensorLayout& dst) override; | |||
| Algorithm* get_algorithm_heuristic( | |||
| const TensorLayout& src, const TensorLayout& dst, | |||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||
| const AlgoAttribute& negative_attr) override; | |||
| AlgorithmInfo get_algorithm_info_heuristic( | |||
| const TensorLayout& src, const TensorLayout& dst, | |||
| size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, | |||
| const AlgoAttribute& negative_attr) { | |||
| return fallback::PoolingImpl::get_algorithm_heuristic( | |||
| src, dst, workspace_limit_in_bytes, positive_attr, negative_attr) | |||
| ->info(); | |||
| } | |||
// Read-only accessor for sm_algo_pack.
| static const AlgoPack& algo_pack() { return sm_algo_pack; } | |||
| bool is_fallback_non_gi_algo(Algorithm* algo) { | |||
| return strcmp(algo->name(), "FALLBACK_NOT_GI_POOLING") == 0; | |||
| } | |||
| }; | |||
| } // namespace fallback | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -103,7 +103,9 @@ public: | |||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }; | |||
| const char* name() const override { return m_algo_name.c_str(); } | |||
| bool is_available(const SizeArgs&) const override { return true; } | |||
| void exec(const ExecArgs&) const override {} | |||
| void exec(const ExecArgs&) const override { | |||
| megdnn_assert(false, "code issue happened!!"); | |||
| } | |||
| MEGDNN_DECL_ALGO_TYPE(X86_Fallback) | |||
| }; | |||
| @@ -3161,6 +3161,44 @@ TEST_F(FALLBACK, GiGetHighFloat32) { | |||
| ASSERT_EQ(*(r + 1), s0[3]); | |||
| } | |||
| TEST_F(FALLBACK, GiPaddFloat32) { | |||
| float32x2_t src0, src1, ret; | |||
| std::vector<float> s0{1.1f, -3.1415f}; | |||
| std::vector<float> s1{2.3f, 3.14777f}; | |||
| memcpy(&src0, s0.data(), sizeof(float32x2_t)); | |||
| memcpy(&src1, s1.data(), sizeof(float32x2_t)); | |||
| ret = GiPaddFloat32(src0, src1); | |||
| std::vector<float> naive; | |||
| naive.push_back(s0[0] + s0[1]); | |||
| naive.push_back(s1[0] + s1[1]); | |||
| auto r = (float*)&ret; | |||
| ASSERT_LT(std::abs(naive[0] - r[0]), 1e-3); | |||
| ASSERT_LT(std::abs(naive[1] - r[1]), 1e-3); | |||
| } | |||
| TEST_F(FALLBACK, GiPmaxFloat32) { | |||
| float32x2_t src0, src1, ret; | |||
| std::vector<float> s0{1.1f, -3.1415f}; | |||
| std::vector<float> s1{2.3f, 3.14777f}; | |||
| memcpy(&src0, s0.data(), sizeof(float32x2_t)); | |||
| memcpy(&src1, s1.data(), sizeof(float32x2_t)); | |||
| ret = GiPmaxFloat32(src0, src1); | |||
| std::vector<float> naive; | |||
| auto t0 = MAX_NAN(s0[0], s0[1]); | |||
| auto t1 = MAX_NAN(s1[0], s1[1]); | |||
| naive.push_back(t0); | |||
| naive.push_back(t1); | |||
| auto r = (float*)&ret; | |||
| ASSERT_LT(std::abs(naive[0] - r[0]), 1e-3); | |||
| ASSERT_LT(std::abs(naive[1] - r[1]), 1e-3); | |||
| } | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| @@ -0,0 +1,560 @@ | |||
| /** | |||
| * \file dnn/test/fallback/pooling.cpp | |||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| * | |||
| * Copyright (c) 2014-2022 Megvii Inc. All rights reserved. | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, | |||
| * software distributed under the License is distributed on an | |||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| */ | |||
| #include "test/fallback/fixture.h" | |||
| #include "test/common/benchmarker.h" | |||
| #include "test/common/checker.h" | |||
| #include "test/common/pooling.h" | |||
| #include "test/common/rng.h" | |||
| #include "test/common/task_record_check.h" | |||
| namespace megdnn { | |||
| namespace test { | |||
| namespace { | |||
| std::vector<std::pair<param::Pooling, TensorShapeArray>> get_nchw44_pool_args( | |||
| size_t filter, size_t stride) { | |||
| constexpr size_t ic_step = 4; | |||
| std::vector<std::pair<param::Pooling, TensorShapeArray>> args; | |||
| for (size_t n : {1, 2}) | |||
| for (size_t c : {4, 8}) | |||
| for (size_t ih : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}) | |||
| for (size_t iw : {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}) | |||
| for (size_t ph : {0, 1, 2}) | |||
| for (size_t pw : {0, 1, 2}) | |||
| for (auto mode : | |||
| {param::Pooling::Mode::MAX, | |||
| param::Pooling::Mode::AVERAGE}) | |||
| if (ih + 2 * ph >= filter && iw + 2 * pw >= filter && | |||
| filter > ph && filter > pw) { | |||
| param::Pooling param; | |||
| param.mode = mode; | |||
| param.format = param::Pooling::Format::NCHW44; | |||
| param.pad_h = ph; | |||
| param.pad_w = pw; | |||
| param.stride_h = param.stride_w = stride; | |||
| param.window_h = param.window_w = filter; | |||
| args.emplace_back(std::make_pair( | |||
| param, | |||
| TensorShapeArray{ | |||
| {n, c / ic_step, ih, iw, ic_step}, | |||
| {}})); | |||
| } | |||
| return args; | |||
| } | |||
| void run_pooling_check( | |||
| Handle* handle, std::vector<std::pair<param::Pooling, TensorShapeArray>> args, | |||
| bool is_int8) { | |||
| Checker<Pooling> checker(handle); | |||
| UniformIntRNG rng_int8{INT8_MIN >> 1, INT8_MAX >> 1}; | |||
| UniformIntRNG rng_fp32{-10, 10}; | |||
| if (is_int8) { | |||
| checker.set_dtype(0, dtype::QuantizedS8(1.1f)); | |||
| checker.set_rng(0, &rng_int8); | |||
| } else { | |||
| checker.set_rng(0, &rng_fp32); | |||
| } | |||
| for (auto arg : args) { | |||
| checker.set_param(arg.first).exec(arg.second); | |||
| } | |||
| } | |||
| } // namespace | |||
| TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_NCHW44_FP32) { | |||
| for (auto filter : {2, 3, 4, 5}) | |||
| for (auto stride : {1, 2}) { | |||
| run_pooling_check(handle(), get_nchw44_pool_args(filter, stride), false); | |||
| } | |||
| } | |||
| TEST_F(FALLBACK, POOLING_GI) { | |||
| using Param = param::Pooling; | |||
| // clang-format off | |||
| for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t p: {1, 2}) | |||
| { | |||
| Param param; | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| Checker<Pooling> checker(handle()); | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::AVERAGE; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 4; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 5; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| if (ih + p * 2 >= 5 && iw + p * 2 >= 5) | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t p: {1, 2}) | |||
| { | |||
| Param param; | |||
| param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 1; | |||
| param.pad_h = param.pad_w = p; | |||
| Checker<Pooling> checker(handle()); | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| // clang-format on | |||
| } | |||
| TEST_F(FALLBACK, POOLING_GI_RECORD) { | |||
| using Param = param::Pooling; | |||
| TaskRecordChecker<Pooling> checker(0); | |||
| // clang-format off | |||
| for (size_t ih: {2, 3, 5, 7, 11, 13, 17}) | |||
| for (size_t iw: {2, 3, 5, 7, 11, 13, 17}) | |||
| for (size_t p: {1, 2}) | |||
| { | |||
| Param param; | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::AVERAGE; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 4; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 5; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| if (ih + p * 2 >= 5 && iw + p * 2 >= 5) | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t p: {1, 2}) | |||
| { | |||
| Param param; | |||
| param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 1; | |||
| param.pad_h = param.pad_w = p; | |||
| Checker<Pooling> checker(handle()); | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| // clang-format on | |||
| } | |||
| TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_RECORD) { | |||
| using Param = param::Pooling; | |||
| TaskRecordChecker<Pooling> checker(0); | |||
| for (size_t ih : {2, 3, 5, 7, 11, 13, 17}) | |||
| for (size_t iw : {2, 3, 5, 7, 11, 13, 17}) | |||
| for (size_t p : {1, 2}) { | |||
| Param param; | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::AVERAGE; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 4; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 5; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| if (ih + p * 2 >= 5 && iw + p * 2 >= 5) | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| } | |||
| TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_W9_w13_NCHW44) { | |||
| UniformIntRNG rng{-10, 10}; | |||
| Checker<Pooling> checker(handle()); | |||
| checker.set_rng(0, &rng); | |||
| // clang-format off | |||
| for (size_t ih: {20, 15}) | |||
| for (size_t iw: {15, 20}) | |||
| for (size_t kernel: {9, 13}) | |||
| for (size_t pad: {4, 6}) | |||
| for(auto mode: {param::Pooling::Mode::MAX, param::Pooling::Mode::AVERAGE}) | |||
| if (kernel > pad) | |||
| { | |||
| param::Pooling param; | |||
| param.mode = mode; | |||
| param.format = param::Pooling::Format::NCHW44; | |||
| param.pad_h = pad; | |||
| param.pad_w = pad; | |||
| param.stride_h = param.stride_w = 1; | |||
| param.window_h = param.window_w = kernel ; | |||
| checker.set_param(param).exec(TensorShapeArray{{2, 8, ih, iw, 4}, {}}); | |||
| } | |||
| // clang-format on | |||
| } | |||
| TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI_FALLBACK) { | |||
| using Param = param::Pooling; | |||
| for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t p : {1, 2}) { | |||
| Param param; | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| Checker<Pooling> checker(handle()); | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| } | |||
| TEST_F(FALLBACK_MULTI_THREADS, POOLING_GI) { | |||
| using Param = param::Pooling; | |||
| for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30}) | |||
| for (size_t p : {1, 2}) { | |||
| Param param; | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| Checker<Pooling> checker(handle()); | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::AVERAGE; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 4; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| param.mode = Param::Mode::MAX; | |||
| param.window_h = param.window_w = 5; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = p; | |||
| if (ih + p * 2 >= 5 && iw + p * 2 >= 5) | |||
| checker.set_param(param).exec({{2, 3, ih, iw}, {}}); | |||
| } | |||
| } | |||
| #if MEGDNN_WITH_BENCHMARK | |||
| namespace { | |||
| void benchmark_nchw44_fp32(Handle* handle) { | |||
| using Param = param::Pooling; | |||
| auto run = [&](size_t n, size_t c, size_t h, size_t w, size_t filter, size_t stride, | |||
| size_t pad, Param::Mode mode) { | |||
| Param param; | |||
| param.window_h = param.window_w = filter; | |||
| param.stride_h = param.stride_w = stride; | |||
| param.pad_h = param.pad_w = pad; | |||
| param.format = Param::Format::NCHW; | |||
| param.mode = mode; | |||
| TensorShape nchw_shape = {n, c, h, w}; | |||
| TensorShape nchw44_shape = {n, c / 4, h, w, 4}; | |||
| TensorLayout dst_layout; | |||
| auto opr = handle->create_operator<Pooling>(); | |||
| opr->param() = param; | |||
| opr->deduce_layout({nchw_shape, dtype::Float32()}, dst_layout); | |||
| float calc_amount = | |||
| dst_layout.total_nr_elems() * param.window_h * param.window_w; | |||
| Benchmarker<Pooling> benchmarker_float_nchw(handle); | |||
| Benchmarker<Pooling> benchmarker_float_nchw44(handle); | |||
| Benchmarker<Pooling> benchmarker_int_nchw44(handle); | |||
| size_t RUN = 500; | |||
| auto t1 = benchmarker_float_nchw.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec({nchw_shape, {}}); | |||
| param.format = Param::Format::NCHW44; | |||
| auto t2 = benchmarker_int_nchw44.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .execl({{nchw44_shape, dtype::QuantizedS8(1.0)}, | |||
| {{}, dtype::QuantizedS8(1.0)}}); | |||
| auto t3 = benchmarker_float_nchw44.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec({nchw44_shape, {}}); | |||
| printf("{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu\n" | |||
| "nchw_fp32={%.3f ms, %.3f Mflops}, " | |||
| "nchw44_int={%.3f ms, %.3f Mflops}, " | |||
| "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n", | |||
| n, c, h, w, filter, stride, pad, t1 / RUN, | |||
| calc_amount / (t1 / RUN * 1000), t2 / RUN, | |||
| calc_amount / (t2 / RUN * 1000), t3 / RUN, | |||
| calc_amount / (t3 / RUN * 1000), t1 / t3); | |||
| }; | |||
| // Resnet50 | |||
| run(1, 64, 112, 112, 3, 2, 1, param::Pooling::Mode::MAX); | |||
| run(1, 2048, 7, 7, 7, 1, 0, param::Pooling::Mode::AVERAGE); | |||
| // VGG16 | |||
| run(1, 64, 224, 224, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 128, 112, 112, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 256, 56, 56, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 512, 28, 28, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| run(1, 512, 14, 14, 2, 2, 0, param::Pooling::Mode::MAX); | |||
| } | |||
| } // namespace | |||
| TEST_F(FALLBACK, BENCHMARK_POOLING_GI_NCHW44_FP32) { | |||
| benchmark_nchw44_fp32(handle()); | |||
| } | |||
| TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI_NCHW44_FP32) { | |||
| benchmark_nchw44_fp32(handle()); | |||
| } | |||
| TEST_F(FALLBACK, BENCHMARK_POOLING_GI_W4x4_S2x2) { | |||
| using Param = param::Pooling; | |||
| auto run = [&](const TensorShapeArray& shapes, Param param) { | |||
| std::cout << "N:" << shapes[0][0] << " " | |||
| << "IC:" << shapes[0][1] << " " | |||
| << "IH:" << shapes[0][2] << " " | |||
| << "IW:" << shapes[0][3] << std::endl; | |||
| auto handle_naive = create_cpu_handle(2); | |||
| Benchmarker<Pooling> benchmarker_naive(handle_naive.get()); | |||
| Benchmarker<Pooling> benchmarker_float(handle()); | |||
| size_t RUN = 10; | |||
| auto t1 = benchmarker_naive.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec(shapes); | |||
| auto t2 = benchmarker_float.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec(shapes); | |||
| TensorLayout dst_layout; | |||
| auto opr = handle()->create_operator<Pooling>(); | |||
| opr->param() = param; | |||
| opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout); | |||
| float calc_amount = | |||
| dst_layout.total_nr_elems() * param.window_h * param.window_w; | |||
| printf("naive={%.3fms, %.3fMflops}, neon={%.3fms, %.3fMflops}\n", t1 / RUN, | |||
| calc_amount / (t1 / RUN * 1000), t2 / RUN, | |||
| calc_amount / (t2 / RUN * 1000)); | |||
| }; | |||
| Param param; | |||
| param.window_h = param.window_w = 4; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = 1; | |||
| std::cout << "4x4 with 2x2 stride max pooling:" << std::endl; | |||
| run({{1, 24, 160, 128}, {}}, param); | |||
| run({{1, 4, 240, 135}, {}}, param); | |||
| run({{1, 32, 120, 67}, {}}, param); | |||
| run({{1, 64, 60, 33}, {}}, param); | |||
| } | |||
| TEST_F(FALLBACK, BENCHMARK_POOLING_GI_W5x5_S2x2) { | |||
| using Param = param::Pooling; | |||
| auto run = [&](const TensorShapeArray& shapes, Param param) { | |||
| std::cout << "N:" << shapes[0][0] << " " | |||
| << "IC:" << shapes[0][1] << " " | |||
| << "IH:" << shapes[0][2] << " " | |||
| << "IW:" << shapes[0][3] << std::endl; | |||
| auto handle_naive = create_cpu_handle(2); | |||
| Benchmarker<Pooling> benchmarker_naive(handle_naive.get()); | |||
| Benchmarker<Pooling> benchmarker_float(handle()); | |||
| size_t RUN = 10; | |||
| auto t1 = benchmarker_naive.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec(shapes); | |||
| auto t2 = benchmarker_float.set_display(false) | |||
| .set_times(RUN) | |||
| .set_param(param) | |||
| .exec(shapes); | |||
| TensorLayout dst_layout; | |||
| auto opr = handle()->create_operator<Pooling>(); | |||
| opr->param() = param; | |||
| opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout); | |||
| float calc_amount = | |||
| dst_layout.total_nr_elems() * param.window_h * param.window_w; | |||
| printf("naive={%.3fms, %.3fMflops}, neon={%.3fms, %.3fMflops}\n", t1 / RUN, | |||
| calc_amount / (t1 / RUN * 1000), t2 / RUN, | |||
| calc_amount / (t2 / RUN * 1000)); | |||
| }; | |||
| Param param; | |||
| param.window_h = param.window_w = 5; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = 1; | |||
| std::cout << "5x5 with 2x2 stride max pooling:" << std::endl; | |||
| run({{1, 24, 160, 128}, {}}, param); | |||
| run({{1, 4, 240, 135}, {}}, param); | |||
| run({{1, 32, 120, 67}, {}}, param); | |||
| run({{1, 64, 60, 33}, {}}, param); | |||
| } | |||
| namespace { | |||
| template <typename Opr> | |||
| void benchmark_impl( | |||
| const typename Opr::Param& param, std::vector<SmallVector<TensorShape>> shapes, | |||
| size_t RUNS, TaskExecutorConfig&& multi_thread_config, | |||
| TaskExecutorConfig&& single_thread_config, DType data_type) { | |||
| std::vector<float> multi_thread_times, single_thread_times; | |||
| { | |||
| auto multi_thread_hanle = create_cpu_handle(0, true, &multi_thread_config); | |||
| auto benchmarker = Benchmarker<Opr>(multi_thread_hanle.get()); | |||
| benchmarker.set_times(RUNS).set_display(false).set_param(param); | |||
| benchmarker.set_dtype(0, data_type); | |||
| for (auto shape : shapes) { | |||
| multi_thread_times.push_back(benchmarker.exec(shape) / RUNS); | |||
| } | |||
| } | |||
| { | |||
| auto single_thread_handle = create_cpu_handle(0, true, &single_thread_config); | |||
| auto benchmarker = Benchmarker<Opr>(single_thread_handle.get()); | |||
| benchmarker.set_times(RUNS).set_display(false).set_param(param); | |||
| benchmarker.set_dtype(0, data_type); | |||
| for (auto shape : shapes) { | |||
| single_thread_times.push_back(benchmarker.exec(shape) / RUNS); | |||
| } | |||
| } | |||
| printf("Benchmark : Multi threads %zu, ", multi_thread_config.nr_thread); | |||
| printf("core_ids:"); | |||
| for (size_t i = 0; i < multi_thread_config.affinity_core_set.size(); i++) { | |||
| printf("%zu ", multi_thread_config.affinity_core_set[i]); | |||
| } | |||
| printf(", Single thread core_id %zu\n", single_thread_config.affinity_core_set[0]); | |||
| for (size_t i = 0; i < shapes.size(); i++) { | |||
| auto shape = shapes[i]; | |||
| printf("Case: "); | |||
| for (auto sh : shape) | |||
| printf("%s ", sh.to_string().c_str()); | |||
| printf("%zu threads time: %f,\n single thread time: " | |||
| "%f. spead up = %f, speedup/cores=%f\n", | |||
| multi_thread_config.nr_thread, multi_thread_times[i], | |||
| single_thread_times[i], single_thread_times[i] / multi_thread_times[i], | |||
| single_thread_times[i] / multi_thread_times[i] / | |||
| multi_thread_config.nr_thread); | |||
| } | |||
| } | |||
| } // namespace | |||
| TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI) { | |||
| constexpr size_t RUNS = 50; | |||
| using Param = param::Pooling; | |||
| Param param; | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = 1; | |||
| std::vector<SmallVector<TensorShape>> shapes; | |||
| shapes.push_back({{32, 32, 215, 215}, {}}); | |||
| shapes.push_back({{32, 32, 128, 128}, {}}); | |||
| shapes.push_back({{8, 256, 100, 100}, {}}); | |||
| shapes.push_back({{1, 256, 100, 100}, {}}); | |||
| shapes.push_back({{1, 32, 100, 100}, {}}); | |||
| shapes.push_back({{1, 256, 80, 80}, {}}); | |||
| shapes.push_back({{1, 256, 60, 60}, {}}); | |||
| shapes.push_back({{1, 256, 30, 30}, {}}); | |||
| param.window_h = param.window_w = 3; | |||
| param.stride_h = param.stride_w = 2; | |||
| param.pad_h = param.pad_w = 1; | |||
| printf("Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", param.window_h, | |||
| param.window_w, param.stride_h, static_cast<int>(param.mode)); | |||
| benchmark_impl<Pooling>( | |||
| param, shapes, RUNS, {4, {0, 1, 2, 3}}, {1, {0}}, dtype::Float32()); | |||
| benchmark_impl<Pooling>( | |||
| param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, dtype::Float32()); | |||
| benchmark_impl<Pooling>( | |||
| param, shapes, RUNS, {2, {0, 1}}, {1, {0}}, dtype::Float32()); | |||
| } | |||
| TEST_F(FALLBACK_MULTI_THREADS, BENCHMARK_POOLING_GI_NCHW44) { | |||
| constexpr size_t RUNS = 50; | |||
| using Param = param::Pooling; | |||
| Param param; | |||
| param.pad_h = param.pad_w = 0; | |||
| param.mode = Param::Mode::MAX; | |||
| std::vector<SmallVector<TensorShape>> shapes; | |||
| std::vector<std::vector<size_t>> filter_and_stride = { | |||
| {2, 1}, {2, 2}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, {5, 1}, {5, 2}}; | |||
| for (auto mode : {param::Pooling::Mode::MAX, param::Pooling::Mode::AVERAGE}) { | |||
| for (auto filter : filter_and_stride) { | |||
| shapes.push_back({{1, 32 * 4, 215, 215}, {}}); | |||
| shapes.push_back({{1, 32 * 4, 128, 128}, {}}); | |||
| shapes.push_back({{1, 16 * 4, 56, 56}, {}}); | |||
| param.mode = mode; | |||
| param.window_h = param.window_w = filter[0]; | |||
| param.stride_h = param.stride_w = filter[1]; | |||
| param.format = Param::Format::NCHW; | |||
| printf("NCHW Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", | |||
| param.window_h, param.window_h, param.stride_h, | |||
| static_cast<int>(param.mode)); | |||
| benchmark_impl<Pooling>( | |||
| param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, | |||
| dtype::QuantizedS8(1.1f)); | |||
| shapes.clear(); | |||
| shapes.push_back({{1, 32, 215, 215, 4}, {}}); | |||
| shapes.push_back({{1, 32, 128, 128, 4}, {}}); | |||
| shapes.push_back({{1, 16, 56, 56, 4}, {}}); | |||
| param.format = Param::Format::NCHW44; | |||
| printf("NCHW44 Benchmark POOLING kernel:%d*%d stride:%d,mode %d\n", | |||
| param.window_h, param.window_w, param.stride_h, | |||
| static_cast<int>(param.mode)); | |||
| benchmark_impl<Pooling>( | |||
| param, shapes, RUNS, {4, {4, 5, 6, 7}}, {1, {4}}, | |||
| dtype::QuantizedS8(1.1f)); | |||
| shapes.clear(); | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| } // namespace test | |||
| } // namespace megdnn | |||
| // vim: syntax=cpp.doxygen | |||