From 2b20bf940cd2ea58c62ce6a035abcb4091a3ae27 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 20 Jun 2018 15:06:19 +0800 Subject: [PATCH] drop armv7 vaddvq_f32 hack --- src/layer/arm/convolution_3x3.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h index d6e0a4a88..da0cbbaf8 100644 --- a/src/layer/arm/convolution_3x3.h +++ b/src/layer/arm/convolution_3x3.h @@ -8325,16 +8325,6 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w); } -#if !__aarch64__ -// TODO drop it -static inline float vaddvq_f32(float32x4_t _v) -{ - float32x2_t _ss = vadd_f32(vget_low_f32(_v), vget_high_f32(_v)); - float32x2_t _ss2 = vpadd_f32(_ss, _ss); - return vget_lane_f32(_ss2, 0); -} -#endif // !__aarch64__ - static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias) { int w = bottom_blob.w; @@ -10268,10 +10258,21 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co } // TODO transpose and acc +#if __aarch64__ float sum0 = vaddvq_f32(_sum0); float sum1 = vaddvq_f32(_sum1); float sum2 = vaddvq_f32(_sum2); float sum3 = vaddvq_f32(_sum3); +#else + float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0)); + float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1)); + float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); + float32x2_t _ss3 = vadd_f32(vget_low_f32(_sum3), vget_high_f32(_sum3)); + float sum0 = vget_lane_f32(vpadd_f32(_ss0, _ss0), 0); + float sum1 = vget_lane_f32(vpadd_f32(_ss1, _ss1), 0); + float sum2 = vget_lane_f32(vpadd_f32(_ss2, _ss2), 0); + float sum3 = vget_lane_f32(vpadd_f32(_ss3, _ss3), 0); +#endif // __aarch64__ for (; q