From d0bcda70ca50bd8b7dcde0d3b478c3aa1d6edc04 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 21 Apr 2018 20:39:11 +0800 Subject: [PATCH] arm neon optimize for winograd input output transform, about 4%~22% faster --- src/layer/arm/convolution_3x3.h | 453 +++++++++++++++++++++++++++++--- 1 file changed, 414 insertions(+), 39 deletions(-) diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h index 9462f67d7..8fceaedb8 100644 --- a/src/layer/arm/convolution_3x3.h +++ b/src/layer/arm/convolution_3x3.h @@ -5320,6 +5320,15 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) +#if __ARM_NEON + const float coeff[8] = { + 0.25f, 0.5f, -1.25f, 2.f, + -2.5f, 4.f, 4.25f, 5.25f + }; + float32x4_t _coeff0 = vld1q_f32(coeff); + float32x4_t _coeff1 = vld1q_f32(coeff+4); +#endif // __ARM_NEON + #pragma omp parallel for for (int q = 0; q