From ce74836e2a15af70b6ced8a7bf0b1c378dee4ef5 Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 31 May 2018 16:33:48 +0800 Subject: [PATCH] yet another winograd convolution implementation, unroll outch 8 tiles 4 inch 4, about 22% faster, more optimization may comes soon :> --- src/layer/arm/convolution_3x3.h | 2043 +++++++++++++++++++++++++++++++ 1 file changed, 2043 insertions(+) diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h index eabe980c2..0e9b47856 100644 --- a/src/layer/arm/convolution_3x3.h +++ b/src/layer/arm/convolution_3x3.h @@ -1533,6 +1533,323 @@ static void conv3x3s1_winograd64_transform_kernel_neon(const Mat& kernel, Mat& k kernel_tm = kernel_tm2; } +static void conv3x3s1_winograd64_transform_kernel_neon5(const Mat& kernel, Mat& kernel_tm, int inch, int outch) +{ + kernel_tm.create(8*8, inch, outch); + + const float ktm[8][3] = { + { 1.0f, 0.0f, 0.0f}, + {-2.0f/9, -2.0f/9, -2.0f/9}, + {-2.0f/9, 2.0f/9, -2.0f/9}, + {1.0f/90, 1.0f/45, 2.0f/45}, + {1.0f/90, -1.0f/45, 2.0f/45}, + {1.0f/45, 1.0f/90, 1.0f/180}, + {1.0f/45, -1.0f/90, 1.0f/180}, + { 0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for + for (int p = 0; p