From 05d7562a5df04979cbc61ef30217dfba6bdcd0a2 Mon Sep 17 00:00:00 2001 From: nihuini Date: Mon, 2 Jul 2018 18:54:41 +0800 Subject: [PATCH] reorder kernel weight, pipeline friendly ;) --- src/layer/arm/convolution_1x1.h | 933 ++++++++++++++++------------ src/layer/arm/convolution_3x3.h | 1001 +++++++++++++++++-------------- 2 files changed, 1089 insertions(+), 845 deletions(-) diff --git a/src/layer/arm/convolution_1x1.h b/src/layer/arm/convolution_1x1.h index a5b8cfbff..f0c974893 100644 --- a/src/layer/arm/convolution_1x1.h +++ b/src/layer/arm/convolution_1x1.h @@ -21,14 +21,14 @@ static void conv1x1s1_sgemm_transform_kernel_neon(const Mat& _kernel, Mat& kerne const float* kernel = _kernel; // interleave -#if __aarch64__ +#if __ARM_NEON && __aarch64__ kernel_tm.create(4*8, inch/4 + inch%4, outch/8 + (outch%8)/4 + outch%4); #else kernel_tm.create(4*4, inch/4 + inch%4, outch/4 + outch%4); -#endif // __aarch64__ +#endif // __ARM_NEON && __aarch64__ int p = 0; -#if __aarch64__ +#if __ARM_NEON && __aarch64__ for (; p+7> 3; remain_outch_start = nn_outch << 3; @@ -403,89 +273,89 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n" "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v18.4s, v8.4s, v1.s[0] \n" - "fmla v20.4s, v8.4s, v2.s[0] \n" - "fmla v22.4s, v8.4s, v3.s[0] \n" + "fmla v18.4s, v8.4s, v0.s[1] \n" + "fmla v20.4s, v8.4s, v0.s[2] \n" + "fmla v22.4s, v8.4s, v0.s[3] \n" "fmla v17.4s, v9.4s, v0.s[0] \n" - "fmla v19.4s, v9.4s, v1.s[0] \n" - "fmla v21.4s, v9.4s, v2.s[0] \n" - "fmla v23.4s, v9.4s, v3.s[0] \n" + "fmla v19.4s, v9.4s, v0.s[1] \n" + "fmla v21.4s, v9.4s, v0.s[2] \n" + "fmla v23.4s, v9.4s, v0.s[3] \n" + + "fmla v24.4s, v8.4s, v1.s[0] \n" + "fmla v26.4s, v8.4s, v1.s[1] \n" + "fmla v28.4s, v8.4s, v1.s[2] \n" + "fmla v30.4s, v8.4s, v1.s[3] \n" + + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v27.4s, v9.4s, v1.s[1] \n" + "fmla v29.4s, v9.4s, v1.s[2] \n" + "fmla v31.4s, v9.4s, v1.s[3] \n" "prfm pldl1keep, [%8, #512] \n" "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%8], #64 \n" - "fmla v16.4s, v10.4s, v0.s[1] \n" - "fmla v18.4s, v10.4s, v1.s[1] \n" - "fmla v20.4s, v10.4s, v2.s[1] \n" - "fmla v22.4s, v10.4s, v3.s[1] \n" + "fmla v16.4s, v10.4s, v2.s[0] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v20.4s, v10.4s, v2.s[2] \n" + "fmla v22.4s, v10.4s, v2.s[3] \n" - "fmla v17.4s, v11.4s, v0.s[1] \n" - "fmla v19.4s, v11.4s, v1.s[1] \n" - "fmla v21.4s, v11.4s, v2.s[1] \n" - "fmla v23.4s, v11.4s, v3.s[1] \n" + "fmla v17.4s, v11.4s, v2.s[0] \n" + "fmla v19.4s, v11.4s, v2.s[1] \n" + "fmla v21.4s, v11.4s, v2.s[2] \n" + "fmla v23.4s, v11.4s, v2.s[3] \n" - "fmla v16.4s, v12.4s, v0.s[2] \n" - "fmla v18.4s, v12.4s, v1.s[2] \n" - "fmla v20.4s, v12.4s, v2.s[2] \n" - "fmla v22.4s, v12.4s, v3.s[2] \n" + "fmla v24.4s, v10.4s, v3.s[0] \n" + "fmla v26.4s, v10.4s, v3.s[1] \n" + "fmla v28.4s, v10.4s, v3.s[2] \n" + "fmla v30.4s, v10.4s, v3.s[3] \n" - "fmla v17.4s, v13.4s, v0.s[2] \n" - "fmla v19.4s, v13.4s, v1.s[2] \n" - "fmla v21.4s, v13.4s, v2.s[2] \n" - "fmla v23.4s, v13.4s, v3.s[2] \n" + "fmla v25.4s, v11.4s, v3.s[0] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v29.4s, v11.4s, v3.s[2] \n" + "fmla v31.4s, v11.4s, v3.s[3] \n" "prfm pldl1keep, [%9, #512] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n" - "fmla v16.4s, v14.4s, v0.s[3] \n" - "fmla v18.4s, v14.4s, v1.s[3] \n" - "fmla v20.4s, v14.4s, v2.s[3] \n" - "fmla v22.4s, v14.4s, v3.s[3] \n" - - "fmla v17.4s, v15.4s, v0.s[3] \n" - "fmla v19.4s, v15.4s, v1.s[3] \n" - "fmla v21.4s, v15.4s, v2.s[3] \n" - "fmla v23.4s, v15.4s, v3.s[3] \n" - - "fmla v24.4s, v8.4s, v4.s[0] \n" - "fmla v26.4s, v8.4s, v5.s[0] \n" - "fmla v28.4s, v8.4s, v6.s[0] \n" - "fmla v30.4s, v8.4s, v7.s[0] \n" - - "fmla v25.4s, v9.4s, v4.s[0] \n" - "fmla v27.4s, v9.4s, v5.s[0] \n" - "fmla v29.4s, v9.4s, v6.s[0] \n" - "fmla v31.4s, v9.4s, v7.s[0] \n" - - "fmla v24.4s, v10.4s, v4.s[1] \n" - "fmla v26.4s, v10.4s, v5.s[1] \n" - "fmla v28.4s, v10.4s, v6.s[1] \n" - "fmla v30.4s, v10.4s, v7.s[1] \n" - - "fmla v25.4s, v11.4s, v4.s[1] \n" - "fmla v27.4s, v11.4s, v5.s[1] \n" - "fmla v29.4s, v11.4s, v6.s[1] \n" - "fmla v31.4s, v11.4s, v7.s[1] \n" - - "fmla v24.4s, v12.4s, v4.s[2] \n" - "fmla v26.4s, v12.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v6.s[2] \n" - "fmla v30.4s, v12.4s, v7.s[2] \n" - - "fmla v25.4s, v13.4s, v4.s[2] \n" - "fmla v27.4s, v13.4s, v5.s[2] \n" - "fmla v29.4s, v13.4s, v6.s[2] \n" - "fmla v31.4s, v13.4s, v7.s[2] \n" - - "fmla v24.4s, v14.4s, v4.s[3] \n" - "fmla v26.4s, v14.4s, v5.s[3] \n" - "fmla v28.4s, v14.4s, v6.s[3] \n" + "fmla v16.4s, v12.4s, v4.s[0] \n" + "fmla v18.4s, v12.4s, v4.s[1] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v22.4s, v12.4s, v4.s[3] \n" + + "fmla v17.4s, v13.4s, v4.s[0] \n" + "fmla v19.4s, v13.4s, v4.s[1] \n" + "fmla v21.4s, v13.4s, v4.s[2] \n" + "fmla v23.4s, v13.4s, v4.s[3] \n" + + "fmla v24.4s, v12.4s, v5.s[0] \n" + "fmla v26.4s, v12.4s, v5.s[1] \n" + "fmla v28.4s, v12.4s, v5.s[2] \n" + "fmla v30.4s, v12.4s, v5.s[3] \n" + + "fmla v25.4s, v13.4s, v5.s[0] \n" + "fmla v27.4s, v13.4s, v5.s[1] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v31.4s, v13.4s, v5.s[3] \n" + + "fmla v16.4s, v14.4s, v6.s[0] \n" + "fmla v18.4s, v14.4s, v6.s[1] \n" + "fmla v20.4s, v14.4s, v6.s[2] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + + "fmla v17.4s, v15.4s, v6.s[0] \n" + "fmla v19.4s, v15.4s, v6.s[1] \n" + "fmla v21.4s, v15.4s, v6.s[2] \n" + "fmla v23.4s, v15.4s, v6.s[3] \n" + + "fmla v24.4s, v14.4s, v7.s[0] \n" + "fmla v26.4s, v14.4s, v7.s[1] \n" + "fmla v28.4s, v14.4s, v7.s[2] \n" "fmla v30.4s, v14.4s, v7.s[3] \n" - "fmla v25.4s, v15.4s, v4.s[3] \n" - "fmla v27.4s, v15.4s, v5.s[3] \n" - "fmla v29.4s, v15.4s, v6.s[3] \n" + "fmla v25.4s, v15.4s, v7.s[0] \n" + "fmla v27.4s, v15.4s, v7.s[1] \n" + "fmla v29.4s, v15.4s, v7.s[2] \n" "fmla v31.4s, v15.4s, v7.s[3] \n" "subs w4, w4, #1 \n" @@ -599,39 +469,39 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n" "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v17.4s, v8.4s, v1.s[0] \n" - "fmla v18.4s, v8.4s, v2.s[0] \n" - "fmla v19.4s, v8.4s, v3.s[0] \n" - "fmla v20.4s, v8.4s, v4.s[0] \n" - "fmla v21.4s, v8.4s, v5.s[0] \n" - "fmla v22.4s, v8.4s, v6.s[0] \n" - "fmla v23.4s, v8.4s, v7.s[0] \n" - - "fmla v16.4s, v9.4s, v0.s[1] \n" - "fmla v17.4s, v9.4s, v1.s[1] \n" - "fmla v18.4s, v9.4s, v2.s[1] \n" - "fmla v19.4s, v9.4s, v3.s[1] \n" - "fmla v20.4s, v9.4s, v4.s[1] \n" - "fmla v21.4s, v9.4s, v5.s[1] \n" - "fmla v22.4s, v9.4s, v6.s[1] \n" - "fmla v23.4s, v9.4s, v7.s[1] \n" - - "fmla v16.4s, v10.4s, v0.s[2] \n" - "fmla v17.4s, v10.4s, v1.s[2] \n" - "fmla v18.4s, v10.4s, v2.s[2] \n" - "fmla v19.4s, v10.4s, v3.s[2] \n" - "fmla v20.4s, v10.4s, v4.s[2] \n" - "fmla v21.4s, v10.4s, v5.s[2] \n" - "fmla v22.4s, v10.4s, v6.s[2] \n" - "fmla v23.4s, v10.4s, v7.s[2] \n" - - "fmla v16.4s, v11.4s, v0.s[3] \n" - "fmla v17.4s, v11.4s, v1.s[3] \n" - "fmla v18.4s, v11.4s, v2.s[3] \n" - "fmla v19.4s, v11.4s, v3.s[3] \n" - "fmla v20.4s, v11.4s, v4.s[3] \n" - "fmla v21.4s, v11.4s, v5.s[3] \n" - "fmla v22.4s, v11.4s, v6.s[3] \n" + "fmla v17.4s, v8.4s, v0.s[1] \n" + "fmla v18.4s, v8.4s, v0.s[2] \n" + "fmla v19.4s, v8.4s, v0.s[3] \n" + "fmla v20.4s, v8.4s, v1.s[0] \n" + "fmla v21.4s, v8.4s, v1.s[1] \n" + "fmla v22.4s, v8.4s, v1.s[2] \n" + "fmla v23.4s, v8.4s, v1.s[3] \n" + + "fmla v16.4s, v9.4s, v2.s[0] \n" + "fmla v17.4s, v9.4s, v2.s[1] \n" + "fmla v18.4s, v9.4s, v2.s[2] \n" + "fmla v19.4s, v9.4s, v2.s[3] \n" + "fmla v20.4s, v9.4s, v3.s[0] \n" + "fmla v21.4s, v9.4s, v3.s[1] \n" + "fmla v22.4s, v9.4s, v3.s[2] \n" + "fmla v23.4s, v9.4s, v3.s[3] \n" + + "fmla v16.4s, v10.4s, v4.s[0] \n" + "fmla v17.4s, v10.4s, v4.s[1] \n" + "fmla v18.4s, v10.4s, v4.s[2] \n" + "fmla v19.4s, v10.4s, v4.s[3] \n" + "fmla v20.4s, v10.4s, v5.s[0] \n" + "fmla v21.4s, v10.4s, v5.s[1] \n" + "fmla v22.4s, v10.4s, v5.s[2] \n" + "fmla v23.4s, v10.4s, v5.s[3] \n" + + "fmla v16.4s, v11.4s, v6.s[0] \n" + "fmla v17.4s, v11.4s, v6.s[1] \n" + "fmla v18.4s, v11.4s, v6.s[2] \n" + "fmla v19.4s, v11.4s, v6.s[3] \n" + "fmla v20.4s, v11.4s, v7.s[0] \n" + "fmla v21.4s, v11.4s, v7.s[1] \n" + "fmla v22.4s, v11.4s, v7.s[2] \n" "fmla v23.4s, v11.4s, v7.s[3] \n" "subs w4, w4, #1 \n" @@ -734,25 +604,24 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma "prfm pldl1keep, [%9, #512] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n" - "fmla v16.4s, v8.4s, v0.4s \n" - "fmla v17.4s, v8.4s, v1.4s \n" - "fmla v18.4s, v8.4s, v2.4s \n" - "fmla v19.4s, v8.4s, v3.4s \n" - "fmla v20.4s, v8.4s, v4.4s \n" - "fmla v21.4s, v8.4s, v5.4s \n" - "fmla v22.4s, v8.4s, v6.4s \n" - "fmla v23.4s, v8.4s, v7.4s \n" + "fmla v16.4s, v0.4s, v8.s[0] \n" + "fmla v17.4s, v1.4s, v8.s[0] \n" + "fmla v18.4s, v2.4s, v8.s[1] \n" + "fmla v19.4s, v3.4s, v8.s[1] \n" + "fmla v20.4s, v4.4s, v8.s[2] \n" + "fmla v21.4s, v5.4s, v8.s[2] \n" + "fmla v22.4s, v6.4s, v8.s[3] \n" + "fmla v23.4s, v7.4s, v8.s[3] \n" "subs w4, w4, #1 \n" "bne 0b \n" - "faddp v0.4s, v16.4s, v17.4s \n" - "faddp v1.4s, v18.4s, v19.4s \n" - "faddp v2.4s, v20.4s, v21.4s \n" - "faddp v3.4s, v22.4s, v23.4s \n" - "faddp v16.4s, v0.4s, v1.4s \n" - "faddp v17.4s, v2.4s, v3.4s \n" - + "fadd v16.4s, v16.4s, v18.4s \n" + "fadd v17.4s, v17.4s, v19.4s \n" + "fadd v20.4s, v20.4s, v22.4s \n" + "fadd v21.4s, v21.4s, v23.4s \n" + "fadd v16.4s, v16.4s, v20.4s \n" + "fadd v17.4s, v17.4s, v21.4s \n" "fadd v24.4s, v24.4s, v16.4s \n" "fadd v25.4s, v25.4s, v17.4s \n" @@ -814,7 +683,7 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma ); } } -#endif // __aarch64__ +#endif // __ARM_NEON && __aarch64__ nn_outch = (outch - remain_outch_start) >> 2; @@ -836,12 +705,13 @@ static void conv1x1s1_sgemm_neon(const Mat& bottom_blob, Mat& top_blob, const Ma for (; i+7> 3; remain_outch_start = nn_outch << 3; @@ -9383,82 +9217,81 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co "fmla v16.4s, v8.4s, v0.s[0] \n" "fmla v17.4s, v9.4s, v0.s[0] \n" - "fmla v18.4s, v8.4s, v1.s[0] \n" - "fmla v19.4s, v9.4s, v1.s[0] \n" + "fmla v18.4s, v8.4s, v0.s[1] \n" + "fmla v19.4s, v9.4s, v0.s[1] \n" + "fmla v20.4s, v8.4s, v0.s[2] \n" + "fmla v21.4s, v9.4s, v0.s[2] \n" + "fmla v22.4s, v8.4s, v0.s[3] \n" + "fmla v23.4s, v9.4s, v0.s[3] \n" "prfm pldl1keep, [%9, #512] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n" - "fmla v20.4s, v8.4s, v2.s[0] \n" - "fmla v21.4s, v9.4s, v2.s[0] \n" - "fmla v22.4s, v8.4s, v3.s[0] \n" - "fmla v23.4s, v9.4s, v3.s[0] \n" - - "fmla v24.4s, v8.4s, v4.s[0] \n" - "fmla v25.4s, v9.4s, v4.s[0] \n" - "fmla v26.4s, v8.4s, v5.s[0] \n" - "fmla v27.4s, v9.4s, v5.s[0] \n" - "fmla v28.4s, v8.4s, v6.s[0] \n" - "fmla v29.4s, v9.4s, v6.s[0] \n" - "fmla v30.4s, v8.4s, v7.s[0] \n" - "fmla v31.4s, v9.4s, v7.s[0] \n" - - "fmla v16.4s, v10.4s, v0.s[1] \n" - "fmla v17.4s, v11.4s, v0.s[1] \n" - "fmla v18.4s, v10.4s, v1.s[1] \n" - "fmla v19.4s, v11.4s, v1.s[1] \n" - "fmla v20.4s, v10.4s, v2.s[1] \n" - "fmla v21.4s, v11.4s, v2.s[1] \n" - "fmla v22.4s, v10.4s, v3.s[1] \n" - "fmla v23.4s, v11.4s, v3.s[1] \n" + "fmla v24.4s, v8.4s, v1.s[0] \n" + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v26.4s, v8.4s, v1.s[1] \n" + "fmla v27.4s, v9.4s, v1.s[1] \n" + "fmla v28.4s, v8.4s, v1.s[2] \n" + "fmla v29.4s, v9.4s, v1.s[2] \n" + "fmla v30.4s, v8.4s, v1.s[3] \n" + "fmla v31.4s, v9.4s, v1.s[3] \n" + + "fmla v16.4s, v10.4s, v2.s[0] \n" + "fmla v17.4s, v11.4s, v2.s[0] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v11.4s, v2.s[1] \n" + "fmla v20.4s, v10.4s, v2.s[2] \n" + "fmla v21.4s, v11.4s, v2.s[2] \n" + "fmla v22.4s, v10.4s, v2.s[3] \n" + "fmla v23.4s, v11.4s, v2.s[3] \n" "prfm pldl1keep, [%8, #512] \n" "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%8], #64 \n" - "fmla v24.4s, v10.4s, v4.s[1] \n" - "fmla v25.4s, v11.4s, v4.s[1] \n" - "fmla v26.4s, v10.4s, v5.s[1] \n" - "fmla v27.4s, v11.4s, v5.s[1] \n" - "fmla v28.4s, v10.4s, v6.s[1] \n" - "fmla v29.4s, v11.4s, v6.s[1] \n" - "fmla v30.4s, v10.4s, v7.s[1] \n" - "fmla v31.4s, v11.4s, v7.s[1] \n" - - "fmla v16.4s, v12.4s, v0.s[2] \n" - "fmla v17.4s, v13.4s, v0.s[2] \n" - "fmla v18.4s, v12.4s, v1.s[2] \n" - "fmla v19.4s, v13.4s, v1.s[2] \n" - "fmla v20.4s, v12.4s, v2.s[2] \n" - "fmla v21.4s, v13.4s, v2.s[2] \n" - "fmla v22.4s, v12.4s, v3.s[2] \n" - "fmla v23.4s, v13.4s, v3.s[2] \n" - - "fmla v24.4s, v12.4s, v4.s[2] \n" - "fmla v25.4s, v13.4s, v4.s[2] \n" - "fmla v26.4s, v12.4s, v5.s[2] \n" - "fmla v27.4s, v13.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v6.s[2] \n" - "fmla v29.4s, v13.4s, v6.s[2] \n" - "fmla v30.4s, v12.4s, v7.s[2] \n" - "fmla v31.4s, v13.4s, v7.s[2] \n" - - "fmla v16.4s, v14.4s, v0.s[3] \n" - "fmla v17.4s, v15.4s, v0.s[3] \n" - "fmla v18.4s, v14.4s, v1.s[3] \n" - "fmla v19.4s, v15.4s, v1.s[3] \n" - "fmla v20.4s, v14.4s, v2.s[3] \n" - "fmla v21.4s, v15.4s, v2.s[3] \n" - "fmla v22.4s, v14.4s, v3.s[3] \n" - "fmla v23.4s, v15.4s, v3.s[3] \n" + "fmla v24.4s, v10.4s, v3.s[0] \n" + "fmla v25.4s, v11.4s, v3.s[0] \n" + "fmla v26.4s, v10.4s, v3.s[1] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v28.4s, v10.4s, v3.s[2] \n" + "fmla v29.4s, v11.4s, v3.s[2] \n" + "fmla v30.4s, v10.4s, v3.s[3] \n" + "fmla v31.4s, v11.4s, v3.s[3] \n" + + "fmla v16.4s, v12.4s, v4.s[0] \n" + "fmla v17.4s, v13.4s, v4.s[0] \n" + "fmla v18.4s, v12.4s, v4.s[1] \n" + "fmla v19.4s, v13.4s, v4.s[1] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v21.4s, v13.4s, v4.s[2] \n" + "fmla v22.4s, v12.4s, v4.s[3] \n" + "fmla v23.4s, v13.4s, v4.s[3] \n" + + "fmla v24.4s, v12.4s, v5.s[0] \n" + "fmla v25.4s, v13.4s, v5.s[0] \n" + "fmla v26.4s, v12.4s, v5.s[1] \n" + "fmla v27.4s, v13.4s, v5.s[1] \n" + "fmla v28.4s, v12.4s, v5.s[2] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v30.4s, v12.4s, v5.s[3] \n" + "fmla v31.4s, v13.4s, v5.s[3] \n" + + "fmla v16.4s, v14.4s, v6.s[0] \n" + "fmla v17.4s, v15.4s, v6.s[0] \n" + "fmla v18.4s, v14.4s, v6.s[1] \n" + "fmla v19.4s, v15.4s, v6.s[1] \n" + "fmla v20.4s, v14.4s, v6.s[2] \n" + "fmla v21.4s, v15.4s, v6.s[2] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + "fmla v23.4s, v15.4s, v6.s[3] \n" "subs w4, w4, #1 \n" - "fmla v24.4s, v14.4s, v4.s[3] \n" - "fmla v25.4s, v15.4s, v4.s[3] \n" - "fmla v26.4s, v14.4s, v5.s[3] \n" - "fmla v27.4s, v15.4s, v5.s[3] \n" - "fmla v28.4s, v14.4s, v6.s[3] \n" - "fmla v29.4s, v15.4s, v6.s[3] \n" + "fmla v24.4s, v14.4s, v7.s[0] \n" + "fmla v25.4s, v15.4s, v7.s[0] \n" + "fmla v26.4s, v14.4s, v7.s[1] \n" + "fmla v27.4s, v15.4s, v7.s[1] \n" + "fmla v28.4s, v14.4s, v7.s[2] \n" + "fmla v29.4s, v15.4s, v7.s[2] \n" "fmla v30.4s, v14.4s, v7.s[3] \n" "fmla v31.4s, v15.4s, v7.s[3] \n" @@ -9566,45 +9399,44 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n" "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v17.4s, v8.4s, v1.s[0] \n" - "fmla v18.4s, v8.4s, v2.s[0] \n" - "fmla v19.4s, v8.4s, v3.s[0] \n" + "fmla v17.4s, v8.4s, v0.s[1] \n" + "fmla v18.4s, v8.4s, v0.s[2] \n" + "fmla v19.4s, v8.4s, v0.s[3] \n" + "fmla v20.4s, v8.4s, v1.s[0] \n" + "fmla v21.4s, v8.4s, v1.s[1] \n" + "fmla v22.4s, v8.4s, v1.s[2] \n" + "fmla v23.4s, v8.4s, v1.s[3] \n" "prfm pldl1keep, [%9, #512] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n" - "fmla v20.4s, v8.4s, v4.s[0] \n" - "fmla v21.4s, v8.4s, v5.s[0] \n" - "fmla v22.4s, v8.4s, v6.s[0] \n" - "fmla v23.4s, v8.4s, v7.s[0] \n" - - "fmla v16.4s, v9.4s, v0.s[1] \n" - "fmla v17.4s, v9.4s, v1.s[1] \n" - "fmla v18.4s, v9.4s, v2.s[1] \n" - "fmla v19.4s, v9.4s, v3.s[1] \n" - "fmla v20.4s, v9.4s, v4.s[1] \n" - "fmla v21.4s, v9.4s, v5.s[1] \n" - "fmla v22.4s, v9.4s, v6.s[1] \n" - "fmla v23.4s, v9.4s, v7.s[1] \n" - - "fmla v16.4s, v10.4s, v0.s[2] \n" - "fmla v17.4s, v10.4s, v1.s[2] \n" - "fmla v18.4s, v10.4s, v2.s[2] \n" - "fmla v19.4s, v10.4s, v3.s[2] \n" - "fmla v20.4s, v10.4s, v4.s[2] \n" - "fmla v21.4s, v10.4s, v5.s[2] \n" - "fmla v22.4s, v10.4s, v6.s[2] \n" - "fmla v23.4s, v10.4s, v7.s[2] \n" + "fmla v16.4s, v9.4s, v2.s[0] \n" + "fmla v17.4s, v9.4s, v2.s[1] \n" + "fmla v18.4s, v9.4s, v2.s[2] \n" + "fmla v19.4s, v9.4s, v2.s[3] \n" + "fmla v20.4s, v9.4s, v3.s[0] \n" + "fmla v21.4s, v9.4s, v3.s[1] \n" + "fmla v22.4s, v9.4s, v3.s[2] \n" + "fmla v23.4s, v9.4s, v3.s[3] \n" + + "fmla v16.4s, v10.4s, v4.s[0] \n" + "fmla v17.4s, v10.4s, v4.s[1] \n" + "fmla v18.4s, v10.4s, v4.s[2] \n" + "fmla v19.4s, v10.4s, v4.s[3] \n" + "fmla v20.4s, v10.4s, v5.s[0] \n" + "fmla v21.4s, v10.4s, v5.s[1] \n" + "fmla v22.4s, v10.4s, v5.s[2] \n" + "fmla v23.4s, v10.4s, v5.s[3] \n" "subs w4, w4, #1 \n" - "fmla v16.4s, v11.4s, v0.s[3] \n" - "fmla v17.4s, v11.4s, v1.s[3] \n" - "fmla v18.4s, v11.4s, v2.s[3] \n" - "fmla v19.4s, v11.4s, v3.s[3] \n" - "fmla v20.4s, v11.4s, v4.s[3] \n" - "fmla v21.4s, v11.4s, v5.s[3] \n" - "fmla v22.4s, v11.4s, v6.s[3] \n" + "fmla v16.4s, v11.4s, v6.s[0] \n" + "fmla v17.4s, v11.4s, v6.s[1] \n" + "fmla v18.4s, v11.4s, v6.s[2] \n" + "fmla v19.4s, v11.4s, v6.s[3] \n" + "fmla v20.4s, v11.4s, v7.s[0] \n" + "fmla v21.4s, v11.4s, v7.s[1] \n" + "fmla v22.4s, v11.4s, v7.s[2] \n" "fmla v23.4s, v11.4s, v7.s[3] \n" "bne 0b \n" @@ -9679,14 +9511,8 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co const float* ktm0 = kernel_tm0.row(r); - float32x4_t _sum0 = vdupq_n_f32(0.f); - float32x4_t _sum1 = vdupq_n_f32(0.f); - float32x4_t _sum2 = vdupq_n_f32(0.f); - float32x4_t _sum3 = vdupq_n_f32(0.f); - float32x4_t _sum4 = vdupq_n_f32(0.f); - float32x4_t _sum5 = vdupq_n_f32(0.f); - float32x4_t _sum6 = vdupq_n_f32(0.f); - float32x4_t _sum7 = vdupq_n_f32(0.f); + float32x4_t _sum0123 = vdupq_n_f32(0.f); + float32x4_t _sum4567 = vdupq_n_f32(0.f); int q=0; for (; q+3