From b5b486fbfa1bbd7d0722802de014871cce1448e1 Mon Sep 17 00:00:00 2001 From: nihuini Date: Mon, 10 Aug 2020 17:29:28 +0800 Subject: [PATCH] conv3x3s2 pack8 arm fp16sa neon assembly optimization --- src/layer/arm/convolution_3x3_pack8_fp16s.h | 979 ++++++++++++++++++++ src/layer/arm/convolution_arm.cpp | 9 + 2 files changed, 988 insertions(+) diff --git a/src/layer/arm/convolution_3x3_pack8_fp16s.h b/src/layer/arm/convolution_3x3_pack8_fp16s.h index 00c1581c5..7b78ef441 100644 --- a/src/layer/arm/convolution_3x3_pack8_fp16s.h +++ b/src/layer/arm/convolution_3x3_pack8_fp16s.h @@ -1207,3 +1207,982 @@ static void conv3x3s1_winograd64_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); } +static void conv3x3s2_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = (w - 2 * outw + w) * 8; + + const __fp16* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16(0.f); + out0.fill(_bias0); + + for (int q = 0; q < inch; q++) + { + __fp16* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const __fp16* r0 = img0.row(0); + const __fp16* r1 = img0.row(1); + const __fp16* r2 = img0.row(2); + + const __fp16* kptr = kernel.channel(p).row(q); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + asm volatile( + "prfm pldl1keep, [%1, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03 + + "prfm pldl1keep, [%0, #512] \n" + "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "prfm pldl1keep, [%1, #512] \n" + "ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n" // r04 r05 r06 r07 + + "fmla v28.8h, v16.8h, v0.h[0] \n" + "fmla v29.8h, v16.8h, v2.h[0] \n" + "fmla v30.8h, v16.8h, v4.h[0] \n" + "fmla v31.8h, v16.8h, v6.h[0] \n" + + "fmla v28.8h, v17.8h, v0.h[1] \n" + "fmla v29.8h, v17.8h, v2.h[1] \n" + "fmla v30.8h, v17.8h, v4.h[1] \n" + "fmla v31.8h, v17.8h, v6.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v0.h[2] \n" + "fmla v29.8h, v18.8h, v2.h[2] \n" + "fmla v30.8h, v18.8h, v4.h[2] \n" + "fmla v31.8h, v18.8h, v6.h[2] \n" + + "fmla v28.8h, v19.8h, v0.h[3] \n" + "fmla v29.8h, v19.8h, v2.h[3] \n" + "fmla v30.8h, v19.8h, v4.h[3] \n" + "fmla v31.8h, v19.8h, v6.h[3] \n" + + "fmla v28.8h, v20.8h, v0.h[4] \n" + "fmla v29.8h, v20.8h, v2.h[4] \n" + "fmla v30.8h, v20.8h, v4.h[4] \n" + "fmla v31.8h, v20.8h, v6.h[4] \n" + + "fmla v28.8h, v21.8h, v0.h[5] \n" + "fmla v29.8h, v21.8h, v2.h[5] \n" + "fmla v30.8h, v21.8h, v4.h[5] \n" + "fmla v31.8h, v21.8h, v6.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v0.h[6] \n" + "fmla v29.8h, v22.8h, v2.h[6] \n" + "fmla v30.8h, v22.8h, v4.h[6] \n" + "fmla v31.8h, v22.8h, v6.h[6] \n" + + "fmla v28.8h, v23.8h, v0.h[7] \n" + "fmla v29.8h, v23.8h, v2.h[7] \n" + "fmla v30.8h, v23.8h, v4.h[7] \n" + "fmla v31.8h, v23.8h, v6.h[7] \n" + + "fmla v28.8h, v16.8h, v1.h[0] \n" + "fmla v29.8h, v16.8h, v3.h[0] \n" + "fmla v30.8h, v16.8h, v5.h[0] \n" + "fmla v31.8h, v16.8h, v7.h[0] \n" + + "fmla v28.8h, v17.8h, v1.h[1] \n" + "fmla v29.8h, v17.8h, v3.h[1] \n" + "fmla v30.8h, v17.8h, v5.h[1] \n" + "fmla v31.8h, v17.8h, v7.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v1.h[2] \n" + "fmla v29.8h, v18.8h, v3.h[2] \n" + "fmla v30.8h, v18.8h, v5.h[2] \n" + "fmla v31.8h, v18.8h, v7.h[2] \n" + + "fmla v28.8h, v19.8h, v1.h[3] \n" + "fmla v29.8h, v19.8h, v3.h[3] \n" + "fmla v30.8h, v19.8h, v5.h[3] \n" + "fmla v31.8h, v19.8h, v7.h[3] \n" + + "fmla v28.8h, v20.8h, v1.h[4] \n" + "fmla v29.8h, v20.8h, v3.h[4] \n" + "fmla v30.8h, v20.8h, v5.h[4] \n" + "fmla v31.8h, v20.8h, v7.h[4] \n" + + "fmla v28.8h, v21.8h, v1.h[5] \n" + "fmla v29.8h, v21.8h, v3.h[5] \n" + "fmla v30.8h, v21.8h, v5.h[5] \n" + "fmla v31.8h, v21.8h, v7.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v1.h[6] \n" + "fmla v29.8h, v22.8h, v3.h[6] \n" + "fmla v30.8h, v22.8h, v5.h[6] \n" + "fmla v31.8h, v22.8h, v7.h[6] \n" + + "fmla v28.8h, v23.8h, v1.h[7] \n" + "fmla v29.8h, v23.8h, v3.h[7] \n" + "fmla v30.8h, v23.8h, v5.h[7] \n" + "fmla v31.8h, v23.8h, v7.h[7] \n" + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.8h}, [%1] \n" // r08 + + "fmla v28.8h, v16.8h, v2.h[0] \n" + "fmla v29.8h, v16.8h, v4.h[0] \n" + "fmla v30.8h, v16.8h, v6.h[0] \n" + "fmla v31.8h, v16.8h, v0.h[0] \n" + + "fmla v28.8h, v17.8h, v2.h[1] \n" + "fmla v29.8h, v17.8h, v4.h[1] \n" + "fmla v30.8h, v17.8h, v6.h[1] \n" + "fmla v31.8h, v17.8h, v0.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v2.h[2] \n" + "fmla v29.8h, v18.8h, v4.h[2] \n" + "fmla v30.8h, v18.8h, v6.h[2] \n" + "fmla v31.8h, v18.8h, v0.h[2] \n" + + "fmla v28.8h, v19.8h, v2.h[3] \n" + "fmla v29.8h, v19.8h, v4.h[3] \n" + "fmla v30.8h, v19.8h, v6.h[3] \n" + "fmla v31.8h, v19.8h, v0.h[3] \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [%2], #64 \n" // r10 r11 r12 r13 + + "fmla v28.8h, v20.8h, v2.h[4] \n" + "fmla v29.8h, v20.8h, v4.h[4] \n" + "fmla v30.8h, v20.8h, v6.h[4] \n" + "fmla v31.8h, v20.8h, v0.h[4] \n" + + "fmla v28.8h, v21.8h, v2.h[5] \n" + "fmla v29.8h, v21.8h, v4.h[5] \n" + "fmla v30.8h, v21.8h, v6.h[5] \n" + "fmla v31.8h, v21.8h, v0.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v2.h[6] \n" + "fmla v29.8h, v22.8h, v4.h[6] \n" + "fmla v30.8h, v22.8h, v6.h[6] \n" + "fmla v31.8h, v22.8h, v0.h[6] \n" + + "fmla v28.8h, v23.8h, v2.h[7] \n" + "fmla v29.8h, v23.8h, v4.h[7] \n" + "fmla v30.8h, v23.8h, v6.h[7] \n" + "fmla v31.8h, v23.8h, v0.h[7] \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [%2], #64 \n" // r14 r15 r16 r17 + + "fmla v28.8h, v16.8h, v8.h[0] \n" + "fmla v29.8h, v16.8h, v10.h[0] \n" + "fmla v30.8h, v16.8h, v12.h[0] \n" + "fmla v31.8h, v16.8h, v14.h[0] \n" + + "fmla v28.8h, v17.8h, v8.h[1] \n" + "fmla v29.8h, v17.8h, v10.h[1] \n" + "fmla v30.8h, v17.8h, v12.h[1] \n" + "fmla v31.8h, v17.8h, v14.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v8.h[2] \n" + "fmla v29.8h, v18.8h, v10.h[2] \n" + "fmla v30.8h, v18.8h, v12.h[2] \n" + "fmla v31.8h, v18.8h, v14.h[2] \n" + + "fmla v28.8h, v19.8h, v8.h[3] \n" + "fmla v29.8h, v19.8h, v10.h[3] \n" + "fmla v30.8h, v19.8h, v12.h[3] \n" + "fmla v31.8h, v19.8h, v14.h[3] \n" + + "fmla v28.8h, v20.8h, v8.h[4] \n" + "fmla v29.8h, v20.8h, v10.h[4] \n" + "fmla v30.8h, v20.8h, v12.h[4] \n" + "fmla v31.8h, v20.8h, v14.h[4] \n" + + "fmla v28.8h, v21.8h, v8.h[5] \n" + "fmla v29.8h, v21.8h, v10.h[5] \n" + "fmla v30.8h, v21.8h, v12.h[5] \n" + "fmla v31.8h, v21.8h, v14.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v8.h[6] \n" + "fmla v29.8h, v22.8h, v10.h[6] \n" + "fmla v30.8h, v22.8h, v12.h[6] \n" + "fmla v31.8h, v22.8h, v14.h[6] \n" + + "fmla v28.8h, v23.8h, v8.h[7] \n" + "fmla v29.8h, v23.8h, v10.h[7] \n" + "fmla v30.8h, v23.8h, v12.h[7] \n" + "fmla v31.8h, v23.8h, v14.h[7] \n" + + "fmla v28.8h, v16.8h, v9.h[0] \n" + "fmla v29.8h, v16.8h, v11.h[0] \n" + "fmla v30.8h, v16.8h, v13.h[0] \n" + "fmla v31.8h, v16.8h, v15.h[0] \n" + + "fmla v28.8h, v17.8h, v9.h[1] \n" + "fmla v29.8h, v17.8h, v11.h[1] \n" + "fmla v30.8h, v17.8h, v13.h[1] \n" + "fmla v31.8h, v17.8h, v15.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v9.h[2] \n" + "fmla v29.8h, v18.8h, v11.h[2] \n" + "fmla v30.8h, v18.8h, v13.h[2] \n" + "fmla v31.8h, v18.8h, v15.h[2] \n" + + "fmla v28.8h, v19.8h, v9.h[3] \n" + "fmla v29.8h, v19.8h, v11.h[3] \n" + "fmla v30.8h, v19.8h, v13.h[3] \n" + "fmla v31.8h, v19.8h, v15.h[3] \n" + + "fmla v28.8h, v20.8h, v9.h[4] \n" + "fmla v29.8h, v20.8h, v11.h[4] \n" + "fmla v30.8h, v20.8h, v13.h[4] \n" + "fmla v31.8h, v20.8h, v15.h[4] \n" + + "fmla v28.8h, v21.8h, v9.h[5] \n" + "fmla v29.8h, v21.8h, v11.h[5] \n" + "fmla v30.8h, v21.8h, v13.h[5] \n" + "fmla v31.8h, v21.8h, v15.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v9.h[6] \n" + "fmla v29.8h, v22.8h, v11.h[6] \n" + "fmla v30.8h, v22.8h, v13.h[6] \n" + "fmla v31.8h, v22.8h, v15.h[6] \n" + + "fmla v28.8h, v23.8h, v9.h[7] \n" + "fmla v29.8h, v23.8h, v11.h[7] \n" + "fmla v30.8h, v23.8h, v13.h[7] \n" + "fmla v31.8h, v23.8h, v15.h[7] \n" + + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v8.8h}, [%2] \n" // r18 + + "fmla v28.8h, v16.8h, v10.h[0] \n" + "fmla v29.8h, v16.8h, v12.h[0] \n" + "fmla v30.8h, v16.8h, v14.h[0] \n" + "fmla v31.8h, v16.8h, v8.h[0] \n" + + "fmla v28.8h, v17.8h, v10.h[1] \n" + "fmla v29.8h, v17.8h, v12.h[1] \n" + "fmla v30.8h, v17.8h, v14.h[1] \n" + "fmla v31.8h, v17.8h, v8.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v10.h[2] \n" + "fmla v29.8h, v18.8h, v12.h[2] \n" + "fmla v30.8h, v18.8h, v14.h[2] \n" + "fmla v31.8h, v18.8h, v8.h[2] \n" + + "fmla v28.8h, v19.8h, v10.h[3] \n" + "fmla v29.8h, v19.8h, v12.h[3] \n" + "fmla v30.8h, v19.8h, v14.h[3] \n" + "fmla v31.8h, v19.8h, v8.h[3] \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23 + + "fmla v28.8h, v20.8h, v10.h[4] \n" + "fmla v29.8h, v20.8h, v12.h[4] \n" + "fmla v30.8h, v20.8h, v14.h[4] \n" + "fmla v31.8h, v20.8h, v8.h[4] \n" + + "fmla v28.8h, v21.8h, v10.h[5] \n" + "fmla v29.8h, v21.8h, v12.h[5] \n" + "fmla v30.8h, v21.8h, v14.h[5] \n" + "fmla v31.8h, v21.8h, v8.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v10.h[6] \n" + "fmla v29.8h, v22.8h, v12.h[6] \n" + "fmla v30.8h, v22.8h, v14.h[6] \n" + "fmla v31.8h, v22.8h, v8.h[6] \n" + + "fmla v28.8h, v23.8h, v10.h[7] \n" + "fmla v29.8h, v23.8h, v12.h[7] \n" + "fmla v30.8h, v23.8h, v14.h[7] \n" + "fmla v31.8h, v23.8h, v8.h[7] \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%3], #64 \n" // r24 r25 r26 r27 + + "fmla v28.8h, v16.8h, v0.h[0] \n" + "fmla v29.8h, v16.8h, v2.h[0] \n" + "fmla v30.8h, v16.8h, v4.h[0] \n" + "fmla v31.8h, v16.8h, v6.h[0] \n" + + "fmla v28.8h, v17.8h, v0.h[1] \n" + "fmla v29.8h, v17.8h, v2.h[1] \n" + "fmla v30.8h, v17.8h, v4.h[1] \n" + "fmla v31.8h, v17.8h, v6.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v0.h[2] \n" + "fmla v29.8h, v18.8h, v2.h[2] \n" + "fmla v30.8h, v18.8h, v4.h[2] \n" + "fmla v31.8h, v18.8h, v6.h[2] \n" + + "fmla v28.8h, v19.8h, v0.h[3] \n" + "fmla v29.8h, v19.8h, v2.h[3] \n" + "fmla v30.8h, v19.8h, v4.h[3] \n" + "fmla v31.8h, v19.8h, v6.h[3] \n" + + "fmla v28.8h, v20.8h, v0.h[4] \n" + "fmla v29.8h, v20.8h, v2.h[4] \n" + "fmla v30.8h, v20.8h, v4.h[4] \n" + "fmla v31.8h, v20.8h, v6.h[4] \n" + + "fmla v28.8h, v21.8h, v0.h[5] \n" + "fmla v29.8h, v21.8h, v2.h[5] \n" + "fmla v30.8h, v21.8h, v4.h[5] \n" + "fmla v31.8h, v21.8h, v6.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v0.h[6] \n" + "fmla v29.8h, v22.8h, v2.h[6] \n" + "fmla v30.8h, v22.8h, v4.h[6] \n" + "fmla v31.8h, v22.8h, v6.h[6] \n" + + "fmla v28.8h, v23.8h, v0.h[7] \n" + "fmla v29.8h, v23.8h, v2.h[7] \n" + "fmla v30.8h, v23.8h, v4.h[7] \n" + "fmla v31.8h, v23.8h, v6.h[7] \n" + + "fmla v28.8h, v16.8h, v1.h[0] \n" + "fmla v29.8h, v16.8h, v3.h[0] \n" + "fmla v30.8h, v16.8h, v5.h[0] \n" + "fmla v31.8h, v16.8h, v7.h[0] \n" + + "fmla v28.8h, v17.8h, v1.h[1] \n" + "fmla v29.8h, v17.8h, v3.h[1] \n" + "fmla v30.8h, v17.8h, v5.h[1] \n" + "fmla v31.8h, v17.8h, v7.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v1.h[2] \n" + "fmla v29.8h, v18.8h, v3.h[2] \n" + "fmla v30.8h, v18.8h, v5.h[2] \n" + "fmla v31.8h, v18.8h, v7.h[2] \n" + + "fmla v28.8h, v19.8h, v1.h[3] \n" + "fmla v29.8h, v19.8h, v3.h[3] \n" + "fmla v30.8h, v19.8h, v5.h[3] \n" + "fmla v31.8h, v19.8h, v7.h[3] \n" + + "fmla v28.8h, v20.8h, v1.h[4] \n" + "fmla v29.8h, v20.8h, v3.h[4] \n" + "fmla v30.8h, v20.8h, v5.h[4] \n" + "fmla v31.8h, v20.8h, v7.h[4] \n" + + "fmla v28.8h, v21.8h, v1.h[5] \n" + "fmla v29.8h, v21.8h, v3.h[5] \n" + "fmla v30.8h, v21.8h, v5.h[5] \n" + "fmla v31.8h, v21.8h, v7.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v1.h[6] \n" + "fmla v29.8h, v22.8h, v3.h[6] \n" + "fmla v30.8h, v22.8h, v5.h[6] \n" + "fmla v31.8h, v22.8h, v7.h[6] \n" + + "fmla v28.8h, v23.8h, v1.h[7] \n" + "fmla v29.8h, v23.8h, v3.h[7] \n" + "fmla v30.8h, v23.8h, v5.h[7] \n" + "fmla v31.8h, v23.8h, v7.h[7] \n" + + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v0.8h}, [%3] \n" // r28 + + "fmla v28.8h, v16.8h, v2.h[0] \n" + "fmla v29.8h, v16.8h, v4.h[0] \n" + "fmla v30.8h, v16.8h, v6.h[0] \n" + "fmla v31.8h, v16.8h, v0.h[0] \n" + + "fmla v28.8h, v17.8h, v2.h[1] \n" + "fmla v29.8h, v17.8h, v4.h[1] \n" + "fmla v30.8h, v17.8h, v6.h[1] \n" + "fmla v31.8h, v17.8h, v0.h[1] \n" + + // "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n" + + "fmla v28.8h, v18.8h, v2.h[2] \n" + "fmla v29.8h, v18.8h, v4.h[2] \n" + "fmla v30.8h, v18.8h, v6.h[2] \n" + "fmla v31.8h, v18.8h, v0.h[2] \n" + + "fmla v28.8h, v19.8h, v2.h[3] \n" + "fmla v29.8h, v19.8h, v4.h[3] \n" + "fmla v30.8h, v19.8h, v6.h[3] \n" + "fmla v31.8h, v19.8h, v0.h[3] \n" + + "fmla v28.8h, v20.8h, v2.h[4] \n" + "fmla v29.8h, v20.8h, v4.h[4] \n" + "fmla v30.8h, v20.8h, v6.h[4] \n" + "fmla v31.8h, v20.8h, v0.h[4] \n" + + "fmla v28.8h, v21.8h, v2.h[5] \n" + "fmla v29.8h, v21.8h, v4.h[5] \n" + "fmla v30.8h, v21.8h, v6.h[5] \n" + "fmla v31.8h, v21.8h, v0.h[5] \n" + + "fmla v28.8h, v22.8h, v2.h[6] \n" + "fmla v29.8h, v22.8h, v4.h[6] \n" + "fmla v30.8h, v22.8h, v6.h[6] \n" + "fmla v31.8h, v22.8h, v0.h[6] \n" + + "fmla v28.8h, v23.8h, v2.h[7] \n" + "fmla v29.8h, v23.8h, v4.h[7] \n" + "fmla v30.8h, v23.8h, v6.h[7] \n" + "fmla v31.8h, v23.8h, v0.h[7] \n" + + "sub %4, %4, #1088 \n" // kptr -= 8.5 * 64; + + "st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n" + + : "=r"(outptr0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(kptr) // %4 + : "0"(outptr0), + "1"(r0), + "2"(r1), + "3"(r2), + "4"(kptr) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31" + ); + } + for (; j + 1 < outw; j += 2) + { + asm volatile( + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "prfm pldl1keep, [%1, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03 + + "prfm pldl1keep, [%0, #256] \n" + "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 + + "fmul v28.8h, v16.8h, v0.h[0] \n" + "fmul v29.8h, v16.8h, v2.h[0] \n" + "fmla v30.8h, v17.8h, v0.h[1] \n" + "fmla v31.8h, v17.8h, v2.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v0.h[2] \n" + "fmla v29.8h, v18.8h, v2.h[2] \n" + "fmla v30.8h, v19.8h, v0.h[3] \n" + "fmla v31.8h, v19.8h, v2.h[3] \n" + "fmla v28.8h, v20.8h, v0.h[4] \n" + "fmla v29.8h, v20.8h, v2.h[4] \n" + "fmla v30.8h, v21.8h, v0.h[5] \n" + "fmla v31.8h, v21.8h, v2.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v0.h[6] \n" + "fmla v29.8h, v22.8h, v2.h[6] \n" + "fmla v30.8h, v23.8h, v0.h[7] \n" + "fmla v31.8h, v23.8h, v2.h[7] \n" + + "fmla v28.8h, v16.8h, v1.h[0] \n" + "fmla v29.8h, v16.8h, v3.h[0] \n" + "fmla v30.8h, v17.8h, v1.h[1] \n" + "fmla v31.8h, v17.8h, v3.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v1.h[2] \n" + "fmla v29.8h, v18.8h, v3.h[2] \n" + "fmla v30.8h, v19.8h, v1.h[3] \n" + "fmla v31.8h, v19.8h, v3.h[3] \n" + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.8h}, [%1] \n" // r04 + + "fmla v28.8h, v20.8h, v1.h[4] \n" + "fmla v29.8h, v20.8h, v3.h[4] \n" + "fmla v30.8h, v21.8h, v1.h[5] \n" + "fmla v31.8h, v21.8h, v3.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v1.h[6] \n" + "fmla v29.8h, v22.8h, v3.h[6] \n" + "fmla v30.8h, v23.8h, v1.h[7] \n" + "fmla v31.8h, v23.8h, v3.h[7] \n" + + "fmla v28.8h, v16.8h, v2.h[0] \n" + "fmla v29.8h, v16.8h, v0.h[0] \n" + "fmla v30.8h, v17.8h, v2.h[1] \n" + "fmla v31.8h, v17.8h, v0.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v2.h[2] \n" + "fmla v29.8h, v18.8h, v0.h[2] \n" + "fmla v30.8h, v19.8h, v2.h[3] \n" + "fmla v31.8h, v19.8h, v0.h[3] \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n" // r10 r11 r12 r13 + + "fmla v28.8h, v20.8h, v2.h[4] \n" + "fmla v29.8h, v20.8h, v0.h[4] \n" + "fmla v30.8h, v21.8h, v2.h[5] \n" + "fmla v31.8h, v21.8h, v0.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v2.h[6] \n" + "fmla v29.8h, v22.8h, v0.h[6] \n" + "fmla v30.8h, v23.8h, v2.h[7] \n" + "fmla v31.8h, v23.8h, v0.h[7] \n" + + "fmla v28.8h, v16.8h, v4.h[0] \n" + "fmla v29.8h, v16.8h, v6.h[0] \n" + "fmla v30.8h, v17.8h, v4.h[1] \n" + "fmla v31.8h, v17.8h, v6.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v4.h[2] \n" + "fmla v29.8h, v18.8h, v6.h[2] \n" + "fmla v30.8h, v19.8h, v4.h[3] \n" + "fmla v31.8h, v19.8h, v6.h[3] \n" + "fmla v28.8h, v20.8h, v4.h[4] \n" + "fmla v29.8h, v20.8h, v6.h[4] \n" + "fmla v30.8h, v21.8h, v4.h[5] \n" + "fmla v31.8h, v21.8h, v6.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v4.h[6] \n" + "fmla v29.8h, v22.8h, v6.h[6] \n" + "fmla v30.8h, v23.8h, v4.h[7] \n" + "fmla v31.8h, v23.8h, v6.h[7] \n" + + "fmla v28.8h, v16.8h, v5.h[0] \n" + "fmla v29.8h, v16.8h, v7.h[0] \n" + "fmla v30.8h, v17.8h, v5.h[1] \n" + "fmla v31.8h, v17.8h, v7.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v5.h[2] \n" + "fmla v29.8h, v18.8h, v7.h[2] \n" + "fmla v30.8h, v19.8h, v5.h[3] \n" + "fmla v31.8h, v19.8h, v7.h[3] \n" + + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v4.8h}, [%2] \n" // r14 + + "fmla v28.8h, v20.8h, v5.h[4] \n" + "fmla v29.8h, v20.8h, v7.h[4] \n" + "fmla v30.8h, v21.8h, v5.h[5] \n" + "fmla v31.8h, v21.8h, v7.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v5.h[6] \n" + "fmla v29.8h, v22.8h, v7.h[6] \n" + "fmla v30.8h, v23.8h, v5.h[7] \n" + "fmla v31.8h, v23.8h, v7.h[7] \n" + + "fmla v28.8h, v16.8h, v6.h[0] \n" + "fmla v29.8h, v16.8h, v4.h[0] \n" + "fmla v30.8h, v17.8h, v6.h[1] \n" + "fmla v31.8h, v17.8h, v4.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v6.h[2] \n" + "fmla v29.8h, v18.8h, v4.h[2] \n" + "fmla v30.8h, v19.8h, v6.h[3] \n" + "fmla v31.8h, v19.8h, v4.h[3] \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23 + + "fmla v28.8h, v20.8h, v6.h[4] \n" + "fmla v29.8h, v20.8h, v4.h[4] \n" + "fmla v30.8h, v21.8h, v6.h[5] \n" + "fmla v31.8h, v21.8h, v4.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v6.h[6] \n" + "fmla v29.8h, v22.8h, v4.h[6] \n" + "fmla v30.8h, v23.8h, v6.h[7] \n" + "fmla v31.8h, v23.8h, v4.h[7] \n" + + "fmla v28.8h, v16.8h, v0.h[0] \n" + "fmla v29.8h, v16.8h, v2.h[0] \n" + "fmla v30.8h, v17.8h, v0.h[1] \n" + "fmla v31.8h, v17.8h, v2.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v0.h[2] \n" + "fmla v29.8h, v18.8h, v2.h[2] \n" + "fmla v30.8h, v19.8h, v0.h[3] \n" + "fmla v31.8h, v19.8h, v2.h[3] \n" + "fmla v28.8h, v20.8h, v0.h[4] \n" + "fmla v29.8h, v20.8h, v2.h[4] \n" + "fmla v30.8h, v21.8h, v0.h[5] \n" + "fmla v31.8h, v21.8h, v2.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v0.h[6] \n" + "fmla v29.8h, v22.8h, v2.h[6] \n" + "fmla v30.8h, v23.8h, v0.h[7] \n" + "fmla v31.8h, v23.8h, v2.h[7] \n" + + "fmla v28.8h, v16.8h, v1.h[0] \n" + "fmla v29.8h, v16.8h, v3.h[0] \n" + "fmla v30.8h, v17.8h, v1.h[1] \n" + "fmla v31.8h, v17.8h, v3.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v28.8h, v18.8h, v1.h[2] \n" + "fmla v29.8h, v18.8h, v3.h[2] \n" + "fmla v30.8h, v19.8h, v1.h[3] \n" + "fmla v31.8h, v19.8h, v3.h[3] \n" + + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v0.8h}, [%3] \n" // r24 + + "fmla v28.8h, v20.8h, v1.h[4] \n" + "fmla v29.8h, v20.8h, v3.h[4] \n" + "fmla v30.8h, v21.8h, v1.h[5] \n" + "fmla v31.8h, v21.8h, v3.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v28.8h, v22.8h, v1.h[6] \n" + "fmla v29.8h, v22.8h, v3.h[6] \n" + "fmla v30.8h, v23.8h, v1.h[7] \n" + "fmla v31.8h, v23.8h, v3.h[7] \n" + + "fmla v28.8h, v16.8h, v2.h[0] \n" + "fmla v29.8h, v16.8h, v0.h[0] \n" + "fmla v30.8h, v17.8h, v2.h[1] \n" + "fmla v31.8h, v17.8h, v0.h[1] \n" + + // "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n" + + "fmla v28.8h, v18.8h, v2.h[2] \n" + "fmla v29.8h, v18.8h, v0.h[2] \n" + "fmla v30.8h, v19.8h, v2.h[3] \n" + "fmla v31.8h, v19.8h, v0.h[3] \n" + "fmla v28.8h, v20.8h, v2.h[4] \n" + "fmla v29.8h, v20.8h, v0.h[4] \n" + "fmla v30.8h, v21.8h, v2.h[5] \n" + "fmla v31.8h, v21.8h, v0.h[5] \n" + "fmla v28.8h, v22.8h, v2.h[6] \n" + "fmla v29.8h, v22.8h, v0.h[6] \n" + "fmla v30.8h, v23.8h, v2.h[7] \n" + "fmla v31.8h, v23.8h, v0.h[7] \n" + + "fadd v28.8h, v28.8h, v30.8h \n" + "fadd v29.8h, v29.8h, v31.8h \n" + + "sub %4, %4, #1088 \n" // kptr -= 8.5 * 64; + + "st1 {v28.8h, v29.8h}, [%0], #32 \n" + + : "=r"(outptr0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(kptr) // %4 + : "0"(outptr0), + "1"(r0), + "2"(r1), + "3"(r2), + "4"(kptr) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31" + ); + } + for (; j < outw; j++) + { + asm volatile( + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "prfm pldl1keep, [%1, #384] \n" + "ld1 {v0.8h, v1.8h, v2.8h}, [%1] \n" // r00 r01 r02 + + "prfm pldl1keep, [%0, #128] \n" + "ld1 {v31.8h}, [%0] \n" // sum0 + + "fmul v28.8h, v16.8h, v0.h[0] \n" + "fmul v29.8h, v17.8h, v0.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmul v30.8h, v18.8h, v0.h[2] \n" + "fmla v31.8h, v19.8h, v0.h[3] \n" + "fmla v28.8h, v20.8h, v0.h[4] \n" + "fmla v29.8h, v21.8h, v0.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v0.h[6] \n" + "fmla v31.8h, v23.8h, v0.h[7] \n" + + "fmla v28.8h, v16.8h, v1.h[0] \n" + "fmla v29.8h, v17.8h, v1.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v1.h[2] \n" + "fmla v31.8h, v19.8h, v1.h[3] \n" + "fmla v28.8h, v20.8h, v1.h[4] \n" + "fmla v29.8h, v21.8h, v1.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v1.h[6] \n" + "fmla v31.8h, v23.8h, v1.h[7] \n" + + "fmla v28.8h, v16.8h, v2.h[0] \n" + "fmla v29.8h, v17.8h, v2.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v2.h[2] \n" + "fmla v31.8h, v19.8h, v2.h[3] \n" + + "prfm pldl1keep, [%2, #384] \n" + "ld1 {v3.8h, v4.8h, v5.8h}, [%2] \n" // r10 r11 r12 + + "fmla v28.8h, v20.8h, v2.h[4] \n" + "fmla v29.8h, v21.8h, v2.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v2.h[6] \n" + "fmla v31.8h, v23.8h, v2.h[7] \n" + + "fmla v28.8h, v16.8h, v3.h[0] \n" + "fmla v29.8h, v17.8h, v3.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v3.h[2] \n" + "fmla v31.8h, v19.8h, v3.h[3] \n" + "fmla v28.8h, v20.8h, v3.h[4] \n" + "fmla v29.8h, v21.8h, v3.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v3.h[6] \n" + "fmla v31.8h, v23.8h, v3.h[7] \n" + + "fmla v28.8h, v16.8h, v4.h[0] \n" + "fmla v29.8h, v17.8h, v4.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v4.h[2] \n" + "fmla v31.8h, v19.8h, v4.h[3] \n" + "fmla v28.8h, v20.8h, v4.h[4] \n" + "fmla v29.8h, v21.8h, v4.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v4.h[6] \n" + "fmla v31.8h, v23.8h, v4.h[7] \n" + + "fmla v28.8h, v16.8h, v5.h[0] \n" + "fmla v29.8h, v17.8h, v5.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v5.h[2] \n" + "fmla v31.8h, v19.8h, v5.h[3] \n" + + "prfm pldl1keep, [%3, #384] \n" + "ld1 {v0.8h, v1.8h, v2.8h}, [%3] \n" // r20 r21 r22 + + "fmla v28.8h, v20.8h, v5.h[4] \n" + "fmla v29.8h, v21.8h, v5.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v5.h[6] \n" + "fmla v31.8h, v23.8h, v5.h[7] \n" + + "fmla v28.8h, v16.8h, v0.h[0] \n" + "fmla v29.8h, v17.8h, v0.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v0.h[2] \n" + "fmla v31.8h, v19.8h, v0.h[3] \n" + "fmla v28.8h, v20.8h, v0.h[4] \n" + "fmla v29.8h, v21.8h, v0.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v0.h[6] \n" + "fmla v31.8h, v23.8h, v0.h[7] \n" + + "fmla v28.8h, v16.8h, v1.h[0] \n" + "fmla v29.8h, v17.8h, v1.h[1] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" + + "fmla v30.8h, v18.8h, v1.h[2] \n" + "fmla v31.8h, v19.8h, v1.h[3] \n" + "fmla v28.8h, v20.8h, v1.h[4] \n" + "fmla v29.8h, v21.8h, v1.h[5] \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" + + "fmla v30.8h, v22.8h, v1.h[6] \n" + "fmla v31.8h, v23.8h, v1.h[7] \n" + + "fmla v28.8h, v16.8h, v2.h[0] \n" + "fmla v29.8h, v17.8h, v2.h[1] \n" + + // "prfm pldl1keep, [%4, #512] \n" + "ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n" + + "fmla v30.8h, v18.8h, v2.h[2] \n" + "fmla v31.8h, v19.8h, v2.h[3] \n" + "fmla v28.8h, v20.8h, v2.h[4] \n" + "fmla v29.8h, v21.8h, v2.h[5] \n" + + "add %1, %1, #32 \n" + + "fmla v30.8h, v22.8h, v2.h[6] \n" + "fmla v31.8h, v23.8h, v2.h[7] \n" + + "add %2, %2, #32 \n" + + "fadd v28.8h, v28.8h, v29.8h \n" + "fadd v30.8h, v30.8h, v31.8h \n" + + "add %3, %3, #32 \n" + + "fadd v28.8h, v28.8h, v30.8h \n" + + "sub %4, %4, #1088 \n" // kptr -= 8.5 * 64; + + "st1 {v28.8h}, [%0], #16 \n" + + : "=r"(outptr0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(kptr) // %4 + : "0"(outptr0), + "1"(r0), + "2"(r1), + "3"(r2), + "4"(kptr) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31" + ); + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + } +} diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index a2b91fc2c..36c8c6e91 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -1467,6 +1467,15 @@ int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const activation->forward_inplace(top_blob, opt); } } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_fp16, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } else { // num_output