diff --git a/src/layer/arm/absval_arm.cpp b/src/layer/arm/absval_arm.cpp index 990109086..da5142ad6 100644 --- a/src/layer/arm/absval_arm.cpp +++ b/src/layer/arm/absval_arm.cpp @@ -43,13 +43,22 @@ int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _p = vld1q_f32(ptr); - _p = vabsq_f32(_p); - vst1q_f32(ptr, _p); - - ptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fabs v0.4s, v0.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr) + : "cc", "memory", "v0" + ); } #else if (nn > 0) diff --git a/src/layer/arm/batchnorm_arm.cpp b/src/layer/arm/batchnorm_arm.cpp index 0a9ab8adc..eb25a17e6 100644 --- a/src/layer/arm/batchnorm_arm.cpp +++ b/src/layer/arm/batchnorm_arm.cpp @@ -51,16 +51,27 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const #if __ARM_NEON #if __aarch64__ - float32x4_t _a = vdupq_n_f32(a); - float32x4_t _b = vdupq_n_f32(b); - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _p = vld1q_f32(ptr); - float32x4_t _outp = _a; - _outp = vfmaq_f32(_outp, _p, _b); - vst1q_f32(ptr, _outp); - - ptr += 4; + asm volatile( + "dup v1.4s, %w4 \n" + "dup v2.4s, %w5 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "orr v3.16b, v1.16b, v1.16b \n" + "fmla v3.4s, v0.4s, v2.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v3.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr), + "r"(a), // %4 + "r"(b) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" + ); } #else if (nn > 0) diff --git a/src/layer/arm/convolution_1x1.h b/src/layer/arm/convolution_1x1.h index c872c9f07..97e002f45 100644 --- a/src/layer/arm/convolution_1x1.h +++ b/src/layer/arm/convolution_1x1.h @@ -1027,52 +1027,62 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke float32x4_t _k2 = vdupq_n_f32(k2); float32x4_t _k3 = vdupq_n_f32(k3); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _p = vld1q_f32(r0); - float32x4_t _pn = vld1q_f32(r0+4); - - float32x4_t _out0p = vld1q_f32(outptr0); - float32x4_t _out0pn = vld1q_f32(outptr0+4); - - float32x4_t _out1p = vld1q_f32(outptr1); - float32x4_t _out1pn = vld1q_f32(outptr1+4); - - float32x4_t _out2p = vld1q_f32(outptr2); - float32x4_t _out2pn = vld1q_f32(outptr2+4); - - float32x4_t _out3p = vld1q_f32(outptr3); - float32x4_t _out3pn = vld1q_f32(outptr3+4); - - _out0p = vfmaq_f32(_out0p, _p, _k0); - _out0pn = vfmaq_f32(_out0pn, _pn, _k0); - - _out1p = vfmaq_f32(_out1p, _p, _k1); - _out1pn = vfmaq_f32(_out1pn, _pn, _k1); - - _out2p = vfmaq_f32(_out2p, _p, _k2); - _out2pn = vfmaq_f32(_out2pn, _pn, _k2); - - _out3p = vfmaq_f32(_out3p, _p, _k3); - _out3pn = vfmaq_f32(_out3pn, _pn, _k3); - - vst1q_f32(outptr0, _out0p); - vst1q_f32(outptr0+4, _out0pn); - - vst1q_f32(outptr1, _out1p); - vst1q_f32(outptr1+4, _out1pn); - - vst1q_f32(outptr2, _out2p); - vst1q_f32(outptr2+4, _out2pn); - - vst1q_f32(outptr3, _out3p); - vst1q_f32(outptr3+4, _out3pn); - - r0 += 8; - outptr0 += 8; - outptr1 += 8; - outptr2 += 8; - outptr3 += 8; + asm volatile( + "prfm pldl1keep, [%5, #256] \n" + "ld1 {v6.4s, v7.4s}, [%5], #32 \n" + "0: \n" + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v8.4s, v9.4s}, [%1] \n" + "fmla v8.4s, v6.4s, %12.4s \n" + "fmla v9.4s, v7.4s, %12.4s \n" + + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v10.4s, v11.4s}, [%2] \n" + "fmla v10.4s, v6.4s, %13.4s \n" + "fmla v11.4s, v7.4s, %13.4s \n" + + "st1 {v8.4s, v9.4s}, [%1], #32 \n" + + "prfm pldl1keep, [%3, #256] \n" + "ld1 {v12.4s, v13.4s}, [%3] \n" + "fmla v12.4s, v6.4s, %14.4s \n" + "fmla v13.4s, v7.4s, %14.4s \n" + + "st1 {v10.4s, v11.4s}, [%2], #32 \n" + + "prfm pldl1keep, [%4, #256] \n" + "ld1 {v14.4s, v15.4s}, [%4] \n" + "fmla v14.4s, v6.4s, %15.4s \n" + "fmla v15.4s, v7.4s, %15.4s \n" + + "st1 {v12.4s, v13.4s}, [%3], #32 \n" + + "prfm pldl1keep, [%5, #256] \n" + "ld1 {v6.4s, v7.4s}, [%5], #32 \n" + "subs %w0, %w0, #1 \n" + "st1 {v14.4s, v15.4s}, [%4], #32 \n" + "bne 0b \n" + "sub %5, %5, #32 \n" + : "=r"(nn), // %0 + "=r"(outptr0),// %1 + "=r"(outptr1),// %2 + "=r"(outptr2),// %3 + "=r"(outptr3),// %4 + "=r"(r0) // %5 + : "0"(nn), + "1"(outptr0), + "2"(outptr1), + "3"(outptr2), + "4"(outptr3), + "5"(r0), + "w"(_k0), // %12 + "w"(_k1), // %13 + "w"(_k2), // %14 + "w"(_k3) // %15 + : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -1202,43 +1212,56 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke float32x4_t _k2 = vdupq_n_f32(k2); float32x4_t _k3 = vdupq_n_f32(k3); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _p = vld1q_f32(r0); - float32x4_t _pn = vld1q_f32(r0+4); - - float32x4_t _outp = vld1q_f32(outptr); - float32x4_t _outpn = vld1q_f32(outptr+4); - - _outp = vfmaq_f32(_outp, _p, _k0); - _outpn = vfmaq_f32(_outpn, _pn, _k0); - - float32x4_t _p1 = vld1q_f32(r1); - float32x4_t _p1n = vld1q_f32(r1+4); - - _outp = vfmaq_f32(_outp, _p1, _k1); - _outpn = vfmaq_f32(_outpn, _p1n, _k1); - - float32x4_t _p2 = vld1q_f32(r2); - float32x4_t _p2n = vld1q_f32(r2+4); - - _outp = vfmaq_f32(_outp, _p2, _k2); - _outpn = vfmaq_f32(_outpn, _p2n, _k2); - - float32x4_t _p3 = vld1q_f32(r3); - float32x4_t _p3n = vld1q_f32(r3+4); - - _outp = vfmaq_f32(_outp, _p3, _k3); - _outpn = vfmaq_f32(_outpn, _p3n, _k3); - - vst1q_f32(outptr, _outp); - vst1q_f32(outptr+4, _outpn); - - r0 += 8; - r1 += 8; - r2 += 8; - r3 += 8; - outptr += 8; + asm volatile( + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v2.4s, v3.4s}, [%2], #32 \n" + "0: \n" + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1] \n" + "fmla v0.4s, v2.4s, %12.4s \n" + "fmla v1.4s, v3.4s, %12.4s \n" + + "prfm pldl1keep, [%3, #256] \n" + "ld1 {v2.4s, v3.4s}, [%3], #32 \n" + "fmla v0.4s, v2.4s, %13.4s \n" + "fmla v1.4s, v3.4s, %13.4s \n" + + "prfm pldl1keep, [%4, #256] \n" + "ld1 {v2.4s, v3.4s}, [%4], #32 \n" + "fmla v0.4s, v2.4s, %14.4s \n" + "fmla v1.4s, v3.4s, %14.4s \n" + + "prfm pldl1keep, [%5, #256] \n" + "ld1 {v2.4s, v3.4s}, [%5], #32 \n" + "fmla v0.4s, v2.4s, %15.4s \n" + "fmla v1.4s, v3.4s, %15.4s \n" + + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v2.4s, v3.4s}, [%2], #32 \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s, v1.4s}, [%1], #32 \n" + "bne 0b \n" + "sub %2, %2, #32 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3) // %5 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "w"(_k0), // %12 + "w"(_k1), // %13 + "w"(_k2), // %14 + "w"(_k3) // %15 + : "cc", "memory", "v0", "v1", "v2", "v3" + ); } #else if (nn > 0) @@ -1331,22 +1354,31 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON float32x4_t _k0 = vdupq_n_f32(k0); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _p = vld1q_f32(r0); - float32x4_t _outp = vld1q_f32(outptr); - - float32x4_t _pn = vld1q_f32(r0+4); - float32x4_t _outpn = vld1q_f32(outptr+4); - - _outp = vfmaq_f32(_outp, _p, _k0); - _outpn = vfmaq_f32(_outpn, _pn, _k0); - - vst1q_f32(outptr, _outp); - vst1q_f32(outptr+4, _outpn); - - r0 += 8; - outptr += 8; + asm volatile( + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v2.4s, v3.4s}, [%2], #32 \n" + "0: \n" + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1] \n" + "fmla v0.4s, v2.4s, %6.4s \n" + "fmla v1.4s, v3.4s, %6.4s \n" + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v2.4s, v3.4s}, [%2], #32 \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s, v1.4s}, [%1], #32 \n" + "bne 0b \n" + "sub %2, %2, #32 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0) // %2 + : "0"(nn), + "1"(outptr), + "2"(r0), + "w"(_k0) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" + ); } #else if (nn > 0) @@ -1470,108 +1502,128 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke float32x4_t _k2 = vld1q_f32(kernel2); float32x4_t _k3 = vld1q_f32(kernel3); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4x2_t _px2 = vld2q_f32(r0); - float32x4_t _p = _px2.val[0]; - float32x4x2_t _pnx2 = vld2q_f32(r0+8); - float32x4_t _pn = _pnx2.val[0]; + asm volatile( + "0: \n" + + "prfm pldl1keep, [%5, #512] \n" + "ld2 {v4.4s, v5.4s}, [%5], #32 \n" + "ld2 {v6.4s, v7.4s}, [%5], #32 \n" + "and v5.16b, v6.16b, v6.16b \n"// v4 v5 - float32x4_t _out0p = vld1q_f32(outptr0); - float32x4_t _out0pn = vld1q_f32(outptr0+4); + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v8.4s, v9.4s}, [%1] \n" - float32x4_t _out1p = vld1q_f32(outptr1); - float32x4_t _out1pn = vld1q_f32(outptr1+4); + "fmla v8.4s, v4.4s, %18.s[0] \n" + "fmla v9.4s, v5.4s, %18.s[0] \n" - float32x4_t _out2p = vld1q_f32(outptr2); - float32x4_t _out2pn = vld1q_f32(outptr2+4); + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v10.4s, v11.4s}, [%2] \n" - float32x4_t _out3p = vld1q_f32(outptr3); - float32x4_t _out3pn = vld1q_f32(outptr3+4); + "fmla v10.4s, v4.4s, %19.s[0] \n" + "fmla v11.4s, v5.4s, %19.s[0] \n" - _out0p = vfmaq_laneq_f32(_out0p, _p, _k0, 0); - _out0pn = vfmaq_laneq_f32(_out0pn, _pn, _k0, 0); + "prfm pldl1keep, [%3, #256] \n" + "ld1 {v12.4s, v13.4s}, [%3] \n" - _out1p = vfmaq_laneq_f32(_out1p, _p, _k1, 0); - _out1pn = vfmaq_laneq_f32(_out1pn, _pn, _k1, 0); + "fmla v12.4s, v4.4s, %20.s[0] \n" + "fmla v13.4s, v5.4s, %20.s[0] \n" - _out2p = vfmaq_laneq_f32(_out2p, _p, _k2, 0); - _out2pn = vfmaq_laneq_f32(_out2pn, _pn, _k2, 0); + "prfm pldl1keep, [%4, #256] \n" + "ld1 {v14.4s, v15.4s}, [%4] \n" - _out3p = vfmaq_laneq_f32(_out3p, _p, _k3, 0); - _out3pn = vfmaq_laneq_f32(_out3pn, _pn, _k3, 0); + "prfm pldl1keep, [%6, #512] \n" + "ld2 {v6.4s, v7.4s}, [%6], #32 \n" - float32x4x2_t _p1x2 = vld2q_f32(r1); - float32x4_t _p1 = _p1x2.val[0]; - float32x4x2_t _p1nx2 = vld2q_f32(r1+8); - float32x4_t _p1n = _p1nx2.val[0]; + "fmla v14.4s, v4.4s, %21.s[0] \n" + "fmla v15.4s, v5.4s, %21.s[0] \n" - _out0p = vfmaq_laneq_f32(_out0p, _p1, _k0, 1); - _out0pn = vfmaq_laneq_f32(_out0pn, _p1n, _k0, 1); + "ld2 {v4.4s, v5.4s}, [%6], #32 \n" + "and v7.16b, v4.16b, v4.16b \n"// v6 v7 - _out1p = vfmaq_laneq_f32(_out1p, _p1, _k1, 1); - _out1pn = vfmaq_laneq_f32(_out1pn, _p1n, _k1, 1); + "fmla v8.4s, v6.4s, %18.s[1] \n" + "fmla v9.4s, v7.4s, %18.s[1] \n" - _out2p = vfmaq_laneq_f32(_out2p, _p1, _k2, 1); - _out2pn = vfmaq_laneq_f32(_out2pn, _p1n, _k2, 1); + "fmla v10.4s, v6.4s, %19.s[1] \n" + "fmla v11.4s, v7.4s, %19.s[1] \n" - _out3p = vfmaq_laneq_f32(_out3p, _p1, _k3, 1); - _out3pn = vfmaq_laneq_f32(_out3pn, _p1n, _k3, 1); + "fmla v12.4s, v6.4s, %20.s[1] \n" + "fmla v13.4s, v7.4s, %20.s[1] \n" - float32x4x2_t _p2x2 = vld2q_f32(r2); - float32x4_t _p2 = _p2x2.val[0]; - float32x4x2_t _p2nx2 = vld2q_f32(r2+8); - float32x4_t _p2n = _p2nx2.val[0]; + "prfm pldl1keep, [%7, #512] \n" + "ld2 {v4.4s, v5.4s}, [%7], #32 \n" - _out0p = vfmaq_laneq_f32(_out0p, _p2, _k0, 2); - _out0pn = vfmaq_laneq_f32(_out0pn, _p2n, _k0, 2); + "fmla v14.4s, v6.4s, %21.s[1] \n" + "fmla v15.4s, v7.4s, %21.s[1] \n" - _out1p = vfmaq_laneq_f32(_out1p, _p2, _k1, 2); - _out1pn = vfmaq_laneq_f32(_out1pn, _p2n, _k1, 2); + "ld2 {v6.4s, v7.4s}, [%7], #32 \n" + "and v5.16b, v6.16b, v6.16b \n"// v4 v5 - _out2p = vfmaq_laneq_f32(_out2p, _p2, _k2, 2); - _out2pn = vfmaq_laneq_f32(_out2pn, _p2n, _k2, 2); + "fmla v8.4s, v4.4s, %18.s[2] \n" + "fmla v9.4s, v5.4s, %18.s[2] \n" - _out3p = vfmaq_laneq_f32(_out3p, _p2, _k3, 2); - _out3pn = vfmaq_laneq_f32(_out3pn, _p2n, _k3, 2); + "fmla v10.4s, v4.4s, %19.s[2] \n" + "fmla v11.4s, v5.4s, %19.s[2] \n" - float32x4x2_t _p3x2 = vld2q_f32(r3); - float32x4_t _p3 = _p3x2.val[0]; - float32x4x2_t _p3nx2 = vld2q_f32(r3+8); - float32x4_t _p3n = _p3nx2.val[0]; + "fmla v12.4s, v4.4s, %20.s[2] \n" + "fmla v13.4s, v5.4s, %20.s[2] \n" - _out0p = vfmaq_laneq_f32(_out0p, _p3, _k0, 3); - _out0pn = vfmaq_laneq_f32(_out0pn, _p3n, _k0, 3); + "prfm pldl1keep, [%8, #512] \n" + "ld2 {v6.4s, v7.4s}, [%8], #32 \n" - _out1p = vfmaq_laneq_f32(_out1p, _p3, _k1, 3); - _out1pn = vfmaq_laneq_f32(_out1pn, _p3n, _k1, 3); + "fmla v14.4s, v4.4s, %21.s[2] \n" + "fmla v15.4s, v5.4s, %21.s[2] \n" - _out2p = vfmaq_laneq_f32(_out2p, _p3, _k2, 3); - _out2pn = vfmaq_laneq_f32(_out2pn, _p3n, _k2, 3); + "ld2 {v4.4s, v5.4s}, [%8], #32 \n" + "and v7.16b, v4.16b, v4.16b \n"// v6 v7 - _out3p = vfmaq_laneq_f32(_out3p, _p3, _k3, 3); - _out3pn = vfmaq_laneq_f32(_out3pn, _p3n, _k3, 3); + "fmla v8.4s, v6.4s, %18.s[3] \n" + "fmla v9.4s, v7.4s, %18.s[3] \n" - vst1q_f32(outptr0, _out0p); - vst1q_f32(outptr0+4, _out0pn); + "fmla v10.4s, v6.4s, %19.s[3] \n" + "fmla v11.4s, v7.4s, %19.s[3] \n" - vst1q_f32(outptr1, _out1p); - vst1q_f32(outptr1+4, _out1pn); + "st1 {v8.4s, v9.4s}, [%1], #32 \n" - vst1q_f32(outptr2, _out2p); - vst1q_f32(outptr2+4, _out2pn); + "fmla v12.4s, v6.4s, %20.s[3] \n" + "fmla v13.4s, v7.4s, %20.s[3] \n" - vst1q_f32(outptr3, _out3p); - vst1q_f32(outptr3+4, _out3pn); + "st1 {v10.4s, v11.4s}, [%2], #32 \n" - r0 += 16; - r1 += 16; - r2 += 16; - r3 += 16; - outptr0 += 8; - outptr1 += 8; - outptr2 += 8; - outptr3 += 8; + "fmla v14.4s, v6.4s, %21.s[3] \n" + "fmla v15.4s, v7.4s, %21.s[3] \n" + + "st1 {v12.4s, v13.4s}, [%3], #32 \n" + + "subs %w0, %w0, #1 \n" + "st1 {v14.4s, v15.4s}, [%4], #32 \n" + + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(outptr0),// %1 + "=r"(outptr1),// %2 + "=r"(outptr2),// %3 + "=r"(outptr3),// %4 + "=r"(r0), // %5 + "=r"(r1), // %6 + "=r"(r2), // %7 + "=r"(r3) // %8 + : "0"(nn), + "1"(outptr0), + "2"(outptr1), + "3"(outptr2), + "4"(outptr3), + "5"(r0), + "6"(r1), + "7"(r2), + "8"(r3), + "w"(_k0), // %18 + "w"(_k1), // %19 + "w"(_k2), // %20 + "w"(_k3) // %21 + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -1767,54 +1819,67 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke float32x4_t _k2 = vdupq_n_f32(k2); float32x4_t _k3 = vdupq_n_f32(k3); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4x2_t _px2 = vld2q_f32(r0); - float32x4_t _p = _px2.val[0]; - float32x4x2_t _pnx2 = vld2q_f32(r0+8); - float32x4_t _pn = _pnx2.val[0]; - - float32x4_t _out0p = vld1q_f32(outptr0); - float32x4_t _out0pn = vld1q_f32(outptr0+4); - - float32x4_t _out1p = vld1q_f32(outptr1); - float32x4_t _out1pn = vld1q_f32(outptr1+4); - - float32x4_t _out2p = vld1q_f32(outptr2); - float32x4_t _out2pn = vld1q_f32(outptr2+4); + asm volatile( + "0: \n" - float32x4_t _out3p = vld1q_f32(outptr3); - float32x4_t _out3pn = vld1q_f32(outptr3+4); + "prfm pldl1keep, [%5, #512] \n" + "ld2 {v4.4s, v5.4s}, [%5], #32 \n" + "ld2 {v6.4s, v7.4s}, [%5], #32 \n" + "and v5.16b, v6.16b, v6.16b \n" + + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v8.4s, v9.4s}, [%1] \n" - _out0p = vfmaq_f32(_out0p, _p, _k0); - _out0pn = vfmaq_f32(_out0pn, _pn, _k0); + "fmla v8.4s, v4.4s, %12.4s \n" + "fmla v9.4s, v5.4s, %12.4s \n" - _out1p = vfmaq_f32(_out1p, _p, _k1); - _out1pn = vfmaq_f32(_out1pn, _pn, _k1); + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v10.4s, v11.4s}, [%2] \n" - _out2p = vfmaq_f32(_out2p, _p, _k2); - _out2pn = vfmaq_f32(_out2pn, _pn, _k2); + "fmla v10.4s, v4.4s, %13.4s \n" + "fmla v11.4s, v5.4s, %13.4s \n" - _out3p = vfmaq_f32(_out3p, _p, _k3); - _out3pn = vfmaq_f32(_out3pn, _pn, _k3); + "prfm pldl1keep, [%3, #256] \n" + "ld1 {v12.4s, v13.4s}, [%3] \n" - vst1q_f32(outptr0, _out0p); - vst1q_f32(outptr0+4, _out0pn); + "st1 {v8.4s, v9.4s}, [%1], #32 \n" - vst1q_f32(outptr1, _out1p); - vst1q_f32(outptr1+4, _out1pn); + "fmla v12.4s, v4.4s, %14.4s \n" + "fmla v13.4s, v5.4s, %14.4s \n" - vst1q_f32(outptr2, _out2p); - vst1q_f32(outptr2+4, _out2pn); + "prfm pldl1keep, [%4, #256] \n" + "ld1 {v14.4s, v15.4s}, [%4] \n" - vst1q_f32(outptr3, _out3p); - vst1q_f32(outptr3+4, _out3pn); + "st1 {v10.4s, v11.4s}, [%2], #32 \n" + + "fmla v14.4s, v4.4s, %15.4s \n" + "fmla v15.4s, v5.4s, %15.4s \n" - r0 += 16; - outptr0 += 8; - outptr1 += 8; - outptr2 += 8; - outptr3 += 8; + "st1 {v12.4s, v13.4s}, [%3], #32 \n" + "subs %w0, %w0, #1 \n" + + "st1 {v14.4s, v15.4s}, [%4], #32 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(outptr0),// %1 + "=r"(outptr1),// %2 + "=r"(outptr2),// %3 + "=r"(outptr3),// %4 + "=r"(r0) // %5 + : "0"(nn), + "1"(outptr0), + "2"(outptr1), + "3"(outptr2), + "4"(outptr3), + "5"(r0), + "w"(_k0), // %12 + "w"(_k1), // %13 + "w"(_k2), // %14 + "w"(_k3) // %15 + : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -1951,51 +2016,63 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke float32x4_t _k2 = vdupq_n_f32(k2); float32x4_t _k3 = vdupq_n_f32(k3); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4x2_t _px2 = vld2q_f32(r0); - float32x4_t _p = _px2.val[0]; - float32x4_t _outp = vld1q_f32(outptr); - - float32x4x2_t _pnx2 = vld2q_f32(r0+8); - float32x4_t _pn = _pnx2.val[0]; - float32x4_t _outpn = vld1q_f32(outptr+4); - - _outp = vmlaq_f32(_outp, _p, _k0); - _outpn = vmlaq_f32(_outpn, _pn, _k0); - - float32x4x2_t _p1x2 = vld2q_f32(r1); - float32x4_t _p1 = _p1x2.val[0]; - float32x4x2_t _p1nx2 = vld2q_f32(r1+8); - float32x4_t _p1n = _p1nx2.val[0]; - - _outp = vmlaq_f32(_outp, _p1, _k1); - _outpn = vmlaq_f32(_outpn, _p1n, _k1); - - float32x4x2_t _p2x2 = vld2q_f32(r2); - float32x4_t _p2 = _p2x2.val[0]; - float32x4x2_t _p2nx2 = vld2q_f32(r2+8); - float32x4_t _p2n = _p2nx2.val[0]; - - _outp = vmlaq_f32(_outp, _p2, _k2); - _outpn = vmlaq_f32(_outpn, _p2n, _k2); - - float32x4x2_t _p3x2 = vld2q_f32(r3); - float32x4_t _p3 = _p3x2.val[0]; - float32x4x2_t _p3nx2 = vld2q_f32(r3+8); - float32x4_t _p3n = _p3nx2.val[0]; - - _outp = vmlaq_f32(_outp, _p3, _k3); - _outpn = vmlaq_f32(_outpn, _p3n, _k3); - - vst1q_f32(outptr, _outp); - vst1q_f32(outptr+4, _outpn); - - r0 += 16; - r1 += 16; - r2 += 16; - r3 += 16; - outptr += 8; + asm volatile( + "prfm pldl1keep, [%2, #512] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + "ld2 {v8.4s, v9.4s}, [%2], #32 \n" + "0: \n" + + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1] \n" + "fmla v0.4s, v2.4s, %12.4s \n" + "fmla v1.4s, v8.4s, %12.4s \n" + + "prfm pldl1keep, [%3, #512] \n" + "ld2 {v2.4s, v3.4s}, [%3], #32 \n" + "ld2 {v8.4s, v9.4s}, [%3], #32 \n" + "fmla v0.4s, v2.4s, %13.4s \n" + "fmla v1.4s, v8.4s, %13.4s \n" + + "prfm pldl1keep, [%4, #512] \n" + "ld2 {v2.4s, v3.4s}, [%4], #32 \n" + "ld2 {v8.4s, v9.4s}, [%4], #32 \n" + "fmla v0.4s, v2.4s, %14.4s \n" + "fmla v1.4s, v8.4s, %14.4s \n" + + "prfm pldl1keep, [%5, #512] \n" + "ld2 {v2.4s, v3.4s}, [%5], #32 \n" + "ld2 {v8.4s, v9.4s}, [%5], #32 \n" + "fmla v0.4s, v2.4s, %15.4s \n" + "fmla v1.4s, v8.4s, %15.4s \n" + + "prfm pldl1keep, [%2, #512] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + "ld2 {v8.4s, v9.4s}, [%2], #32 \n" + + "subs %w0, %w0, #1 \n" + "st1 {v0.4s, v1.4s}, [%1], #32 \n" + "bne 0b \n" + "sub %2, %2, #64 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3) // %5 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "w"(_k0), // %12 + "w"(_k1), // %13 + "w"(_k2), // %14 + "w"(_k3) // %15 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9" + ); } #else if (nn > 0) @@ -2099,24 +2176,37 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON float32x4_t _k0 = vdupq_n_f32(k0); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4x2_t _px2 = vld2q_f32(r0); - float32x4_t _p = _px2.val[0]; - float32x4_t _outp = vld1q_f32(outptr); + asm volatile( + "prfm pldl1keep, [%2, #512] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + "ld2 {v8.4s, v9.4s}, [%2], #32 \n" - float32x4x2_t _pnx2 = vld2q_f32(r0+8); - float32x4_t _pn = _pnx2.val[0]; - float32x4_t _outpn = vld1q_f32(outptr+4); + "0: \n" - _outp = vmlaq_f32(_outp, _p, _k0); - _outpn = vmlaq_f32(_outpn, _pn, _k0); + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1] \n" + "fmla v0.4s, v2.4s, %6.4s \n" + "fmla v1.4s, v8.4s, %6.4s \n" - vst1q_f32(outptr, _outp); - vst1q_f32(outptr+4, _outpn); + "prfm pldl1keep, [%2, #512] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + "ld2 {v8.4s, v9.4s}, [%2], #32 \n" - r0 += 16; - outptr += 8; + "subs %w0, %w0, #1 \n" + "st1 {v0.4s, v1.4s}, [%1], #32 \n" + "bne 0b \n" + "sub %2, %2, #64 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0) // %2 + : "0"(nn), + "1"(outptr), + "2"(r0), + "w"(_k0) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9" + ); } #else if (nn > 0) diff --git a/src/layer/arm/convolution_2x2.h b/src/layer/arm/convolution_2x2.h index 0c52eb712..24cc7c4d9 100644 --- a/src/layer/arm/convolution_2x2.h +++ b/src/layer/arm/convolution_2x2.h @@ -71,37 +71,83 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _r000 = vld1q_f32(r00); - float32x4_t _r010 = vld1q_f32(r01); - float32x4_t _r001 = vld1q_f32(r00 + 1); - float32x4_t _r011 = vld1q_f32(r01 + 1); - - float32x4_t _r100 = vld1q_f32(r10); - float32x4_t _r110 = vld1q_f32(r11); - float32x4_t _r101 = vld1q_f32(r10 + 1); - float32x4_t _r111 = vld1q_f32(r11 + 1); - - float32x4_t _sum = vld1q_f32(outptr); - - _sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0); - _sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1); - _sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0); - _sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1); - - _sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0); - _sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1); - _sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0); - _sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1); - - vst1q_f32(outptr, _sum); - - r00 += 4; - r01 += 4; - r10 += 4; - r11 += 4; - outptr += 4; + asm volatile( + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v2.4s}, [%2], #16 \n" + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v12.4s}, [%3], #16 \n" + "prfm pldl1keep, [%4, #128] \n" + "ld1 {v14.4s}, [%4], #16 \n" + + "0: \n" + "prfm pldl1keep, [%5, #128] \n" + "ld1 {v9.4s}, [%5] \n" + + "fmul v8.4s, v0.4s, %12.s[0] \n" + "fmla v9.4s, v2.4s, %12.s[2] \n" + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v1.4s}, [%1], #16 \n" + + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v3.4s}, [%2], #16 \n" + + "ext v10.16b, v0.16b, v1.16b, #4 \n" + "ext v11.16b, v2.16b, v3.16b, #4 \n" + + "fmla v8.4s, v12.4s, %13.s[0] \n" + "fmla v9.4s, v14.4s, %13.s[2] \n" + + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v13.4s}, [%3], #16 \n" + + "prfm pldl1keep, [%4, #128] \n" + "ld1 {v15.4s}, [%4], #16 \n" + + "fmla v8.4s, v10.4s, %12.s[1] \n" + "fmla v9.4s, v11.4s, %12.s[3] \n" + + "ext v10.16b, v12.16b, v13.16b, #4 \n" + "ext v11.16b, v14.16b, v15.16b, #4 \n" + + "fmla v8.4s, v10.4s, %13.s[1] \n" + "fmla v9.4s, v11.4s, %13.s[3] \n" + + "orr v0.16b, v1.16b, v1.16b \n" + "orr v2.16b, v3.16b, v3.16b \n" + + "fadd v8.4s, v8.4s, v9.4s \n" + + "orr v12.16b, v13.16b, v13.16b \n" + "orr v14.16b, v15.16b, v15.16b \n" + + "subs %w0, %w0, #1 \n" + "st1 {v8.4s}, [%5], #16 \n" + "bne 0b \n" + "sub %1, %1, #16 \n" + "sub %2, %2, #16 \n" + "sub %3, %3, #16 \n" + "sub %4, %4, #16 \n" + : "=r"(nn), // %0 + "=r"(r00), // %1 + "=r"(r01), // %2 + "=r"(r10), // %3 + "=r"(r11), // %4 + "=r"(outptr) // %5 + : "0"(nn), + "1"(r00), + "2"(r01), + "3"(r10), + "4"(r11), + "5"(outptr), + "w"(_k0), // %12 + "w"(_k1) // %13 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -263,28 +309,56 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r01 = vld1q_f32(r0 + 1); - float32x4_t _r11 = vld1q_f32(r1 + 1); + asm volatile( + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v2.4s}, [%2], #16 \n" + + "0: \n" + "prfm pldl1keep, [%3, #128] \n" + "ld1 {v9.4s}, [%3] \n" - float32x4_t _sum = vld1q_f32(outptr); - float32x4_t _sum2; + "fmul v8.4s, v0.4s, %8.4s \n" + "fmla v9.4s, v2.4s, %10.4s \n" - _sum = vmlaq_f32(_sum, _r00, _k0); - _sum2 = vmulq_f32(_r01, _k1); - _sum = vmlaq_f32(_sum, _r10, _k2); - _sum2 = vmlaq_f32(_sum2, _r11, _k3); + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v1.4s}, [%1], #16 \n" + "ext v10.16b, v0.16b, v1.16b, #4 \n" - _sum = vaddq_f32(_sum, _sum2); + "fmla v8.4s, v10.4s, %9.4s \n" - vst1q_f32(outptr, _sum); + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v3.4s}, [%2], #16 \n" + "ext v11.16b, v2.16b, v3.16b, #4 \n" - r0 += 4; - r1 += 4; - outptr += 4; + "fmla v9.4s, v11.4s, %11.4s \n" + + "orr v0.16b, v1.16b, v1.16b \n" + "fadd v8.4s, v8.4s, v9.4s \n" + "orr v2.16b, v3.16b, v3.16b \n" + + "subs %w0, %w0, #1 \n" + "st1 {v8.4s}, [%3], #16 \n" + "bne 0b \n" + "sub %1, %1, #16 \n" + "sub %2, %2, #16 \n" + : "=r"(nn), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(outptr) // %3 + : "0"(nn), + "1"(r0), + "2"(r1), + "3"(outptr), + "w"(_k0), // %8 + "w"(_k1), // %9 + "w"(_k2), // %10 + "w"(_k3) // %11 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11" + ); } #else if (nn > 0) diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h index f1bc2d6e7..75c0c897a 100644 --- a/src/layer/arm/convolution_3x3.h +++ b/src/layer/arm/convolution_3x3.h @@ -6858,49 +6858,74 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _outp = vld1q_f32(outptr); + asm volatile( + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + "0: \n" + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" - float32x4x2_t _r0 = vld2q_f32(r0); - float32x4x2_t _r0n = vld2q_f32(r0+8); + "fmla v0.4s, v2.4s, %10.s[0] \n" + "fmul v10.4s, v3.4s, %10.s[1] \n" - float32x4_t _r00 = _r0.val[0];// 0 2 4 6 - float32x4_t _r01 = _r0.val[1];// 1 3 5 7 - float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8 + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v8.4s, v9.4s}, [%2] \n" + "ext v1.16b, v2.16b, v8.16b, #4 \n" - _outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0); - _outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1); - _outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2); + "fmul v11.4s, v1.4s, %10.s[2] \n" - float32x4x2_t _r1 = vld2q_f32(r1); - float32x4x2_t _r1n = vld2q_f32(r1+8); + "prfm pldl1keep, [%3, #256] \n" + "ld2 {v2.4s, v3.4s}, [%3], #32 \n" - float32x4_t _r10 = _r1.val[0]; - float32x4_t _r11 = _r1.val[1]; - float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1); + "fmla v0.4s, v2.4s, %11.s[0] \n" + "fmla v10.4s, v3.4s, %11.s[1] \n" - _outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0); - _outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1); - _outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2); + "prfm pldl1keep, [%3, #256] \n" + "ld2 {v8.4s, v9.4s}, [%3] \n" + "ext v1.16b, v2.16b, v8.16b, #4 \n" - float32x4x2_t _r2 = vld2q_f32(r2); - float32x4x2_t _r2n = vld2q_f32(r2+8); + "fmla v11.4s, v1.4s, %11.s[2] \n" - float32x4_t _r20 = _r2.val[0]; - float32x4_t _r21 = _r2.val[1]; - float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1); + "prfm pldl1keep, [%4, #256] \n" + "ld2 {v2.4s, v3.4s}, [%4], #32 \n" - _outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0); - _outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1); - _outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2); + "fmla v0.4s, v2.4s, %12.s[0] \n" + "fmla v10.4s, v3.4s, %12.s[1] \n" - vst1q_f32(outptr, _outp); + "prfm pldl1keep, [%4, #256] \n" + "ld2 {v8.4s, v9.4s}, [%4] \n" + "ext v1.16b, v2.16b, v8.16b, #4 \n" - r0 += 8; - r1 += 8; - r2 += 8; - outptr += 4; + "fmla v11.4s, v1.4s, %12.s[2] \n" + + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + + "fadd v0.4s, v0.4s, v10.4s \n" + "fadd v0.4s, v0.4s, v11.4s \n" + + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + "sub %2, %2, #32 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2) // %4 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "w"(_k0123), // %10 + "w"(_k3456), // %11 + "w"(_k6789) // %12 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) diff --git a/src/layer/arm/convolution_4x4.h b/src/layer/arm/convolution_4x4.h index 8b058d92c..3054faffd 100644 --- a/src/layer/arm/convolution_4x4.h +++ b/src/layer/arm/convolution_4x4.h @@ -75,63 +75,107 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r30 = vld1q_f32(r3); - - float32x4_t _r01 = vld1q_f32(r0 + 4); - float32x4_t _r11 = vld1q_f32(r1 + 4); - float32x4_t _r21 = vld1q_f32(r2 + 4); - float32x4_t _r31 = vld1q_f32(r3 + 4); - - float32x4_t _r02 = vld1q_f32(r0 + 8); - float32x4_t _r12 = vld1q_f32(r1 + 8); - float32x4_t _r22 = vld1q_f32(r2 + 8); - float32x4_t _r32 = vld1q_f32(r3 + 8); - - float32x4_t _r03 = vld1q_f32(r0 + 12); - float32x4_t _r13 = vld1q_f32(r1 + 12); - float32x4_t _r23 = vld1q_f32(r2 + 12); - float32x4_t _r33 = vld1q_f32(r3 + 12); - - float32x4_t _sum0 = vmulq_f32(_r00, _k0123); - float32x4_t _sum1 = vmulq_f32(_r01, _k0123); - float32x4_t _sum2 = vmulq_f32(_r02, _k0123); - float32x4_t _sum3 = vmulq_f32(_r03, _k0123); - - _sum0 = vfmaq_f32(_sum0, _r10, _k4567); - _sum1 = vfmaq_f32(_sum1, _r11, _k4567); - _sum2 = vfmaq_f32(_sum2, _r12, _k4567); - _sum3 = vfmaq_f32(_sum3, _r13, _k4567); - - _sum0 = vfmaq_f32(_sum0, _r20, _k891011); - _sum1 = vfmaq_f32(_sum1, _r21, _k891011); - _sum2 = vfmaq_f32(_sum2, _r22, _k891011); - _sum3 = vfmaq_f32(_sum3, _r23, _k891011); - - _sum0 = vfmaq_f32(_sum0, _r30, _k12131415); - _sum1 = vfmaq_f32(_sum1, _r31, _k12131415); - _sum2 = vfmaq_f32(_sum2, _r32, _k12131415); - _sum3 = vfmaq_f32(_sum3, _r33, _k12131415); - - float32x4_t _s01 = vpaddq_f32(_sum0, _sum1); - float32x4_t _s23 = vpaddq_f32(_sum2, _sum3); - float32x4_t _sum = vpaddq_f32(_s01, _s23); - - float32x4_t _outp = vld1q_f32(outptr); - - _outp = vaddq_f32(_outp, _sum); - - vst1q_f32(outptr, _outp); - - r0 += 16; - r1 += 16; - r2 += 16; - r3 += 16; - outptr += 4; + asm volatile( + "prfm pldl1keep, [%1, #128] \n" + "0: \n" + + "prfm pldl1keep, [%2, #512] \n" + "prfm pldl1keep, [%3, #512] \n" + + "ld1 {v7.4s}, [%1] \n" // v7 = outptr + + "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 + "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 + + "prfm pldl1keep, [%4, #512] \n" + "prfm pldl1keep, [%5, #512] \n" + + "fmul v12.4s, v8.4s, %12.4s \n" + "fmul v13.4s, v9.4s, %13.4s \n" + + "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 + "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 + + "fmla v12.4s, v10.4s, %14.4s \n" + "fmla v13.4s, v11.4s, %15.4s \n" + + "fadd v5.4s, v12.4s, v13.4s \n" + + "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 + "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 + + "fmul v12.4s, v8.4s, %12.4s \n" + "fmul v13.4s, v9.4s, %13.4s \n" + + "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 + "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 + + "fmla v12.4s, v10.4s, %14.4s \n" + "fmla v13.4s, v11.4s, %15.4s \n" + + "fadd v6.4s, v12.4s, v13.4s \n" + + "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 + "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 + + "fmul v12.4s, v8.4s, %12.4s \n" + "fmul v13.4s, v9.4s, %13.4s \n" + + "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 + "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 + + "fmla v12.4s, v10.4s, %14.4s \n" + "fmla v13.4s, v11.4s, %15.4s \n" + + "fadd v14.4s, v12.4s, v13.4s \n" + "faddp v5.4s, v5.4s, v6.4s \n" // Move to here to enhance ILP + + "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 + "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 + + "fmul v12.4s, v8.4s, %12.4s \n" + "fmul v13.4s, v9.4s, %13.4s \n" + + "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 + "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 + + "fmla v12.4s, v10.4s, %14.4s \n" + "fmla v13.4s, v11.4s, %15.4s \n" + + "fadd v15.4s, v12.4s, v13.4s \n" + +// "faddp v5.4s , v5.4s, v6.4s \n" // Move this line upward. + "faddp v14.4s, v14.4s, v15.4s \n" + "faddp v5.4s , v5.4s, v14.4s \n" + + "fadd v7.4s, v7.4s, v5.4s \n" + + "st1 {v7.4s}, [%1], #16 \n" + + "prfm pldl1keep, [%1, #128] \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3) // %5 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "w"(_k0123), // %12 + "w"(_k4567), // %13 + "w"(_k891011), // %14 + "w"(_k12131415) // %15 + : "cc", "memory", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -247,22 +291,42 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke { #if __ARM_NEON #if __aarch64__ - float32x4_t _r0 = vld1q_f32(r0); - float32x4_t _r1 = vld1q_f32(r1); - float32x4_t _r2 = vld1q_f32(r2); - float32x4_t _r3 = vld1q_f32(r3); + float sum = 0.f; - float32x4_t _sum = vmulq_f32(_r0, _k0123); - _sum = vmlaq_f32(_sum, _r1, _k4567); - _sum = vmlaq_f32(_sum, _r2, _k891011); - _sum = vmlaq_f32(_sum, _r3, _k12131415); + asm volatile( + "ld1 {v8.4s}, [%0], #16 \n"// v8 = r0 + "ld1 {v9.4s}, [%1], #16 \n"// v9 = r1 - *outptr += vaddvq_f32(_sum); + "fmul v12.4s, v8.4s, %9.4s \n" + "fmul v13.4s, v9.4s, %10.4s \n" - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; + "ld1 {v10.4s}, [%2], #16 \n"// v10 = r2 + "ld1 {v11.4s}, [%3], #16 \n"// v11 = r3 + + "fmla v12.4s, v10.4s, %11.4s \n" + "fmla v13.4s, v11.4s, %12.4s \n" + + "fadd v5.4s, v12.4s, v13.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp s5, v5.2s \n" + "fmov %w4, s5 \n" + : "=r"(r0), // %0 + "=r"(r1), // %1 + "=r"(r2), // %2 + "=r"(r3), // %3 + "=r"(sum) // %4 + : "0"(r0), + "1"(r1), + "2"(r2), + "3"(r3), + "w"(_k0123), // %9 + "w"(_k4567), // %10 + "w"(_k891011), // %11 + "w"(_k12131415) // %12 + : "cc", "memory", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13" + ); + + *outptr += sum; #else float sum = 0.f; diff --git a/src/layer/arm/convolution_5x5.h b/src/layer/arm/convolution_5x5.h index fac3c988f..4928d0d43 100644 --- a/src/layer/arm/convolution_5x5.h +++ b/src/layer/arm/convolution_5x5.h @@ -83,118 +83,193 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _sum = vld1q_f32(outptr); - float32x4_t _sum2 = vld1q_f32(outptr2); - - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r04 = vld1q_f32(r0 + 4); - float32x4_t _r01 = vextq_f32(_r00, _r04, 1); - float32x4_t _r02 = vextq_f32(_r00, _r04, 2); - float32x4_t _r03 = vextq_f32(_r00, _r04, 3); - - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r14 = vld1q_f32(r1 + 4); - float32x4_t _r11 = vextq_f32(_r10, _r14, 1); - float32x4_t _r12 = vextq_f32(_r10, _r14, 2); - float32x4_t _r13 = vextq_f32(_r10, _r14, 3); - - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r24 = vld1q_f32(r2 + 4); - float32x4_t _r21 = vextq_f32(_r20, _r24, 1); - float32x4_t _r22 = vextq_f32(_r20, _r24, 2); - float32x4_t _r23 = vextq_f32(_r20, _r24, 3); - - float32x4_t _r30 = vld1q_f32(r3); - float32x4_t _r34 = vld1q_f32(r3 + 4); - float32x4_t _r31 = vextq_f32(_r30, _r34, 1); - float32x4_t _r32 = vextq_f32(_r30, _r34, 2); - float32x4_t _r33 = vextq_f32(_r30, _r34, 3); - - float32x4_t _r40 = vld1q_f32(r4); - float32x4_t _r44 = vld1q_f32(r4 + 4); - float32x4_t _r41 = vextq_f32(_r40, _r44, 1); - float32x4_t _r42 = vextq_f32(_r40, _r44, 2); - float32x4_t _r43 = vextq_f32(_r40, _r44, 3); - - float32x4_t _r50 = vld1q_f32(r5); - float32x4_t _r54 = vld1q_f32(r5 + 4); - float32x4_t _r51 = vextq_f32(_r50, _r54, 1); - float32x4_t _r52 = vextq_f32(_r50, _r54, 2); - float32x4_t _r53 = vextq_f32(_r50, _r54, 3); - - _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); - _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); - _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); - _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); - _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); - - _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); - _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); - _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); - _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); - _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); - - _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); - _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); - _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); - _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); - _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); - - _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); - _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); - _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); - _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); - _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); - - _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); - _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); - _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); - _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); - _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); - - _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0); - - _sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1); - - _sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2); - - _sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3); - - _sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0); - - vst1q_f32(outptr, _sum); - vst1q_f32(outptr2, _sum2); - - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - r4 += 4; - r5 += 4; - outptr += 4; - outptr2 += 4; + asm volatile( + // v11 = rx1 / rx3 + // v12 = rx2 + // v13 v14 = intermediate sum register + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v7.4s}, [%1] \n"// v7 = out + + "0: \n" + + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v8.4s}, [%2] \n"// v8 = out2 + + // r1 + "prfm pldl1keep, [%4, #256] \n" + "ld1 {v9.4s, v10.4s}, [%4] \n"// v9 v10 = r10 r14 + "add %4, %4, #16 \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" //r11 + "fmul v13.4s, v9.4s, %19.s[1] \n" + "fmla v8.4s, v9.4s, %18.s[0] \n" + + "ext v12.16b, v9.16b, v10.16b, #8 \n" //r12 + "fmla v7.4s, v11.4s, %19.s[2] \n" + "fmul v14.4s, v11.4s, %18.s[1] \n" + + "ext v11.16b, v9.16b, v10.16b, #12 \n" //r13 + "fmla v13.4s, v12.4s, %19.s[3] \n" + "fmla v8.4s, v12.4s, %18.s[2] \n" + + "fmla v7.4s, v11.4s, %20.s[0] \n" + "fmla v14.4s, v11.4s, %18.s[3] \n" + + "prfm pldl1keep, [%5, #256] \n" + + "fmla v13.4s, v10.4s, %20.s[1] \n" + "fmla v8.4s, v10.4s, %19.s[0] \n" + + // r2 + "ld1 {v9.4s, v10.4s}, [%5] \n"// v9 v10 = r20 r24 + "add %5, %5, #16 \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" //r21 + "fmla v7.4s, v9.4s, %20.s[2] \n" + "fmla v14.4s, v9.4s, %19.s[1] \n" + + "ext v12.16b, v9.16b, v10.16b, #8 \n" //r22 + "fmla v13.4s, v11.4s, %20.s[3] \n" + "fmla v8.4s, v11.4s, %19.s[2] \n" + + "ext v11.16b, v9.16b, v10.16b, #12 \n" //r23 + "fmla v7.4s, v12.4s, %21.s[0] \n" + "fmla v14.4s, v12.4s, %19.s[3] \n" + + "fmla v13.4s, v11.4s, %21.s[1] \n" + "fmla v8.4s, v11.4s, %20.s[0] \n" + + "prfm pldl1keep, [%6, #256] \n" + + "fmla v7.4s, v10.4s, %21.s[2] \n" + "fmla v14.4s, v10.4s, %20.s[1] \n" + + // r3 + "ld1 {v9.4s, v10.4s}, [%6] \n"// v9 v10 = r30 r34 + "add %6, %6, #16 \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" //r31 + "fmla v13.4s, v9.4s, %21.s[3] \n" + "fmla v8.4s, v9.4s, %20.s[2] \n" + + "ext v12.16b, v9.16b, v10.16b, #8 \n" //r32 + "fmla v7.4s, v11.4s, %22.s[0] \n" + "fmla v14.4s, v11.4s, %20.s[3] \n" + + "ext v11.16b, v9.16b, v10.16b, #12 \n" //r33 + "fmla v13.4s, v12.4s, %22.s[1] \n" + "fmla v8.4s, v12.4s, %21.s[0] \n" + + "fmla v7.4s, v11.4s, %22.s[2] \n" + "fmla v14.4s, v11.4s, %21.s[1] \n" + + "prfm pldl1keep, [%7, #256] \n" + + "fmla v13.4s, v10.4s, %22.s[3] \n" + "fmla v8.4s, v10.4s, %21.s[2] \n" + + // r4 + "ld1 {v9.4s, v10.4s}, [%7] \n"// v9 v10 = r40 r44 + "add %7, %7, #16 \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" //r41 + "fmla v7.4s, v9.4s, %23.s[0] \n" + "fmla v14.4s, v9.4s, %21.s[3] \n" + + "ext v12.16b, v9.16b, v10.16b, #8 \n" //r41 + "fmla v13.4s, v11.4s, %23.s[1] \n" + "fmla v8.4s, v11.4s, %22.s[0] \n" + + "ext v11.16b, v9.16b, v10.16b, #12 \n" //r41 + "fmla v7.4s, v12.4s, %23.s[2] \n" + "fmla v14.4s, v12.4s, %22.s[1] \n" + + "fmla v13.4s, v11.4s, %23.s[3] \n" + "fmla v8.4s, v11.4s, %22.s[2] \n" + + "prfm pldl1keep, [%3, #256] \n" + + "fmla v7.4s, v10.4s, %24.s[0] \n" + "fmla v14.4s, v10.4s, %22.s[3] \n" + + // r0 and r5 + "ld1 {v9.4s, v10.4s}, [%3] \n"// v9 v10 = r00 r04 + "add %3, %3, #16 \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" //r01 + "fmla v13.4s, v11.4s, %18.s[1] \n" + + "ext v12.16b, v9.16b, v10.16b, #8 \n" //r02 + "fmla v7.4s, v12.4s, %18.s[2] \n" + + "ext v11.16b, v9.16b, v10.16b, #12 \n" //r03 + + "prfm pldl1keep, [%8, #256] \n" + + "fmla v13.4s, v11.4s, %18.s[3] \n" + + // r5 + "ld1 {v11.4s, v12.4s}, [%8] \n"// v11 v12 = r50 r54 + "add %8, %8, #16 \n" + + "fmla v8.4s, v11.4s, %23.s[0] \n" + "fmla v14.4s, v12.4s, %24.s[0] \n" + + "fmla v7.4s, v9.4s, %18.s[0] \n" + "fmla v13.4s, v10.4s, %19.s[0] \n" + + "ext v9.16b, v11.16b, v12.16b, #4 \n" //r51 + "ext v10.16b, v11.16b, v12.16b, #8 \n" //r52 + + "fmla v14.4s, v9.4s, %23.s[1] \n" + + "ext v9.16b, v11.16b, v12.16b, #12 \n" //r53 + "fmla v8.4s, v10.4s, %23.s[2] \n" + + "fmla v14.4s, v9.4s, %23.s[3] \n" + + "fadd v7.4s, v7.4s, v13.4s \n" + + "st1 {v7.4s}, [%1], #16 \n" + + "fadd v8.4s, v8.4s, v14.4s \n" + + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v7.4s}, [%1] \n"// v7 = out + "st1 {v8.4s}, [%2], #16 \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(outptr2), // %2 + "=r"(r0), // %3 + "=r"(r1), // %4 + "=r"(r2), // %5 + "=r"(r3), // %6 + "=r"(r4), // %7 + "=r"(r5) // %8 + : "0"(nn), + "1"(outptr), + "2"(outptr2), + "3"(r0), + "4"(r1), + "5"(r2), + "6"(r3), + "7"(r4), + "8"(r5), + "w"(_k0123), // %18 + "w"(_k4567), // %19 + "w"(_k891011), // %20 + "w"(_k12131415), // %21 + "w"(_k16171819), // %22 + "w"(_k20212223), // %23 + "w"(_k24242424) // %24 + : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -555,78 +630,130 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _sum = vld1q_f32(outptr); - - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r04 = vld1q_f32(r0 + 4); - float32x4_t _r01 = vextq_f32(_r00, _r04, 1); - float32x4_t _r02 = vextq_f32(_r00, _r04, 2); - float32x4_t _r03 = vextq_f32(_r00, _r04, 3); - - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r14 = vld1q_f32(r1 + 4); - float32x4_t _r11 = vextq_f32(_r10, _r14, 1); - float32x4_t _r12 = vextq_f32(_r10, _r14, 2); - float32x4_t _r13 = vextq_f32(_r10, _r14, 3); - - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r24 = vld1q_f32(r2 + 4); - float32x4_t _r21 = vextq_f32(_r20, _r24, 1); - float32x4_t _r22 = vextq_f32(_r20, _r24, 2); - float32x4_t _r23 = vextq_f32(_r20, _r24, 3); - - float32x4_t _r30 = vld1q_f32(r3); - float32x4_t _r34 = vld1q_f32(r3 + 4); - float32x4_t _r31 = vextq_f32(_r30, _r34, 1); - float32x4_t _r32 = vextq_f32(_r30, _r34, 2); - float32x4_t _r33 = vextq_f32(_r30, _r34, 3); - - float32x4_t _r40 = vld1q_f32(r4); - float32x4_t _r44 = vld1q_f32(r4 + 4); - float32x4_t _r41 = vextq_f32(_r40, _r44, 1); - float32x4_t _r42 = vextq_f32(_r40, _r44, 2); - float32x4_t _r43 = vextq_f32(_r40, _r44, 3); - - _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); - _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); - _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); - _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); - _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); - - _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); - _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); - _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); - _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); - _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); - - _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); - _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); - _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); - _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); - _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); - - _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); - _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); - _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); - _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); - _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); - - _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); - _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); - _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); - _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); - _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); - - vst1q_f32(outptr, _sum); - - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - r4 += 4; - outptr += 4; + asm volatile( + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #256] \n" + + "ld1 {v8.4s, v9.4s}, [%2] \n"// _r00 = vld1q_f32(r0+j); + "add %2, %2, #16 \n" + + "0: \n" + + "ld1 {v7.4s}, [%1] \n"// _sum = vld1q_f32(outptr+j); + + "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r01 + "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r02 + "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r03 + + "fmla v7.4s, v8.4s, %14.s[0] \n" + "fmul v13.4s, v10.4s, %14.s[1] \n" + + "prfm pldl1keep, [%3, #256] \n" + + "fmul v14.4s, v11.4s, %14.s[2] \n" + "fmul v15.4s, v12.4s, %14.s[3] \n" + "fmla v7.4s, v9.4s, %15.s[0] \n" + + "ld1 {v8.4s, v9.4s}, [%3] \n" + "add %3, %3, #16 \n" + "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r11 + "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r12 + "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r13 + + "fmla v7.4s, v8.4s, %15.s[1] \n" + "fmla v13.4s, v10.4s, %15.s[2] \n" + + "prfm pldl1keep, [%4, #256] \n" + + "fmla v14.4s, v11.4s, %15.s[3] \n" + "fmla v15.4s, v12.4s, %16.s[0] \n" + "fmla v7.4s, v9.4s, %16.s[1] \n" + + "ld1 {v8.4s, v9.4s}, [%4] \n" + "add %4, %4, #16 \n" + "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r21 + "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r22 + "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r23 + + "fmla v7.4s, v8.4s, %16.s[2] \n" + "fmla v13.4s, v10.4s, %16.s[3] \n" + + "prfm pldl1keep, [%5, #256] \n" + + "fmla v14.4s, v11.4s, %17.s[0] \n" + "fmla v15.4s, v12.4s, %17.s[1] \n" + "fmla v7.4s, v9.4s, %17.s[2] \n" + + "ld1 {v8.4s, v9.4s}, [%5] \n" + "add %5, %5, #16 \n" + "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r31 + "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r32 + "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r33 + + "fmla v7.4s, v8.4s, %17.s[3] \n" + "fmla v13.4s, v10.4s, %18.s[0] \n" + + "prfm pldl1keep, [%6, #256] \n" + + "fmla v14.4s, v11.4s, %18.s[1] \n" + "fmla v15.4s, v12.4s, %18.s[2] \n" + "fmla v7.4s, v9.4s, %18.s[3] \n" + + "ld1 {v8.4s, v9.4s}, [%6] \n" + "add %6, %6, #16 \n" + "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r41 + "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r42 + "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r43 + + "fmla v7.4s, v8.4s, %19.s[0] \n" + "fmla v13.4s, v10.4s, %19.s[1] \n" + "fmla v14.4s, v11.4s, %19.s[2] \n" + "fmla v15.4s, v12.4s, %19.s[3] \n" + "fmla v7.4s, v9.4s, %20.s[0] \n" + + "fadd v14.4s, v14.4s, v15.4s \n" + "fadd v7.4s, v7.4s, v13.4s \n" + + "prfm pldl1keep, [%2, #256] \n" + + "fadd v7.4s, v7.4s, v14.4s \n" + + "ld1 {v8.4s, v9.4s}, [%2] \n" + "add %2, %2, #16 \n" + + "st1 {v7.4s}, [%1], #16 \n" + + "prfm pldl1keep, [%1, #128] \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + + "sub %2, %2, #16 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3), // %5 + "=r"(r4) // %6 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "6"(r4), + "w"(_k0123), // %14 + "w"(_k4567), // %15 + "w"(_k891011), // %16 + "w"(_k12131415), // %17 + "w"(_k16171819), // %18 + "w"(_k20212223), // %19 + "w"(_k24242424) // %20 + : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -920,99 +1047,147 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _sum = vld1q_f32(outptr); - - float32x4x2_t _r00_02461357 = vld2q_f32(r0); - float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8); - float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14 - float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15 - float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6 - float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7 - float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8 - float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9 - float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10 - - float32x4x2_t _r10_02461357 = vld2q_f32(r1); - float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8); - float32x4_t _r1_8101214 = _r10nx2.val[0]; - float32x4_t _r1_9111315 = _r10nx2.val[1]; - float32x4_t _r10 = _r10_02461357.val[0]; - float32x4_t _r11 = _r10_02461357.val[1]; - float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1); - float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1); - float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2); - - float32x4x2_t _r20_02461357 = vld2q_f32(r2); - float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8); - float32x4_t _r2_8101214 = _r20nx2.val[0]; - float32x4_t _r2_9111315 = _r20nx2.val[1]; - float32x4_t _r20 = _r20_02461357.val[0]; - float32x4_t _r21 = _r20_02461357.val[1]; - float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1); - float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1); - float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2); - - float32x4x2_t _r30_02461357 = vld2q_f32(r3); - float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8); - float32x4_t _r3_8101214 = _r30nx2.val[0]; - float32x4_t _r3_9111315 = _r30nx2.val[1]; - float32x4_t _r30 = _r30_02461357.val[0]; - float32x4_t _r31 = _r30_02461357.val[1]; - float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1); - float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1); - float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2); - - float32x4x2_t _r40_02461357 = vld2q_f32(r4); - float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8); - float32x4_t _r4_8101214 = _r40nx2.val[0]; - float32x4_t _r4_9111315 = _r40nx2.val[1]; - float32x4_t _r40 = _r40_02461357.val[0]; - float32x4_t _r41 = _r40_02461357.val[1]; - float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1); - float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1); - float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2); - - _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); - _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); - _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); - _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); - _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); - - _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); - _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); - _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); - _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); - _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); - - _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); - _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); - _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); - _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); - _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); - - _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); - _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); - _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); - _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); - _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); - - _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); - _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); - _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); - _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); - _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); - - vst1q_f32(outptr, _sum); - - r0 += 8; - r1 += 8; - r2 += 8; - r3 += 8; - r4 += 8; - outptr += 4; + asm volatile( + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v8.4s, v9.4s}, [%2], #32 \n"// v8 = 0 2 4 6 q9 = 1 3 5 7 + + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v10.4s, v11.4s}, [%2] \n"// v10 = 8 10 12 14 v11 = 9 11 13 15 + + "prfm pldl1keep, [%1, #128] \n" + "0: \n" + + "ld1 {v7.4s}, [%1] \n" // v7 = outptr + + "ext v12.16b, v8.16b, v10.16b, #4 \n" // v12 = 2 4 6 8 + "ext v11.16b, v9.16b, v11.16b, #4 \n" // v11 = 3 5 7 9 + "ext v10.16b, v8.16b, v10.16b, #8 \n" // v10 = 4 6 8 10 + + "fmla v7.4s, v8.4s, %14.s[0] \n" + "fmul v13.4s, v9.4s, %14.s[1] \n" + + "prfm pldl1keep, [%3, #256] \n" + + "fmul v14.4s, v12.4s, %14.s[2] \n" + "fmul v15.4s, v11.4s, %14.s[3] \n" + "fmla v7.4s, v10.4s, %15.s[0] \n" + + "ld2 {v8.4s, v9.4s}, [%3], #32 \n" + + "prfm pldl1keep, [%3, #256] \n" + + "ld2 {v10.4s, v11.4s}, [%3] \n" + "ext v12.16b, v8.16b, v10.16b, #4 \n" + "ext v11.16b, v9.16b, v11.16b, #4 \n" + "ext v10.16b, v8.16b, v10.16b, #8 \n" + + "fmla v7.4s, v8.4s, %15.s[1] \n" + "fmla v13.4s, v9.4s, %15.s[2] \n" + + "prfm pldl1keep, [%4, #256] \n" + + "fmla v14.4s, v12.4s, %15.s[3] \n" + "fmla v15.4s, v11.4s, %16.s[0] \n" + "fmla v7.4s, v10.4s, %16.s[1] \n" + + "ld2 {v8.4s, v9.4s}, [%4], #32 \n" + + "prfm pldl1keep, [%4, #256] \n" + + "ld2 {v10.4s, v11.4s}, [%4] \n" + "ext v12.16b, v8.16b, v10.16b, #4 \n" + "ext v11.16b, v9.16b, v11.16b, #4 \n" + "ext v10.16b, v8.16b, v10.16b, #8 \n" + + "fmla v7.4s, v8.4s, %16.s[2] \n" + "fmla v13.4s, v9.4s, %16.s[3] \n" + + "prfm pldl1keep, [%5, #256] \n" + + "fmla v14.4s, v12.4s, %17.s[0] \n" + "fmla v15.4s, v11.4s, %17.s[1] \n" + "fmla v7.4s, v10.4s, %17.s[2] \n" + + "ld2 {v8.4s, v9.4s}, [%5], #32 \n" + + "prfm pldl1keep, [%5, #256] \n" + + "ld2 {v10.4s, v11.4s}, [%5] \n" + "ext v12.16b, v8.16b, v10.16b, #4 \n" + "ext v11.16b, v9.16b, v11.16b, #4 \n" + "ext v10.16b, v8.16b, v10.16b, #8 \n" + + "fmla v7.4s, v8.4s, %17.s[3] \n" + "fmla v13.4s, v9.4s, %18.s[0] \n" + + "prfm pldl1keep, [%6, #256] \n" + + "fmla v14.4s, v12.4s, %18.s[1] \n" + "fmla v15.4s, v11.4s, %18.s[2] \n" + "fmla v7.4s, v10.4s, %18.s[3] \n" + + "ld2 {v8.4s, v9.4s}, [%6], #32 \n" + + "prfm pldl1keep, [%6, #256] \n" + + "ld2 {v10.4s, v11.4s}, [%6] \n" + "ext v12.16b, v8.16b, v10.16b, #4 \n" + "ext v11.16b, v9.16b, v11.16b, #4 \n" + "ext v10.16b, v8.16b, v10.16b, #8 \n" + + "fmla v7.4s, v8.4s, %19.s[0] \n" + "fmla v13.4s, v9.4s, %19.s[1] \n" + "fmla v14.4s, v12.4s, %19.s[2] \n" + "fmla v15.4s, v11.4s, %19.s[3] \n" + "fmla v7.4s, v10.4s, %20.s[0] \n" + + "prfm pldl1keep, [%2, #256] \n" + + "ld2 {v8.4s, v9.4s}, [%2], #32 \n" + + "fadd v14.4s, v14.4s, v15.4s \n" + "fadd v7.4s, v7.4s, v13.4s \n" + + "prfm pldl1keep, [%2, #256] \n" + + "fadd v7.4s, v7.4s, v14.4s \n" + + "ld2 {v10.4s, v11.4s}, [%2] \n" + "st1 {v7.4s}, [%1], #16 \n" + + "prfm pldl1keep, [%1, #128] \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + + "sub %2, %2, #32 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3), // %5 + "=r"(r4) // %6 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "6"(r4), + "w"(_k0123), // %14 + "w"(_k4567), // %15 + "w"(_k891011), // %16 + "w"(_k12131415), // %17 + "w"(_k16171819), // %18 + "w"(_k20212223), // %19 + "w"(_k24242424) // %20 + : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } + #else if (nn > 0) { diff --git a/src/layer/arm/convolution_7x7.h b/src/layer/arm/convolution_7x7.h index d6e0254ba..3b2c77e6a 100644 --- a/src/layer/arm/convolution_7x7.h +++ b/src/layer/arm/convolution_7x7.h @@ -75,13 +75,198 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ + float32x4_t _k0123 = vld1q_f32(k0); + float32x4_t _k4567 = vld1q_f32(k0 + 4); + float32x4_t _k78910 = vld1q_f32(k1); + float32x4_t _k11121314 = vld1q_f32(k1 + 4); + float32x4_t _k14151617 = vld1q_f32(k2); + float32x4_t _k18192021 = vld1q_f32(k2 + 4); + float32x4_t _k21222324 = vld1q_f32(k3); + float32x4_t _k25262728 = vld1q_f32(k3 + 4); + float32x4_t _k28293031 = vld1q_f32(k4); + float32x4_t _k32333435 = vld1q_f32(k4 + 4); + float32x4_t _k35363738 = vld1q_f32(k5); + float32x4_t _k39404142 = vld1q_f32(k5 + 4); + float32x4_t _k42434445 = vld1q_f32(k6); + float32x4_t _k46474849 = vld1q_f32(k6 + 4); +#ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__ + if (nn > 0) + { + asm volatile( + // v0: input / final output + // v1 v2 v3: = ri0 ri4 ri0n , i <- 1-7 + // v4 = ri1 / ri3 / ri6 + // v5 = ri2 / ri5 + // v9 = intermediate sum register + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + + //i = 1 + "prfm pldl1keep, [%2, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%2] \n" + "add %2, %2, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" + "fmul v9.4s, v1.4s, %18.s[0] \n" + "ext v5.16b, v1.16b, v2.16b, #8 \n" + "fmla v0.4s, v4.4s, %18.s[1] \n" + "ext v4.16b, v1.16b, v2.16b, #12 \n" + "fmla v9.4s, v5.4s, %18.s[2] \n" + "ext v5.16b, v2.16b, v3.16b, #4 \n" + "fmla v0.4s, v4.4s, %18.s[3] \n" + "ext v4.16b, v2.16b, v3.16b, #8 \n" + "fmla v9.4s, v2.4s, %19.s[0] \n" + "fmla v0.4s, v5.4s, %19.s[1] \n" + "fmla v9.4s, v4.4s, %19.s[2] \n" + + //i = 2 + "prfm pldl1keep, [%3, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%3] \n" // v1 v2 v3: = r20 r24 r20n + "add %3, %3, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" // v4 = r21 + "fmla v9.4s, v1.4s, %20.s[0] \n" // *+ r10 + "ext v5.16b, v1.16b, v2.16b, #8 \n" // v5 = r22 + "fmla v0.4s, v4.4s, %20.s[1] \n" // *+ r11 + "ext v4.16b, v1.16b, v2.16b, #12 \n" // v4 = r23 + "fmla v9.4s, v5.4s, %20.s[2] \n" // *+ r1 + "ext v5.16b, v2.16b, v3.16b, #4 \n" // v5 = r25 + "fmla v0.4s, v4.4s, %20.s[3] \n" // *+ r13 + "ext v4.16b, v2.16b, v3.16b, #8 \n" // v4 = r26 + "fmla v9.4s, v2.4s, %21.s[0] \n" // *+ r14 + "fmla v0.4s, v5.4s, %21.s[1] \n" // *+ r15 + "fmla v9.4s, v4.4s, %21.s[2] \n" // *+ r16 + + //i = 3 + "prfm pldl1keep, [%4, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%4] \n" + "add %4, %4, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" + "fmla v9.4s, v1.4s, %22.s[0] \n" + "ext v5.16b, v1.16b, v2.16b, #8 \n" + "fmla v0.4s, v4.4s, %22.s[1] \n" + "ext v4.16b, v1.16b, v2.16b, #12 \n" + "fmla v9.4s, v5.4s, %22.s[2] \n" + "ext v5.16b, v2.16b, v3.16b, #4 \n" + "fmla v0.4s, v4.4s, %22.s[3] \n" + "ext v4.16b, v2.16b, v3.16b, #8 \n" + "fmla v9.4s, v2.4s, %23.s[0] \n" + "fmla v0.4s, v5.4s, %23.s[1] \n" + "fmla v9.4s, v4.4s, %23.s[2] \n" + + //i = 4 + "prfm pldl1keep, [%5, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%5] \n" + "add %5, %5, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" + "fmla v9.4s, v1.4s, %24.s[0] \n" + "ext v5.16b, v1.16b, v2.16b, #8 \n" + "fmla v0.4s, v4.4s, %24.s[1] \n" + "ext v4.16b, v1.16b, v2.16b, #12 \n" + "fmla v9.4s, v5.4s, %24.s[2] \n" + "ext v5.16b, v2.16b, v3.16b, #4 \n" + "fmla v0.4s, v4.4s, %24.s[3] \n" + "ext v4.16b, v2.16b, v3.16b, #8 \n" + "fmla v9.4s, v2.4s, %25.s[0] \n" + "fmla v0.4s, v5.4s, %25.s[1] \n" + "fmla v9.4s, v4.4s, %25.s[2] \n" + + //i = 5 + "prfm pldl1keep, [%6, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%6] \n" + "add %6, %6, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" + "fmla v9.4s, v1.4s, %26.s[0] \n" + "ext v5.16b, v1.16b, v2.16b, #8 \n" + "fmla v0.4s, v4.4s, %26.s[1] \n" + "ext v4.16b, v1.16b, v2.16b, #12 \n" + "fmla v9.4s, v5.4s, %26.s[2] \n" + "ext v5.16b, v2.16b, v3.16b, #4 \n" + "fmla v0.4s, v4.4s, %26.s[3] \n" + "ext v4.16b, v2.16b, v3.16b, #8 \n" + "fmla v9.4s, v2.4s, %27.s[0] \n" + "fmla v0.4s, v5.4s, %27.s[1] \n" + "fmla v9.4s, v4.4s, %27.s[2] \n" + + //i = 6 + "prfm pldl1keep, [%7, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%7] \n" + "add %7, %7, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" + "fmla v9.4s, v1.4s, %28.s[0] \n" + "ext v5.16b, v1.16b, v2.16b, #8 \n" + "fmla v0.4s, v4.4s, %28.s[1] \n" + "ext v4.16b, v1.16b, v2.16b, #12 \n" + "fmla v9.4s, v5.4s, %28.s[2] \n" + "ext v5.16b, v2.16b, v3.16b, #4 \n" + "fmla v0.4s, v4.4s, %28.s[3] \n" + "ext v4.16b, v2.16b, v3.16b, #8 \n" + "fmla v9.4s, v2.4s, %29.s[0] \n" + "fmla v0.4s, v5.4s, %29.s[1] \n" + "fmla v9.4s, v4.4s, %29.s[2] \n" + + //i = 7 + "prfm pldl1keep, [%8, #384] \n" + "ld1 {v1.4s, v2.4s, v3.4s}, [%8] \n" + "add %8, %8, #16 \n" + "ext v4.16b, v1.16b, v2.16b, #4 \n" + "fmla v9.4s, v1.4s, %30.s[0] \n" + "ext v5.16b, v1.16b, v2.16b, #8 \n" + "fmla v0.4s, v4.4s, %30.s[1] \n" + "ext v4.16b, v1.16b, v2.16b, #12 \n" + "fmla v9.4s, v5.4s, %30.s[2] \n" + "ext v5.16b, v2.16b, v3.16b, #4 \n" + "fmla v0.4s, v4.4s, %30.s[3] \n" + "ext v4.16b, v2.16b, v3.16b, #8 \n" + "fmla v9.4s, v2.4s, %31.s[0] \n" + "fmla v0.4s, v5.4s, %31.s[1] \n" + "fmla v9.4s, v4.4s, %31.s[2] \n" + + "fadd v0.4s, v0.4s, v9.4s \n" + "st1 {v0.4s}, [%1], #16 \n" + "subs %w0, %w0, #1 \n" + "bne 0b \n" + + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3), // %5 + "=r"(r4), // %6 + "=r"(r5), // %7 + "=r"(r6) // %8 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "6"(r4), + "7"(r5), + "8"(r6), + "w"(_k0123), // %18 + "w"(_k4567), // %19 + "w"(_k78910), // %20 + "w"(_k11121314), // %21 + "w"(_k14151617), // %22 + "w"(_k18192021), // %23 + "w"(_k21222324), // %24 + "w"(_k25262728), // %25 + "w"(_k28293031), // %26 + "w"(_k32333435), // %27 + "w"(_k35363738), // %28 + "w"(_k39404142), // %29 + "w"(_k42434445), // %30 + "w"(_k46474849) // %31 + : "cc", "memory","v0", "v1", "v2", "v3", "v4", "v5", "v9" + ); + } +#else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined +// When compiled with gcc, gcc does not accept over 30 operands for (; nn>0; nn--) { float32x4_t _sum = vld1q_f32(outptr); - float32x4_t _k0123 = vld1q_f32(k0); - float32x4_t _k4567 = vld1q_f32(k0 + 4); - float32x4_t _r00 = vld1q_f32(r0);// 0 1 2 3 float32x4_t _r04 = vld1q_f32(r0 + 4);// 4 5 6 7 float32x4_t _r00n = vld1q_f32(r0 + 8);// 8 9 10 11 @@ -99,9 +284,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1); _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2); - float32x4_t _k78910 = vld1q_f32(k1); - float32x4_t _k11121314 = vld1q_f32(k1 + 4); - float32x4_t _r10 = vld1q_f32(r1); float32x4_t _r14 = vld1q_f32(r1 + 4); float32x4_t _r10n = vld1q_f32(r1 + 8); @@ -119,9 +301,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1); _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2); - float32x4_t _k14151617 = vld1q_f32(k2); - float32x4_t _k18192021 = vld1q_f32(k2 + 4); - float32x4_t _r20 = vld1q_f32(r2); float32x4_t _r24 = vld1q_f32(r2 + 4); float32x4_t _r20n = vld1q_f32(r2 + 8); @@ -139,9 +318,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1); _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2); - float32x4_t _k21222324 = vld1q_f32(k3); - float32x4_t _k25262728 = vld1q_f32(k3 + 4); - float32x4_t _r30 = vld1q_f32(r3); float32x4_t _r34 = vld1q_f32(r3 + 4); float32x4_t _r30n = vld1q_f32(r3 + 8); @@ -159,9 +335,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1); _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2); - float32x4_t _k28293031 = vld1q_f32(k4); - float32x4_t _k32333435 = vld1q_f32(k4 + 4); - float32x4_t _r40 = vld1q_f32(r4); float32x4_t _r44 = vld1q_f32(r4 + 4); float32x4_t _r40n = vld1q_f32(r4 + 8); @@ -179,9 +352,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1); _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2); - float32x4_t _k35363738 = vld1q_f32(k5); - float32x4_t _k39404142 = vld1q_f32(k5 + 4); - float32x4_t _r50 = vld1q_f32(r5); float32x4_t _r54 = vld1q_f32(r5 + 4); float32x4_t _r50n = vld1q_f32(r5 + 8); @@ -199,9 +369,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1); _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2); - float32x4_t _k42434445 = vld1q_f32(k6); - float32x4_t _k46474849 = vld1q_f32(k6 + 4); - float32x4_t _r60 = vld1q_f32(r6); float32x4_t _r64 = vld1q_f32(r6 + 4); float32x4_t _r60n = vld1q_f32(r6 + 8); @@ -230,7 +397,8 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke r6 += 4; outptr += 4; } -#else +#endif // __clang__ +#else //__aarch32__ if (nn > 0) { asm volatile( @@ -599,13 +767,205 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke #if __ARM_NEON #if __aarch64__ + float32x4_t _k0123 = vld1q_f32(k0); + float32x4_t _k4567 = vld1q_f32(k0 + 4); + float32x4_t _k78910 = vld1q_f32(k1); + float32x4_t _k11121314 = vld1q_f32(k1 + 4); + float32x4_t _k14151617 = vld1q_f32(k2); + float32x4_t _k18192021 = vld1q_f32(k2 + 4); + float32x4_t _k21222324 = vld1q_f32(k3); + float32x4_t _k25262728 = vld1q_f32(k3 + 4); + float32x4_t _k28293031 = vld1q_f32(k4); + float32x4_t _k32333435 = vld1q_f32(k4 + 4); + float32x4_t _k35363738 = vld1q_f32(k5); + float32x4_t _k39404142 = vld1q_f32(k5 + 4); + float32x4_t _k42434445 = vld1q_f32(k6); + float32x4_t _k46474849 = vld1q_f32(k6 + 4); +#ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__ + if (nn > 0) + { + asm volatile( + // v0: input / final output + // v1 v2: = _ri0/_ri1 first + // v3 v4: = then _r0_8101214/_r0_9111315 + // v5 = ri2 / ri4 / ri6 + // v6 = ri3 / ri5 + // v9 = intermediate sum register + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + + //i = 1 + "prfm pldl1keep, [%2, #512] \n" + "ld2 {v1.4s, v2.4s}, [%2] \n" // v1 v2 = _r00 _r01 + "add %2, %2, #32 \n" + "ld2 {v3.4s, v4.4s}, [%2] \n" // v3 v4 = _r0_8101214 / _r0_9111315 + "fmul v9.4s, v1.4s, %18.s[0] \n" // *+ _r00 + "ext v5.16b, v1.16b, v3.16b, #4 \n" // v5 = _r02 + "fmla v0.4s, v2.4s, %18.s[1] \n" // *+ _r01 + "ext v6.16b, v2.16b, v4.16b, #4 \n" // v6 = _r03 + "fmla v9.4s, v5.4s, %18.s[2] \n" // *+ _r02 + "ext v5.16b, v1.16b, v3.16b, #8 \n" // v5 = _r04 + "fmla v0.4s, v6.4s, %18.s[3] \n" // *+ _r03 + "ext v6.16b, v2.16b, v4.16b, #8 \n" // v6 = _r05 + "fmla v9.4s, v5.4s, %19.s[0] \n" // *+ _r04 + "ext v5.16b, v1.16b, v3.16b, #12 \n" // v5 = _r06 + "fmla v0.4s, v6.4s, %19.s[1] \n" // *+ _r05 + "fmla v9.4s, v5.4s, %19.s[2] \n" // *+ _r06 + + //i = 2 + "prfm pldl1keep, [%3, #512] \n" + "ld2 {v1.4s, v2.4s}, [%3] \n" + "add %3, %3, #32 \n" + "ld2 {v3.4s, v4.4s}, [%3] \n" + "fmla v9.4s, v1.4s, %20.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #4 \n" + "fmla v0.4s, v2.4s, %20.s[1] \n" + "ext v6.16b, v2.16b, v4.16b, #4 \n" + "fmla v9.4s, v5.4s, %20.s[2] \n" + "ext v5.16b, v1.16b, v3.16b, #8 \n" + "fmla v0.4s, v6.4s, %20.s[3] \n" + "ext v6.16b, v2.16b, v4.16b, #8 \n" + "fmla v9.4s, v5.4s, %21.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #12 \n" + "fmla v0.4s, v6.4s, %21.s[1] \n" + "fmla v9.4s, v5.4s, %21.s[2] \n" + + //i = 3 + "prfm pldl1keep, [%4, #512] \n" + "ld2 {v1.4s, v2.4s}, [%4] \n" + "add %4, %4, #32 \n" + "ld2 {v3.4s, v4.4s}, [%4] \n" + "fmla v9.4s, v1.4s, %22.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #4 \n" + "fmla v0.4s, v2.4s, %22.s[1] \n" + "ext v6.16b, v2.16b, v4.16b, #4 \n" + "fmla v9.4s, v5.4s, %22.s[2] \n" + "ext v5.16b, v1.16b, v3.16b, #8 \n" + "fmla v0.4s, v6.4s, %22.s[3] \n" + "ext v6.16b, v2.16b, v4.16b, #8 \n" + "fmla v9.4s, v5.4s, %23.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #12 \n" + "fmla v0.4s, v6.4s, %23.s[1] \n" + "fmla v9.4s, v5.4s, %23.s[2] \n" + + //i = 4 + "prfm pldl1keep, [%5, #512] \n" + "ld2 {v1.4s, v2.4s}, [%5] \n" + "add %5, %5, #32 \n" + "ld2 {v3.4s, v4.4s}, [%5] \n" + "fmla v9.4s, v1.4s, %24.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #4 \n" + "fmla v0.4s, v2.4s, %24.s[1] \n" + "ext v6.16b, v2.16b, v4.16b, #4 \n" + "fmla v9.4s, v5.4s, %24.s[2] \n" + "ext v5.16b, v1.16b, v3.16b, #8 \n" + "fmla v0.4s, v6.4s, %24.s[3] \n" + "ext v6.16b, v2.16b, v4.16b, #8 \n" + "fmla v9.4s, v5.4s, %25.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #12 \n" + "fmla v0.4s, v6.4s, %25.s[1] \n" + "fmla v9.4s, v5.4s, %25.s[2] \n" + + //i = 5 + "prfm pldl1keep, [%6, #512] \n" + "ld2 {v1.4s, v2.4s}, [%6] \n" + "add %6, %6, #32 \n" + "ld2 {v3.4s, v4.4s}, [%6] \n" + "fmla v9.4s, v1.4s, %26.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #4 \n" + "fmla v0.4s, v2.4s, %26.s[1] \n" + "ext v6.16b, v2.16b, v4.16b, #4 \n" + "fmla v9.4s, v5.4s, %26.s[2] \n" + "ext v5.16b, v1.16b, v3.16b, #8 \n" + "fmla v0.4s, v6.4s, %26.s[3] \n" + "ext v6.16b, v2.16b, v4.16b, #8 \n" + "fmla v9.4s, v5.4s, %27.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #12 \n" + "fmla v0.4s, v6.4s, %27.s[1] \n" + "fmla v9.4s, v5.4s, %27.s[2] \n" + + //i = 6 + "prfm pldl1keep, [%7, #512] \n" + "ld2 {v1.4s, v2.4s}, [%7] \n" + "add %7, %7, #32 \n" + "ld2 {v3.4s, v4.4s}, [%7] \n" + "fmla v9.4s, v1.4s, %28.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #4 \n" + "fmla v0.4s, v2.4s, %28.s[1] \n" + "ext v6.16b, v2.16b, v4.16b, #4 \n" + "fmla v9.4s, v5.4s, %28.s[2] \n" + "ext v5.16b, v1.16b, v3.16b, #8 \n" + "fmla v0.4s, v6.4s, %28.s[3] \n" + "ext v6.16b, v2.16b, v4.16b, #8 \n" + "fmla v9.4s, v5.4s, %29.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #12 \n" + "fmla v0.4s, v6.4s, %29.s[1] \n" + "fmla v9.4s, v5.4s, %29.s[2] \n" + + //i = 7 + "prfm pldl1keep, [%8, #512] \n" + "ld2 {v1.4s, v2.4s}, [%8] \n" + "add %8, %8, #32 \n" + "ld2 {v3.4s, v4.4s}, [%8] \n" + "fmla v9.4s, v1.4s, %30.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #4 \n" + "fmla v0.4s, v2.4s, %30.s[1] \n" + "ext v6.16b, v2.16b, v4.16b, #4 \n" + "fmla v9.4s, v5.4s, %30.s[2] \n" + "ext v5.16b, v1.16b, v3.16b, #8 \n" + "fmla v0.4s, v6.4s, %30.s[3] \n" + "ext v6.16b, v2.16b, v4.16b, #8 \n" + "fmla v9.4s, v5.4s, %31.s[0] \n" + "ext v5.16b, v1.16b, v3.16b, #12 \n" + "fmla v0.4s, v6.4s, %31.s[1] \n" + "fmla v9.4s, v5.4s, %31.s[2] \n" + + "fadd v0.4s, v0.4s, v9.4s \n" + "st1 {v0.4s}, [%1], #16 \n" + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3), // %5 + "=r"(r4), // %6 + "=r"(r5), // %7 + "=r"(r6) // %8 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "6"(r4), + "7"(r5), + "8"(r6), + "w"(_k0123), // %18 + "w"(_k4567), // %19 + "w"(_k78910), // %20 + "w"(_k11121314), // %21 + "w"(_k14151617), // %22 + "w"(_k18192021), // %23 + "w"(_k21222324), // %24 + "w"(_k25262728), // %25 + "w"(_k28293031), // %26 + "w"(_k32333435), // %27 + "w"(_k35363738), // %28 + "w"(_k39404142), // %29 + "w"(_k42434445), // %30 + "w"(_k46474849) // %31 + : "cc", "memory","v0", "v1", "v2", "v3", "v4", "v5", "v6", "v9" + ); + } +#else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined +// When compiled with gcc, gcc does not accept over 30 operands for (; nn>0; nn--) { float32x4_t _sum = vld1q_f32(outptr); - float32x4_t _k0123 = vld1q_f32(k0); - float32x4_t _k4567 = vld1q_f32(k0 + 4); - float32x4x2_t _r00_02461357 = vld2q_f32(r0); float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8); float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14 @@ -626,9 +986,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1); _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2); - float32x4_t _k78910 = vld1q_f32(k1); - float32x4_t _k11121314 = vld1q_f32(k1 + 4); - float32x4x2_t _r10_02461357 = vld2q_f32(r1); float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8); float32x4_t _r1_8101214 = _r10nx2.val[0]; @@ -649,9 +1006,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1); _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2); - float32x4_t _k14151617 = vld1q_f32(k2); - float32x4_t _k18192021 = vld1q_f32(k2 + 4); - float32x4x2_t _r20_02461357 = vld2q_f32(r2); float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8); float32x4_t _r2_8101214 = _r20nx2.val[0]; @@ -672,9 +1026,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1); _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2); - float32x4_t _k21222324 = vld1q_f32(k3); - float32x4_t _k25262728 = vld1q_f32(k3 + 4); - float32x4x2_t _r30_02461357 = vld2q_f32(r3); float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8); float32x4_t _r3_8101214 = _r30nx2.val[0]; @@ -695,9 +1046,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1); _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2); - float32x4_t _k28293031 = vld1q_f32(k4); - float32x4_t _k32333435 = vld1q_f32(k4 + 4); - float32x4x2_t _r40_02461357 = vld2q_f32(r4); float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8); float32x4_t _r4_8101214 = _r40nx2.val[0]; @@ -718,9 +1066,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1); _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2); - float32x4_t _k35363738 = vld1q_f32(k5); - float32x4_t _k39404142 = vld1q_f32(k5 + 4); - float32x4x2_t _r50_02461357 = vld2q_f32(r5); float32x4x2_t _r50nx2 = vld2q_f32(r5 + 8); float32x4_t _r5_8101214 = _r50nx2.val[0]; @@ -741,9 +1086,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1); _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2); - float32x4_t _k42434445 = vld1q_f32(k6); - float32x4_t _k46474849 = vld1q_f32(k6 + 4); - float32x4x2_t _r60_02461357 = vld2q_f32(r6); float32x4x2_t _r60nx2 = vld2q_f32(r6 + 8); float32x4_t _r6_8101214 = _r60nx2.val[0]; @@ -775,6 +1117,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke r6 += 8; outptr += 4; } +#endif // __clang__ #else if (nn > 0) { diff --git a/src/layer/arm/convolutiondepthwise_3x3.h b/src/layer/arm/convolutiondepthwise_3x3.h index e26204676..c2f1ae222 100644 --- a/src/layer/arm/convolutiondepthwise_3x3.h +++ b/src/layer/arm/convolutiondepthwise_3x3.h @@ -77,60 +77,111 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r00n = vld1q_f32(r0 + 4); - float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); - float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); + asm volatile( + "prfm pldl1keep, [%3, #192] \n" + "ld1 {v9.4s, v10.4s}, [%3] \n" //r0 + "add %3, %3, #16 \n" - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r10n = vld1q_f32(r1 + 4); - float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); - float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); + "ext v11.16b, v9.16b, v10.16b, #4 \n" + "ext v12.16b, v9.16b, v10.16b, #8 \n" - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r20n = vld1q_f32(r2 + 4); - float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); - float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); + "0: \n" - float32x4_t _r30 = vld1q_f32(r3); - float32x4_t _r30n = vld1q_f32(r3 + 4); - float32x4_t _r31 = vextq_f32(_r30, _r30n, 1); - float32x4_t _r32 = vextq_f32(_r30, _r30n, 2); - - float32x4_t _sum1 = vmulq_laneq_f32(_r00, _k012x, 0); - float32x4_t _sum2 = vfmaq_laneq_f32(_bias0, _r01, _k012x, 1); - _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k012x, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k345x, 0); - _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k345x, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k345x, 2); - _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k678x, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k678x, 1); - _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k678x, 2); - - float32x4_t _sum3 = vmulq_laneq_f32(_r10, _k012x, 0); - float32x4_t _sum4 = vfmaq_laneq_f32(_bias0, _r11, _k012x, 1); - _sum3 = vfmaq_laneq_f32(_sum3, _r12, _k012x, 2); - _sum4 = vfmaq_laneq_f32(_sum4, _r20, _k345x, 0); - _sum3 = vfmaq_laneq_f32(_sum3, _r21, _k345x, 1); - _sum4 = vfmaq_laneq_f32(_sum4, _r22, _k345x, 2); - _sum3 = vfmaq_laneq_f32(_sum3, _r30, _k678x, 0); - _sum4 = vfmaq_laneq_f32(_sum4, _r31, _k678x, 1); - _sum3 = vfmaq_laneq_f32(_sum3, _r32, _k678x, 2); - - _sum1 = vaddq_f32(_sum1, _sum2); - _sum3 = vaddq_f32(_sum3, _sum4); - - vst1q_f32(outptr, _sum1); - vst1q_f32(outptr2, _sum3); - - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - outptr += 4; - outptr2 += 4; + "fmul v7.4s, v9.4s, %14.s[0] \n" + + "and v13.16b, %17.16b, %17.16b \n" // v13 = _bias0 + "fmul v6.4s, v11.4s, %14.s[1] \n" + "fmla v13.4s, v12.4s, %14.s[2] \n" + + "prfm pldl1keep, [%4, #192] \n" + "ld1 {v9.4s, v10.4s}, [%4] \n" + "add %4, %4, #16 \n" + + "fmla v7.4s, v9.4s, %15.s[0] \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" + "ext v12.16b, v9.16b, v10.16b, #8 \n" + + "fmla v6.4s, v11.4s, %15.s[1] \n" + "fmla v13.4s, v12.4s, %15.s[2] \n" + + "fmul v8.4s, v9.4s, %14.s[0] \n" + + "and v15.16b, %17.16b, %17.16b \n" // v15 = _bias0 + "fmul v14.4s, v11.4s, %14.s[1] \n" + "fmla v15.4s, v12.4s, %14.s[2] \n" + + "prfm pldl1keep, [%5, #192] \n" + "ld1 {v9.4s, v10.4s}, [%5] \n" + "add %5, %5, #16 \n" + + "fmla v7.4s, v9.4s, %16.s[0] \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" + "ext v12.16b, v9.16b, v10.16b, #8 \n" + + "fmla v6.4s, v11.4s, %16.s[1] \n" + "fmla v13.4s, v12.4s, %16.s[2] \n" + + "fmla v8.4s, v9.4s, %15.s[0] \n" + "fmla v14.4s, v11.4s, %15.s[1] \n" + "fmla v15.4s, v12.4s, %15.s[2] \n" + + "prfm pldl1keep, [%6, #192] \n" + "ld1 {v9.4s, v10.4s}, [%6] \n" + "add %6, %6, #16 \n" + + "fmla v8.4s, v9.4s, %16.s[0] \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" + "ext v12.16b, v9.16b, v10.16b, #8 \n" + + "fmla v14.4s, v11.4s, %16.s[1] \n" + "fmla v15.4s, v12.4s, %16.s[2] \n" + + "fadd v7.4s, v7.4s, v6.4s \n" + + "prfm pldl1keep, [%3, #192] \n" + "ld1 {v9.4s, v10.4s}, [%3] \n" //ro, for next loop + + "fadd v8.4s, v8.4s, v14.4s \n" + "fadd v7.4s, v7.4s, v13.4s \n" + "fadd v8.4s, v8.4s, v15.4s \n" + + "ext v11.16b, v9.16b, v10.16b, #4 \n" // for next loop + "ext v12.16b, v9.16b, v10.16b, #8 \n" // for next loop + + "add %3, %3, #16 \n" + + "st1 {v7.4s}, [%1], #16 \n" + "st1 {v8.4s}, [%2], #16 \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + + "sub %3, %3, #16 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(outptr2), // %2 + "=r"(r0), // %3 + "=r"(r1), // %4 + "=r"(r2), // %5 + "=r"(r3) // %6 + : "0"(nn), + "1"(outptr), + "2"(outptr2), + "3"(r0), + "4"(r1), + "5"(r2), + "6"(r3), + "w"(_k012x), // %14 + "w"(_k345x), // %15 + "w"(_k678x), // %16 + "w"(_bias0) // %17 + : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -326,41 +377,80 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r00n = vld1q_f32(r0 + 4); - float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); - float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); + asm volatile( + "prfm pldl1keep, [%2, #192] \n" + "ld1 {v8.4s, v9.4s}, [%2] \n" //r0 + "add %2, %2, #16 \n" - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r10n = vld1q_f32(r1 + 4); - float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); - float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); + "ext v10.16b, v8.16b, v9.16b, #4 \n" + "ext v11.16b, v8.16b, v9.16b, #8 \n" - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r20n = vld1q_f32(r2 + 4); - float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); - float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); - - float32x4_t _sum1 = vmulq_laneq_f32(_r00, _k012x, 0); - float32x4_t _sum2 = vfmaq_laneq_f32(_bias0, _r01, _k012x, 1); - _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k012x, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k345x, 0); - _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k345x, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k345x, 2); - _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k678x, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k678x, 1); - _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k678x, 2); - - _sum1 = vaddq_f32(_sum1, _sum2); - - vst1q_f32(outptr, _sum1); - - r0 += 4; - r1 += 4; - r2 += 4; - outptr += 4; + "0: \n" + + "fmul v7.4s, v8.4s, %10.s[0] \n" + + "and v14.16b, %13.16b, %13.16b \n" // v14 = _bias0 + "fmul v13.4s, v10.4s, %10.s[1] \n" + "fmla v14.4s, v11.4s, %10.s[2] \n" + + "prfm pldl1keep, [%3, #192] \n" + "ld1 {v8.4s, v9.4s}, [%3] \n" //r1 + "add %3, %3, #16 \n" + + "fmla v7.4s, v8.4s, %11.s[0] \n" + + "ext v10.16b, v8.16b, v9.16b, #4 \n" + "ext v11.16b, v8.16b, v9.16b, #8 \n" + + "fmla v13.4s, v10.4s, %11.s[1] \n" + "fmla v14.4s, v11.4s, %11.s[2] \n" + + "prfm pldl1keep, [%4, #192] \n" + "ld1 {v8.4s, v9.4s}, [%4] \n" //r2 + "add %4, %4, #16 \n" + + "fmla v7.4s, v8.4s, %12.s[0] \n" + + "ext v10.16b, v8.16b, v9.16b, #4 \n" + "ext v11.16b, v8.16b, v9.16b, #8 \n" + + "fmla v13.4s, v10.4s, %12.s[1] \n" + "fmla v14.4s, v11.4s, %12.s[2] \n" + + "prfm pldl1keep, [%2, #192] \n" + "ld1 {v8.4s, v9.4s}, [%2] \n" //r0, for next loop + "add %2, %2, #16 \n" + + "fadd v7.4s, v7.4s, v13.4s \n" + "fadd v7.4s, v7.4s, v14.4s \n" + + "ext v10.16b, v8.16b, v9.16b, #4 \n" // for next loop + "ext v11.16b, v8.16b, v9.16b, #8 \n" // for next loop + + "st1 {v7.4s}, [%1], #16 \n" + + "subs %w0, %w0, #1 \n" + "bne 0b \n" + + "sub %2, %2, #16 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2) // %4 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "w"(_k012x), // %10 + "w"(_k345x), // %11 + "w"(_k678x), // %12 + "w"(_bias0) // %13 + : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) @@ -547,47 +637,76 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4x2_t _r0 = vld2q_f32(r0); - float32x4x2_t _r0n = vld2q_f32(r0+8); + asm volatile( + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + + "and v11.16b, %13.16b, %13.16b \n" // v11 = _bias0 - float32x4_t _r00 = _r0.val[0];// 0 2 4 6 - float32x4_t _r01 = _r0.val[1];// 1 3 5 7 - float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8 + "0: \n" + "fmul v0.4s, v2.4s, %10.s[0] \n" + "fmul v10.4s, v3.4s, %10.s[1] \n" - float32x4_t _outp = vfmaq_laneq_f32(_bias0, _r00, _k012x, 0); - _outp = vfmaq_laneq_f32(_outp, _r01, _k012x, 1); - _outp = vfmaq_laneq_f32(_outp, _r02, _k012x, 2); + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v8.4s, v9.4s}, [%2] \n" + "ext v1.16b, v2.16b, v8.16b, #4 \n" - float32x4x2_t _r1 = vld2q_f32(r1); - float32x4x2_t _r1n = vld2q_f32(r1+8); + "fmla v11.4s, v1.4s, %10.s[2] \n" - float32x4_t _r10 = _r1.val[0]; - float32x4_t _r11 = _r1.val[1]; - float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1); + "prfm pldl1keep, [%3, #256] \n" + "ld2 {v2.4s, v3.4s}, [%3], #32 \n" - _outp = vfmaq_laneq_f32(_outp, _r10, _k345x, 0); - _outp = vfmaq_laneq_f32(_outp, _r11, _k345x, 1); - _outp = vfmaq_laneq_f32(_outp, _r12, _k345x, 2); + "fmla v0.4s, v2.4s, %11.s[0] \n" + "fmla v10.4s, v3.4s, %11.s[1] \n" - float32x4x2_t _r2 = vld2q_f32(r2); - float32x4x2_t _r2n = vld2q_f32(r2+8); + "prfm pldl1keep, [%3, #256] \n" + "ld2 {v8.4s, v9.4s}, [%3] \n" + "ext v1.16b, v2.16b, v8.16b, #4 \n" - float32x4_t _r20 = _r2.val[0]; - float32x4_t _r21 = _r2.val[1]; - float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1); + "fmla v11.4s, v1.4s, %11.s[2] \n" + + "prfm pldl1keep, [%4, #256] \n" + "ld2 {v2.4s, v3.4s}, [%4], #32 \n" - _outp = vfmaq_laneq_f32(_outp, _r20, _k678x, 0); - _outp = vfmaq_laneq_f32(_outp, _r21, _k678x, 1); - _outp = vfmaq_laneq_f32(_outp, _r22, _k678x, 2); + "fmla v0.4s, v2.4s, %12.s[0] \n" + "fmla v10.4s, v3.4s, %12.s[1] \n" - vst1q_f32(outptr, _outp); + "prfm pldl1keep, [%4, #256] \n" + "ld2 {v8.4s, v9.4s}, [%4] \n" + "ext v1.16b, v2.16b, v8.16b, #4 \n" - r0 += 8; - r1 += 8; - r2 += 8; - outptr += 4; + "fmla v11.4s, v1.4s, %12.s[2] \n" + + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + + "fadd v0.4s, v0.4s, v10.4s \n" + "fadd v0.4s, v0.4s, v11.4s \n" + + "and v11.16b, %13.16b, %13.16b \n" // v11 = _bias0 + + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + "sub %2, %2, #32 \n" + : "=r"(nn), // %0 + "=r"(outptr), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2) // %4 + : "0"(nn), + "1"(outptr), + "2"(r0), + "3"(r1), + "4"(r2), + "w"(_k012x), // %10 + "w"(_k345x), // %11 + "w"(_k678x), // %12 + "w"(_bias0) // %13 + : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); } #else if (nn > 0) diff --git a/src/layer/arm/eltwise_arm.cpp b/src/layer/arm/eltwise_arm.cpp index 07a465f7e..cbe03d61a 100644 --- a/src/layer/arm/eltwise_arm.cpp +++ b/src/layer/arm/eltwise_arm.cpp @@ -55,16 +55,28 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _ptr1 = vld1q_f32(ptr1); - float32x4_t _p = vmulq_f32(_ptr, _ptr1); - vst1q_f32(outptr, _p); - - ptr += 4; - ptr1 += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2], #16 \n" + "fmul v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%3], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(ptr1), // %2 + "=r"(outptr) // %3 + : "0"(nn), + "1"(ptr), + "2"(ptr1), + "3"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -120,15 +132,26 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _p = vld1q_f32(outptr); - _p = vmulq_f32(_ptr, _p); - vst1q_f32(outptr, _p); - - ptr += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2] \n" + "fmul v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -186,16 +209,28 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _ptr1 = vld1q_f32(ptr1); - float32x4_t _p = vaddq_f32(_ptr, _ptr1); - vst1q_f32(outptr, _p); - - ptr += 4; - ptr1 += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2], #16 \n" + "fadd v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%3], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(ptr1), // %2 + "=r"(outptr) // %3 + : "0"(nn), + "1"(ptr), + "2"(ptr1), + "3"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -251,15 +286,26 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _p = vld1q_f32(outptr); - _p = vaddq_f32(_ptr, _p); - vst1q_f32(outptr, _p); - - ptr += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2] \n" + "fadd v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -321,17 +367,31 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& float32x4_t _coeff0 = vdupq_n_f32(coeff0); float32x4_t _coeff1 = vdupq_n_f32(coeff1); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _ptr1 = vld1q_f32(ptr1); - float32x4_t _p = vmulq_f32(_ptr, _coeff0); - _p = vmlaq_f32(_p, _ptr1, _coeff1); - vst1q_f32(outptr, _p); - - ptr += 4; - ptr1 += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2], #16 \n" + "fmul v0.4s, v0.4s, %8.4s \n" + "fmla v0.4s, v1.4s, %9.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%3], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(ptr1), // %2 + "=r"(outptr) // %3 + : "0"(nn), + "1"(ptr), + "2"(ptr1), + "3"(outptr), + "w"(_coeff0), // %8 + "w"(_coeff1) // %9 + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -392,15 +452,27 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON float32x4_t _coeff = vdupq_n_f32(coeff); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _p = vld1q_f32(outptr); - _p = vmlaq_f32(_p, _ptr, _coeff); - vst1q_f32(outptr, _p); - - ptr += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2] \n" + "fmla v1.4s, v0.4s, %6.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v1.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr), + "w"(_coeff) // %6 + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -458,16 +530,28 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _ptr1 = vld1q_f32(ptr1); - float32x4_t _p = vmaxq_f32(_ptr, _ptr1); - vst1q_f32(outptr, _p); - - ptr += 4; - ptr1 += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2], #16 \n" + "fmax v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%3], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(ptr1), // %2 + "=r"(outptr) // %3 + : "0"(nn), + "1"(ptr), + "2"(ptr1), + "3"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -523,15 +607,26 @@ int Eltwise_arm::forward(const std::vector& bottom_blobs, std::vector& #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - float32x4_t _p = vld1q_f32(outptr); - _p = vmaxq_f32(_ptr, _p); - vst1q_f32(outptr, _p); - - ptr += 4; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "prfm pldl1keep, [%2, #128] \n" + "ld1 {v0.4s}, [%1], #16 \n" + "ld1 {v1.4s}, [%2] \n" + "fmax v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%2], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index 47e03d665..d9947614d 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -172,18 +172,30 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _m = vld1q_f32(m); - float32x4_t _w = vld1q_f32(w); - _sum = vfmaq_f32(_sum, _m, _w); - - _m = vld1q_f32(m + 4); - _w = vld1q_f32(w + 4); - _sum2 = vfmaq_f32(_sum2, _m, _w); - - m += 8; - w += 8; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1], #32 \n" + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v2.4s, v3.4s}, [%2], #32 \n" + "fmla %3.4s, v0.4s, v2.4s \n" + "subs %w0, %w0, #1 \n" + "fmla %4.4s, v1.4s, v3.4s \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(m), // %1 + "=r"(w), // %2 + "=w"(_sum), // %3 + "=w"(_sum2) // %4 + : "0"(nn), + "1"(m), + "2"(w), + "3"(_sum), + "4"(_sum2) + : "cc", "memory", "v0", "v1", "v2", "v3" + ); } #else if (nn > 0) diff --git a/src/layer/arm/pooling_2x2.h b/src/layer/arm/pooling_2x2.h index 76c93c1bd..91cb1b98c 100644 --- a/src/layer/arm/pooling_2x2.h +++ b/src/layer/arm/pooling_2x2.h @@ -46,23 +46,30 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r01 = vld1q_f32(r0 + 4); - float32x4_t _r11 = vld1q_f32(r1 + 4); - - float32x4_t _max0 = vmaxq_f32(_r00, _r10); - float32x4_t _max1 = vmaxq_f32(_r01, _r11); - - float32x4_t _max = vpmaxq_f32(_max0, _max1); - - vst1q_f32(outptr, _max); - - r0 += 8; - r1 += 8; - outptr += 4; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #256] \n" + "prfm pldl1keep, [%2, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1], #32 \n" + "ld1 {v2.4s, v3.4s}, [%2], #32 \n" + "fmax v0.4s, v0.4s, v2.4s \n" + "fmax v1.4s, v1.4s, v3.4s \n" + "fmaxp v2.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v2.4s}, [%3], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(outptr) // %3 + : "0"(nn), + "1"(r0), + "2"(r1), + "3"(outptr) + : "cc", "memory", "v0", "v1", "v2", "v3" + ); } #else if (nn > 0) diff --git a/src/layer/arm/pooling_3x3.h b/src/layer/arm/pooling_3x3.h index 26183183b..b53cbbd4d 100644 --- a/src/layer/arm/pooling_3x3.h +++ b/src/layer/arm/pooling_3x3.h @@ -47,39 +47,68 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) #if __ARM_NEON #if __aarch64__ - float32x4x2_t _r0 = vld2q_f32(r0); - float32x4x2_t _r1 = vld2q_f32(r1); - float32x4x2_t _r2 = vld2q_f32(r2); - for (; nn>0; nn--) + if (nn > 0) { - float32x4x2_t _r0n = vld2q_f32(r0+8); - float32x4x2_t _r1n = vld2q_f32(r1+8); - float32x4x2_t _r2n = vld2q_f32(r2+8); + asm volatile( + "prfm pldl1keep, [%1, #256] \n" + "ld2 {v0.4s, v1.4s}, [%1], #32 \n" + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v2.4s, v3.4s}, [%2], #32 \n" + "prfm pldl1keep, [%3, #256] \n" + "ld2 {v4.4s, v5.4s}, [%3], #32 \n" + "0: \n" + + "prfm pldl1keep, [%1, #256] \n" + "ld2 {v6.4s, v7.4s}, [%1], #32 \n" + + "fmax v12.4s, v0.4s, v1.4s \n" + "fmax v13.4s, v2.4s, v3.4s \n" + + "prfm pldl1keep, [%2, #256] \n" + "ld2 {v8.4s, v9.4s}, [%2], #32 \n" - float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]); - float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]); - float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]); + "fmax v14.4s, v4.4s, v5.4s \n" + "ext v0.16b, v0.16b, v6.16b, #4 \n" - float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1); - float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1); - float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1); + "prfm pldl1keep, [%3, #256] \n" + "ld2 {v10.4s, v11.4s}, [%3], #32 \n" - _max0 = vmaxq_f32(_max0, _r02); - _max1 = vmaxq_f32(_max1, _r12); - _max2 = vmaxq_f32(_max2, _r22); + "ext v2.16b, v2.16b, v8.16b, #4 \n" - float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2); + "fmax v12.4s, v12.4s, v0.4s \n" + "ext v4.16b, v4.16b, v10.16b, #4 \n" - vst1q_f32(outptr, _max); + "fmax v13.4s, v13.4s, v2.4s \n" + "fmax v14.4s, v14.4s, v4.4s \n" + "fmax v12.4s, v12.4s, v13.4s \n" - _r0 = _r0n; - _r1 = _r1n; - _r2 = _r2n; + "orr v0.16b, v6.16b, v6.16b \n" + "orr v1.16b, v7.16b, v7.16b \n" + "fmax v12.4s, v12.4s, v14.4s \n" - r0 += 8; - r1 += 8; - r2 += 8; - outptr += 4; + "orr v2.16b, v8.16b, v8.16b \n" + "orr v3.16b, v9.16b, v9.16b \n" + "orr v4.16b, v10.16b, v10.16b \n" + "orr v5.16b, v11.16b, v11.16b \n" + + "subs %w0, %w0, #1 \n" + "st1 {v12.4s}, [%4], #16 \n" + "bne 0b \n" + "sub %1, %1, #32 \n" + "sub %2, %2, #32 \n" + "sub %3, %3, #32 \n" + : "=r"(nn), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(outptr) // %4 + : "0"(nn), + "1"(r0), + "2"(r1), + "3"(r2), + "4"(outptr) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); } #else if (nn > 0) diff --git a/src/layer/arm/slice_arm.cpp b/src/layer/arm/slice_arm.cpp index e811df37a..3b89c9001 100644 --- a/src/layer/arm/slice_arm.cpp +++ b/src/layer/arm/slice_arm.cpp @@ -57,15 +57,23 @@ int Slice_arm::forward(const std::vector& bottom_blobs, std::vector& t #if __ARM_NEON #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _p = vld1q_f32(ptr); - float32x4_t _p2 = vld1q_f32(ptr+4); - vst1q_f32(outptr, _p); - vst1q_f32(outptr+4, _p2); - - ptr += 8; - outptr += 8; + asm volatile( + "0: \n" + "prfm pldl1keep, [%1, #256] \n" + "ld1 {v0.4s, v1.4s}, [%1], #32 \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s, v1.4s}, [%2], #32 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr) + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) diff --git a/src/mat.cpp b/src/mat.cpp index c0ae337ea..45501abff 100644 --- a/src/mat.cpp +++ b/src/mat.cpp @@ -44,13 +44,24 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val #if __ARM_NEON #if __aarch64__ - float32x4_t _mean = vdupq_n_f32(mean); - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - _ptr = vsubq_f32(_ptr, _mean); - vst1q_f32(ptr, _ptr); - ptr += 4; + asm volatile( + "dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr), + "r"(mean) // %4 + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -99,13 +110,24 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val #if __ARM_NEON #if __aarch64__ - float32x4_t _norm = vdupq_n_f32(norm); - for (; nn>0; nn--) + if (nn > 0) { - float32x4_t _ptr = vld1q_f32(ptr); - _ptr = vmulq_f32(_ptr, _norm); - vst1q_f32(ptr, _ptr); - ptr += 4; + asm volatile( + "dup v1.4s, %w4 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fmul v0.4s, v0.4s, v1.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr), + "r"(norm) // %4 + : "cc", "memory", "v0", "v1" + ); } #else if (nn > 0) @@ -155,15 +177,27 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val #if __ARM_NEON #if __aarch64__ - float32x4_t _mean = vdupq_n_f32(mean); - float32x4_t _norm = vdupq_n_f32(norm); - for (; nn>0; nn--) - { - float32x4_t _ptr = vld1q_f32(ptr); - _ptr = vsubq_f32(_ptr, _mean); - _ptr = vmulq_f32(_ptr, _norm); - vst1q_f32(ptr, _ptr); - ptr += 4; + if (nn > 0) + { + asm volatile( + "dup v1.4s, %w4 \n" + "dup v2.4s, %w5 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s}, [%1] \n" + "fsub v0.4s, v0.4s, v1.4s \n" + "fmul v0.4s, v0.4s, v2.4s \n" + "subs %w0, %w0, #1 \n" + "st1 {v0.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr), + "r"(mean), // %4 + "r"(norm) // %5 + : "cc", "memory", "v0", "v1", "v2" + ); } #else if (nn > 0) diff --git a/src/mat.h b/src/mat.h index e35a0a9ca..a9f4422c7 100644 --- a/src/mat.h +++ b/src/mat.h @@ -343,10 +343,20 @@ inline void Mat::fill(float _v) #if __ARM_NEON float32x4_t _c = vdupq_n_f32(_v); #if __aarch64__ - for (; nn>0; nn--) + if (nn > 0) { - vst1q_f32(ptr, _c); - ptr += 4; + asm volatile ( + "0: \n" + "subs %w0, %w0, #1 \n" + "st1 {%4.4s}, [%1], #16 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr), + "w"(_c) // %4 + : "cc", "memory" + ); } #else if (nn > 0)