| @@ -43,13 +43,22 @@ int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| _p = vabsq_f32(_p); | |||
| vst1q_f32(ptr, _p); | |||
| ptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| "fabs v0.4s, v0.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr) | |||
| : "cc", "memory", "v0" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -51,16 +51,27 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _a = vdupq_n_f32(a); | |||
| float32x4_t _b = vdupq_n_f32(b); | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _outp = _a; | |||
| _outp = vfmaq_f32(_outp, _p, _b); | |||
| vst1q_f32(ptr, _outp); | |||
| ptr += 4; | |||
| asm volatile( | |||
| "dup v1.4s, %w4 \n" | |||
| "dup v2.4s, %w5 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| "orr v3.16b, v1.16b, v1.16b \n" | |||
| "fmla v3.4s, v0.4s, v2.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v3.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "r"(a), // %4 | |||
| "r"(b) // %5 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -1027,52 +1027,62 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| float32x4_t _k2 = vdupq_n_f32(k2); | |||
| float32x4_t _k3 = vdupq_n_f32(k3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _p = vld1q_f32(r0); | |||
| float32x4_t _pn = vld1q_f32(r0+4); | |||
| float32x4_t _out0p = vld1q_f32(outptr0); | |||
| float32x4_t _out0pn = vld1q_f32(outptr0+4); | |||
| float32x4_t _out1p = vld1q_f32(outptr1); | |||
| float32x4_t _out1pn = vld1q_f32(outptr1+4); | |||
| float32x4_t _out2p = vld1q_f32(outptr2); | |||
| float32x4_t _out2pn = vld1q_f32(outptr2+4); | |||
| float32x4_t _out3p = vld1q_f32(outptr3); | |||
| float32x4_t _out3pn = vld1q_f32(outptr3+4); | |||
| _out0p = vfmaq_f32(_out0p, _p, _k0); | |||
| _out0pn = vfmaq_f32(_out0pn, _pn, _k0); | |||
| _out1p = vfmaq_f32(_out1p, _p, _k1); | |||
| _out1pn = vfmaq_f32(_out1pn, _pn, _k1); | |||
| _out2p = vfmaq_f32(_out2p, _p, _k2); | |||
| _out2pn = vfmaq_f32(_out2pn, _pn, _k2); | |||
| _out3p = vfmaq_f32(_out3p, _p, _k3); | |||
| _out3pn = vfmaq_f32(_out3pn, _pn, _k3); | |||
| vst1q_f32(outptr0, _out0p); | |||
| vst1q_f32(outptr0+4, _out0pn); | |||
| vst1q_f32(outptr1, _out1p); | |||
| vst1q_f32(outptr1+4, _out1pn); | |||
| vst1q_f32(outptr2, _out2p); | |||
| vst1q_f32(outptr2+4, _out2pn); | |||
| vst1q_f32(outptr3, _out3p); | |||
| vst1q_f32(outptr3+4, _out3pn); | |||
| r0 += 8; | |||
| outptr0 += 8; | |||
| outptr1 += 8; | |||
| outptr2 += 8; | |||
| outptr3 += 8; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "ld1 {v6.4s, v7.4s}, [%5], #32 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%1] \n" | |||
| "fmla v8.4s, v6.4s, %12.4s \n" | |||
| "fmla v9.4s, v7.4s, %12.4s \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v10.4s, v11.4s}, [%2] \n" | |||
| "fmla v10.4s, v6.4s, %13.4s \n" | |||
| "fmla v11.4s, v7.4s, %13.4s \n" | |||
| "st1 {v8.4s, v9.4s}, [%1], #32 \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld1 {v12.4s, v13.4s}, [%3] \n" | |||
| "fmla v12.4s, v6.4s, %14.4s \n" | |||
| "fmla v13.4s, v7.4s, %14.4s \n" | |||
| "st1 {v10.4s, v11.4s}, [%2], #32 \n" | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld1 {v14.4s, v15.4s}, [%4] \n" | |||
| "fmla v14.4s, v6.4s, %15.4s \n" | |||
| "fmla v15.4s, v7.4s, %15.4s \n" | |||
| "st1 {v12.4s, v13.4s}, [%3], #32 \n" | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "ld1 {v6.4s, v7.4s}, [%5], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v14.4s, v15.4s}, [%4], #32 \n" | |||
| "bne 0b \n" | |||
| "sub %5, %5, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr0),// %1 | |||
| "=r"(outptr1),// %2 | |||
| "=r"(outptr2),// %3 | |||
| "=r"(outptr3),// %4 | |||
| "=r"(r0) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr0), | |||
| "2"(outptr1), | |||
| "3"(outptr2), | |||
| "4"(outptr3), | |||
| "5"(r0), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1), // %13 | |||
| "w"(_k2), // %14 | |||
| "w"(_k3) // %15 | |||
| : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -1202,43 +1212,56 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| float32x4_t _k2 = vdupq_n_f32(k2); | |||
| float32x4_t _k3 = vdupq_n_f32(k3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _p = vld1q_f32(r0); | |||
| float32x4_t _pn = vld1q_f32(r0+4); | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vfmaq_f32(_outp, _p, _k0); | |||
| _outpn = vfmaq_f32(_outpn, _pn, _k0); | |||
| float32x4_t _p1 = vld1q_f32(r1); | |||
| float32x4_t _p1n = vld1q_f32(r1+4); | |||
| _outp = vfmaq_f32(_outp, _p1, _k1); | |||
| _outpn = vfmaq_f32(_outpn, _p1n, _k1); | |||
| float32x4_t _p2 = vld1q_f32(r2); | |||
| float32x4_t _p2n = vld1q_f32(r2+4); | |||
| _outp = vfmaq_f32(_outp, _p2, _k2); | |||
| _outpn = vfmaq_f32(_outpn, _p2n, _k2); | |||
| float32x4_t _p3 = vld1q_f32(r3); | |||
| float32x4_t _p3n = vld1q_f32(r3+4); | |||
| _outp = vfmaq_f32(_outp, _p3, _k3); | |||
| _outpn = vfmaq_f32(_outpn, _p3n, _k3); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| r3 += 8; | |||
| outptr += 8; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1] \n" | |||
| "fmla v0.4s, v2.4s, %12.4s \n" | |||
| "fmla v1.4s, v3.4s, %12.4s \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%3], #32 \n" | |||
| "fmla v0.4s, v2.4s, %13.4s \n" | |||
| "fmla v1.4s, v3.4s, %13.4s \n" | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%4], #32 \n" | |||
| "fmla v0.4s, v2.4s, %14.4s \n" | |||
| "fmla v1.4s, v3.4s, %14.4s \n" | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%5], #32 \n" | |||
| "fmla v0.4s, v2.4s, %15.4s \n" | |||
| "fmla v1.4s, v3.4s, %15.4s \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1), // %13 | |||
| "w"(_k2), // %14 | |||
| "w"(_k3) // %15 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -1331,22 +1354,31 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(k0); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _p = vld1q_f32(r0); | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4_t _pn = vld1q_f32(r0+4); | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vfmaq_f32(_outp, _p, _k0); | |||
| _outpn = vfmaq_f32(_outpn, _pn, _k0); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| r0 += 8; | |||
| outptr += 8; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1] \n" | |||
| "fmla v0.4s, v2.4s, %6.4s \n" | |||
| "fmla v1.4s, v3.4s, %6.4s \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0) // %2 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "w"(_k0) // %6 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -1470,108 +1502,128 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| float32x4_t _k2 = vld1q_f32(kernel2); | |||
| float32x4_t _k3 = vld1q_f32(kernel3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4x2_t _px2 = vld2q_f32(r0); | |||
| float32x4_t _p = _px2.val[0]; | |||
| float32x4x2_t _pnx2 = vld2q_f32(r0+8); | |||
| float32x4_t _pn = _pnx2.val[0]; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%5, #512] \n" | |||
| "ld2 {v4.4s, v5.4s}, [%5], #32 \n" | |||
| "ld2 {v6.4s, v7.4s}, [%5], #32 \n" | |||
| "and v5.16b, v6.16b, v6.16b \n"// v4 v5 | |||
| float32x4_t _out0p = vld1q_f32(outptr0); | |||
| float32x4_t _out0pn = vld1q_f32(outptr0+4); | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%1] \n" | |||
| float32x4_t _out1p = vld1q_f32(outptr1); | |||
| float32x4_t _out1pn = vld1q_f32(outptr1+4); | |||
| "fmla v8.4s, v4.4s, %18.s[0] \n" | |||
| "fmla v9.4s, v5.4s, %18.s[0] \n" | |||
| float32x4_t _out2p = vld1q_f32(outptr2); | |||
| float32x4_t _out2pn = vld1q_f32(outptr2+4); | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v10.4s, v11.4s}, [%2] \n" | |||
| float32x4_t _out3p = vld1q_f32(outptr3); | |||
| float32x4_t _out3pn = vld1q_f32(outptr3+4); | |||
| "fmla v10.4s, v4.4s, %19.s[0] \n" | |||
| "fmla v11.4s, v5.4s, %19.s[0] \n" | |||
| _out0p = vfmaq_laneq_f32(_out0p, _p, _k0, 0); | |||
| _out0pn = vfmaq_laneq_f32(_out0pn, _pn, _k0, 0); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld1 {v12.4s, v13.4s}, [%3] \n" | |||
| _out1p = vfmaq_laneq_f32(_out1p, _p, _k1, 0); | |||
| _out1pn = vfmaq_laneq_f32(_out1pn, _pn, _k1, 0); | |||
| "fmla v12.4s, v4.4s, %20.s[0] \n" | |||
| "fmla v13.4s, v5.4s, %20.s[0] \n" | |||
| _out2p = vfmaq_laneq_f32(_out2p, _p, _k2, 0); | |||
| _out2pn = vfmaq_laneq_f32(_out2pn, _pn, _k2, 0); | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld1 {v14.4s, v15.4s}, [%4] \n" | |||
| _out3p = vfmaq_laneq_f32(_out3p, _p, _k3, 0); | |||
| _out3pn = vfmaq_laneq_f32(_out3pn, _pn, _k3, 0); | |||
| "prfm pldl1keep, [%6, #512] \n" | |||
| "ld2 {v6.4s, v7.4s}, [%6], #32 \n" | |||
| float32x4x2_t _p1x2 = vld2q_f32(r1); | |||
| float32x4_t _p1 = _p1x2.val[0]; | |||
| float32x4x2_t _p1nx2 = vld2q_f32(r1+8); | |||
| float32x4_t _p1n = _p1nx2.val[0]; | |||
| "fmla v14.4s, v4.4s, %21.s[0] \n" | |||
| "fmla v15.4s, v5.4s, %21.s[0] \n" | |||
| _out0p = vfmaq_laneq_f32(_out0p, _p1, _k0, 1); | |||
| _out0pn = vfmaq_laneq_f32(_out0pn, _p1n, _k0, 1); | |||
| "ld2 {v4.4s, v5.4s}, [%6], #32 \n" | |||
| "and v7.16b, v4.16b, v4.16b \n"// v6 v7 | |||
| _out1p = vfmaq_laneq_f32(_out1p, _p1, _k1, 1); | |||
| _out1pn = vfmaq_laneq_f32(_out1pn, _p1n, _k1, 1); | |||
| "fmla v8.4s, v6.4s, %18.s[1] \n" | |||
| "fmla v9.4s, v7.4s, %18.s[1] \n" | |||
| _out2p = vfmaq_laneq_f32(_out2p, _p1, _k2, 1); | |||
| _out2pn = vfmaq_laneq_f32(_out2pn, _p1n, _k2, 1); | |||
| "fmla v10.4s, v6.4s, %19.s[1] \n" | |||
| "fmla v11.4s, v7.4s, %19.s[1] \n" | |||
| _out3p = vfmaq_laneq_f32(_out3p, _p1, _k3, 1); | |||
| _out3pn = vfmaq_laneq_f32(_out3pn, _p1n, _k3, 1); | |||
| "fmla v12.4s, v6.4s, %20.s[1] \n" | |||
| "fmla v13.4s, v7.4s, %20.s[1] \n" | |||
| float32x4x2_t _p2x2 = vld2q_f32(r2); | |||
| float32x4_t _p2 = _p2x2.val[0]; | |||
| float32x4x2_t _p2nx2 = vld2q_f32(r2+8); | |||
| float32x4_t _p2n = _p2nx2.val[0]; | |||
| "prfm pldl1keep, [%7, #512] \n" | |||
| "ld2 {v4.4s, v5.4s}, [%7], #32 \n" | |||
| _out0p = vfmaq_laneq_f32(_out0p, _p2, _k0, 2); | |||
| _out0pn = vfmaq_laneq_f32(_out0pn, _p2n, _k0, 2); | |||
| "fmla v14.4s, v6.4s, %21.s[1] \n" | |||
| "fmla v15.4s, v7.4s, %21.s[1] \n" | |||
| _out1p = vfmaq_laneq_f32(_out1p, _p2, _k1, 2); | |||
| _out1pn = vfmaq_laneq_f32(_out1pn, _p2n, _k1, 2); | |||
| "ld2 {v6.4s, v7.4s}, [%7], #32 \n" | |||
| "and v5.16b, v6.16b, v6.16b \n"// v4 v5 | |||
| _out2p = vfmaq_laneq_f32(_out2p, _p2, _k2, 2); | |||
| _out2pn = vfmaq_laneq_f32(_out2pn, _p2n, _k2, 2); | |||
| "fmla v8.4s, v4.4s, %18.s[2] \n" | |||
| "fmla v9.4s, v5.4s, %18.s[2] \n" | |||
| _out3p = vfmaq_laneq_f32(_out3p, _p2, _k3, 2); | |||
| _out3pn = vfmaq_laneq_f32(_out3pn, _p2n, _k3, 2); | |||
| "fmla v10.4s, v4.4s, %19.s[2] \n" | |||
| "fmla v11.4s, v5.4s, %19.s[2] \n" | |||
| float32x4x2_t _p3x2 = vld2q_f32(r3); | |||
| float32x4_t _p3 = _p3x2.val[0]; | |||
| float32x4x2_t _p3nx2 = vld2q_f32(r3+8); | |||
| float32x4_t _p3n = _p3nx2.val[0]; | |||
| "fmla v12.4s, v4.4s, %20.s[2] \n" | |||
| "fmla v13.4s, v5.4s, %20.s[2] \n" | |||
| _out0p = vfmaq_laneq_f32(_out0p, _p3, _k0, 3); | |||
| _out0pn = vfmaq_laneq_f32(_out0pn, _p3n, _k0, 3); | |||
| "prfm pldl1keep, [%8, #512] \n" | |||
| "ld2 {v6.4s, v7.4s}, [%8], #32 \n" | |||
| _out1p = vfmaq_laneq_f32(_out1p, _p3, _k1, 3); | |||
| _out1pn = vfmaq_laneq_f32(_out1pn, _p3n, _k1, 3); | |||
| "fmla v14.4s, v4.4s, %21.s[2] \n" | |||
| "fmla v15.4s, v5.4s, %21.s[2] \n" | |||
| _out2p = vfmaq_laneq_f32(_out2p, _p3, _k2, 3); | |||
| _out2pn = vfmaq_laneq_f32(_out2pn, _p3n, _k2, 3); | |||
| "ld2 {v4.4s, v5.4s}, [%8], #32 \n" | |||
| "and v7.16b, v4.16b, v4.16b \n"// v6 v7 | |||
| _out3p = vfmaq_laneq_f32(_out3p, _p3, _k3, 3); | |||
| _out3pn = vfmaq_laneq_f32(_out3pn, _p3n, _k3, 3); | |||
| "fmla v8.4s, v6.4s, %18.s[3] \n" | |||
| "fmla v9.4s, v7.4s, %18.s[3] \n" | |||
| vst1q_f32(outptr0, _out0p); | |||
| vst1q_f32(outptr0+4, _out0pn); | |||
| "fmla v10.4s, v6.4s, %19.s[3] \n" | |||
| "fmla v11.4s, v7.4s, %19.s[3] \n" | |||
| vst1q_f32(outptr1, _out1p); | |||
| vst1q_f32(outptr1+4, _out1pn); | |||
| "st1 {v8.4s, v9.4s}, [%1], #32 \n" | |||
| vst1q_f32(outptr2, _out2p); | |||
| vst1q_f32(outptr2+4, _out2pn); | |||
| "fmla v12.4s, v6.4s, %20.s[3] \n" | |||
| "fmla v13.4s, v7.4s, %20.s[3] \n" | |||
| vst1q_f32(outptr3, _out3p); | |||
| vst1q_f32(outptr3+4, _out3pn); | |||
| "st1 {v10.4s, v11.4s}, [%2], #32 \n" | |||
| r0 += 16; | |||
| r1 += 16; | |||
| r2 += 16; | |||
| r3 += 16; | |||
| outptr0 += 8; | |||
| outptr1 += 8; | |||
| outptr2 += 8; | |||
| outptr3 += 8; | |||
| "fmla v14.4s, v6.4s, %21.s[3] \n" | |||
| "fmla v15.4s, v7.4s, %21.s[3] \n" | |||
| "st1 {v12.4s, v13.4s}, [%3], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v14.4s, v15.4s}, [%4], #32 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr0),// %1 | |||
| "=r"(outptr1),// %2 | |||
| "=r"(outptr2),// %3 | |||
| "=r"(outptr3),// %4 | |||
| "=r"(r0), // %5 | |||
| "=r"(r1), // %6 | |||
| "=r"(r2), // %7 | |||
| "=r"(r3) // %8 | |||
| : "0"(nn), | |||
| "1"(outptr0), | |||
| "2"(outptr1), | |||
| "3"(outptr2), | |||
| "4"(outptr3), | |||
| "5"(r0), | |||
| "6"(r1), | |||
| "7"(r2), | |||
| "8"(r3), | |||
| "w"(_k0), // %18 | |||
| "w"(_k1), // %19 | |||
| "w"(_k2), // %20 | |||
| "w"(_k3) // %21 | |||
| : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -1767,54 +1819,67 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| float32x4_t _k2 = vdupq_n_f32(k2); | |||
| float32x4_t _k3 = vdupq_n_f32(k3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4x2_t _px2 = vld2q_f32(r0); | |||
| float32x4_t _p = _px2.val[0]; | |||
| float32x4x2_t _pnx2 = vld2q_f32(r0+8); | |||
| float32x4_t _pn = _pnx2.val[0]; | |||
| float32x4_t _out0p = vld1q_f32(outptr0); | |||
| float32x4_t _out0pn = vld1q_f32(outptr0+4); | |||
| float32x4_t _out1p = vld1q_f32(outptr1); | |||
| float32x4_t _out1pn = vld1q_f32(outptr1+4); | |||
| float32x4_t _out2p = vld1q_f32(outptr2); | |||
| float32x4_t _out2pn = vld1q_f32(outptr2+4); | |||
| asm volatile( | |||
| "0: \n" | |||
| float32x4_t _out3p = vld1q_f32(outptr3); | |||
| float32x4_t _out3pn = vld1q_f32(outptr3+4); | |||
| "prfm pldl1keep, [%5, #512] \n" | |||
| "ld2 {v4.4s, v5.4s}, [%5], #32 \n" | |||
| "ld2 {v6.4s, v7.4s}, [%5], #32 \n" | |||
| "and v5.16b, v6.16b, v6.16b \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%1] \n" | |||
| _out0p = vfmaq_f32(_out0p, _p, _k0); | |||
| _out0pn = vfmaq_f32(_out0pn, _pn, _k0); | |||
| "fmla v8.4s, v4.4s, %12.4s \n" | |||
| "fmla v9.4s, v5.4s, %12.4s \n" | |||
| _out1p = vfmaq_f32(_out1p, _p, _k1); | |||
| _out1pn = vfmaq_f32(_out1pn, _pn, _k1); | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v10.4s, v11.4s}, [%2] \n" | |||
| _out2p = vfmaq_f32(_out2p, _p, _k2); | |||
| _out2pn = vfmaq_f32(_out2pn, _pn, _k2); | |||
| "fmla v10.4s, v4.4s, %13.4s \n" | |||
| "fmla v11.4s, v5.4s, %13.4s \n" | |||
| _out3p = vfmaq_f32(_out3p, _p, _k3); | |||
| _out3pn = vfmaq_f32(_out3pn, _pn, _k3); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld1 {v12.4s, v13.4s}, [%3] \n" | |||
| vst1q_f32(outptr0, _out0p); | |||
| vst1q_f32(outptr0+4, _out0pn); | |||
| "st1 {v8.4s, v9.4s}, [%1], #32 \n" | |||
| vst1q_f32(outptr1, _out1p); | |||
| vst1q_f32(outptr1+4, _out1pn); | |||
| "fmla v12.4s, v4.4s, %14.4s \n" | |||
| "fmla v13.4s, v5.4s, %14.4s \n" | |||
| vst1q_f32(outptr2, _out2p); | |||
| vst1q_f32(outptr2+4, _out2pn); | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld1 {v14.4s, v15.4s}, [%4] \n" | |||
| vst1q_f32(outptr3, _out3p); | |||
| vst1q_f32(outptr3+4, _out3pn); | |||
| "st1 {v10.4s, v11.4s}, [%2], #32 \n" | |||
| "fmla v14.4s, v4.4s, %15.4s \n" | |||
| "fmla v15.4s, v5.4s, %15.4s \n" | |||
| r0 += 16; | |||
| outptr0 += 8; | |||
| outptr1 += 8; | |||
| outptr2 += 8; | |||
| outptr3 += 8; | |||
| "st1 {v12.4s, v13.4s}, [%3], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v14.4s, v15.4s}, [%4], #32 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr0),// %1 | |||
| "=r"(outptr1),// %2 | |||
| "=r"(outptr2),// %3 | |||
| "=r"(outptr3),// %4 | |||
| "=r"(r0) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr0), | |||
| "2"(outptr1), | |||
| "3"(outptr2), | |||
| "4"(outptr3), | |||
| "5"(r0), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1), // %13 | |||
| "w"(_k2), // %14 | |||
| "w"(_k3) // %15 | |||
| : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -1951,51 +2016,63 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| float32x4_t _k2 = vdupq_n_f32(k2); | |||
| float32x4_t _k3 = vdupq_n_f32(k3); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4x2_t _px2 = vld2q_f32(r0); | |||
| float32x4_t _p = _px2.val[0]; | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| float32x4x2_t _pnx2 = vld2q_f32(r0+8); | |||
| float32x4_t _pn = _pnx2.val[0]; | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| _outp = vmlaq_f32(_outp, _p, _k0); | |||
| _outpn = vmlaq_f32(_outpn, _pn, _k0); | |||
| float32x4x2_t _p1x2 = vld2q_f32(r1); | |||
| float32x4_t _p1 = _p1x2.val[0]; | |||
| float32x4x2_t _p1nx2 = vld2q_f32(r1+8); | |||
| float32x4_t _p1n = _p1nx2.val[0]; | |||
| _outp = vmlaq_f32(_outp, _p1, _k1); | |||
| _outpn = vmlaq_f32(_outpn, _p1n, _k1); | |||
| float32x4x2_t _p2x2 = vld2q_f32(r2); | |||
| float32x4_t _p2 = _p2x2.val[0]; | |||
| float32x4x2_t _p2nx2 = vld2q_f32(r2+8); | |||
| float32x4_t _p2n = _p2nx2.val[0]; | |||
| _outp = vmlaq_f32(_outp, _p2, _k2); | |||
| _outpn = vmlaq_f32(_outpn, _p2n, _k2); | |||
| float32x4x2_t _p3x2 = vld2q_f32(r3); | |||
| float32x4_t _p3 = _p3x2.val[0]; | |||
| float32x4x2_t _p3nx2 = vld2q_f32(r3+8); | |||
| float32x4_t _p3n = _p3nx2.val[0]; | |||
| _outp = vmlaq_f32(_outp, _p3, _k3); | |||
| _outpn = vmlaq_f32(_outpn, _p3n, _k3); | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| r0 += 16; | |||
| r1 += 16; | |||
| r2 += 16; | |||
| r3 += 16; | |||
| outptr += 8; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1] \n" | |||
| "fmla v0.4s, v2.4s, %12.4s \n" | |||
| "fmla v1.4s, v8.4s, %12.4s \n" | |||
| "prfm pldl1keep, [%3, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%3], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%3], #32 \n" | |||
| "fmla v0.4s, v2.4s, %13.4s \n" | |||
| "fmla v1.4s, v8.4s, %13.4s \n" | |||
| "prfm pldl1keep, [%4, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%4], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%4], #32 \n" | |||
| "fmla v0.4s, v2.4s, %14.4s \n" | |||
| "fmla v1.4s, v8.4s, %14.4s \n" | |||
| "prfm pldl1keep, [%5, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%5], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%5], #32 \n" | |||
| "fmla v0.4s, v2.4s, %15.4s \n" | |||
| "fmla v1.4s, v8.4s, %15.4s \n" | |||
| "prfm pldl1keep, [%2, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #64 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1), // %13 | |||
| "w"(_k2), // %14 | |||
| "w"(_k3) // %15 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -2099,24 +2176,37 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| float32x4_t _k0 = vdupq_n_f32(k0); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4x2_t _px2 = vld2q_f32(r0); | |||
| float32x4_t _p = _px2.val[0]; | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n" | |||
| float32x4x2_t _pnx2 = vld2q_f32(r0+8); | |||
| float32x4_t _pn = _pnx2.val[0]; | |||
| float32x4_t _outpn = vld1q_f32(outptr+4); | |||
| "0: \n" | |||
| _outp = vmlaq_f32(_outp, _p, _k0); | |||
| _outpn = vmlaq_f32(_outpn, _pn, _k0); | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1] \n" | |||
| "fmla v0.4s, v2.4s, %6.4s \n" | |||
| "fmla v1.4s, v8.4s, %6.4s \n" | |||
| vst1q_f32(outptr, _outp); | |||
| vst1q_f32(outptr+4, _outpn); | |||
| "prfm pldl1keep, [%2, #512] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n" | |||
| r0 += 16; | |||
| outptr += 8; | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #64 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0) // %2 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "w"(_k0) // %6 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -71,37 +71,83 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _r000 = vld1q_f32(r00); | |||
| float32x4_t _r010 = vld1q_f32(r01); | |||
| float32x4_t _r001 = vld1q_f32(r00 + 1); | |||
| float32x4_t _r011 = vld1q_f32(r01 + 1); | |||
| float32x4_t _r100 = vld1q_f32(r10); | |||
| float32x4_t _r110 = vld1q_f32(r11); | |||
| float32x4_t _r101 = vld1q_f32(r10 + 1); | |||
| float32x4_t _r111 = vld1q_f32(r11 + 1); | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| _sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1); | |||
| _sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1); | |||
| _sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1); | |||
| _sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0); | |||
| _sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1); | |||
| vst1q_f32(outptr, _sum); | |||
| r00 += 4; | |||
| r01 += 4; | |||
| r10 += 4; | |||
| r11 += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v2.4s}, [%2], #16 \n" | |||
| "prfm pldl1keep, [%3, #128] \n" | |||
| "ld1 {v12.4s}, [%3], #16 \n" | |||
| "prfm pldl1keep, [%4, #128] \n" | |||
| "ld1 {v14.4s}, [%4], #16 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%5, #128] \n" | |||
| "ld1 {v9.4s}, [%5] \n" | |||
| "fmul v8.4s, v0.4s, %12.s[0] \n" | |||
| "fmla v9.4s, v2.4s, %12.s[2] \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v1.4s}, [%1], #16 \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v3.4s}, [%2], #16 \n" | |||
| "ext v10.16b, v0.16b, v1.16b, #4 \n" | |||
| "ext v11.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v8.4s, v12.4s, %13.s[0] \n" | |||
| "fmla v9.4s, v14.4s, %13.s[2] \n" | |||
| "prfm pldl1keep, [%3, #128] \n" | |||
| "ld1 {v13.4s}, [%3], #16 \n" | |||
| "prfm pldl1keep, [%4, #128] \n" | |||
| "ld1 {v15.4s}, [%4], #16 \n" | |||
| "fmla v8.4s, v10.4s, %12.s[1] \n" | |||
| "fmla v9.4s, v11.4s, %12.s[3] \n" | |||
| "ext v10.16b, v12.16b, v13.16b, #4 \n" | |||
| "ext v11.16b, v14.16b, v15.16b, #4 \n" | |||
| "fmla v8.4s, v10.4s, %13.s[1] \n" | |||
| "fmla v9.4s, v11.4s, %13.s[3] \n" | |||
| "orr v0.16b, v1.16b, v1.16b \n" | |||
| "orr v2.16b, v3.16b, v3.16b \n" | |||
| "fadd v8.4s, v8.4s, v9.4s \n" | |||
| "orr v12.16b, v13.16b, v13.16b \n" | |||
| "orr v14.16b, v15.16b, v15.16b \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v8.4s}, [%5], #16 \n" | |||
| "bne 0b \n" | |||
| "sub %1, %1, #16 \n" | |||
| "sub %2, %2, #16 \n" | |||
| "sub %3, %3, #16 \n" | |||
| "sub %4, %4, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(r00), // %1 | |||
| "=r"(r01), // %2 | |||
| "=r"(r10), // %3 | |||
| "=r"(r11), // %4 | |||
| "=r"(outptr) // %5 | |||
| : "0"(nn), | |||
| "1"(r00), | |||
| "2"(r01), | |||
| "3"(r10), | |||
| "4"(r11), | |||
| "5"(outptr), | |||
| "w"(_k0), // %12 | |||
| "w"(_k1) // %13 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -263,28 +309,56 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r01 = vld1q_f32(r0 + 1); | |||
| float32x4_t _r11 = vld1q_f32(r1 + 1); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v2.4s}, [%2], #16 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%3, #128] \n" | |||
| "ld1 {v9.4s}, [%3] \n" | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4_t _sum2; | |||
| "fmul v8.4s, v0.4s, %8.4s \n" | |||
| "fmla v9.4s, v2.4s, %10.4s \n" | |||
| _sum = vmlaq_f32(_sum, _r00, _k0); | |||
| _sum2 = vmulq_f32(_r01, _k1); | |||
| _sum = vmlaq_f32(_sum, _r10, _k2); | |||
| _sum2 = vmlaq_f32(_sum2, _r11, _k3); | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v1.4s}, [%1], #16 \n" | |||
| "ext v10.16b, v0.16b, v1.16b, #4 \n" | |||
| _sum = vaddq_f32(_sum, _sum2); | |||
| "fmla v8.4s, v10.4s, %9.4s \n" | |||
| vst1q_f32(outptr, _sum); | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v3.4s}, [%2], #16 \n" | |||
| "ext v11.16b, v2.16b, v3.16b, #4 \n" | |||
| r0 += 4; | |||
| r1 += 4; | |||
| outptr += 4; | |||
| "fmla v9.4s, v11.4s, %11.4s \n" | |||
| "orr v0.16b, v1.16b, v1.16b \n" | |||
| "fadd v8.4s, v8.4s, v9.4s \n" | |||
| "orr v2.16b, v3.16b, v3.16b \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v8.4s}, [%3], #16 \n" | |||
| "bne 0b \n" | |||
| "sub %1, %1, #16 \n" | |||
| "sub %2, %2, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(r0), // %1 | |||
| "=r"(r1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(r0), | |||
| "2"(r1), | |||
| "3"(outptr), | |||
| "w"(_k0), // %8 | |||
| "w"(_k1), // %9 | |||
| "w"(_k2), // %10 | |||
| "w"(_k3) // %11 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -6858,49 +6858,74 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| float32x4x2_t _r0 = vld2q_f32(r0); | |||
| float32x4x2_t _r0n = vld2q_f32(r0+8); | |||
| "fmla v0.4s, v2.4s, %10.s[0] \n" | |||
| "fmul v10.4s, v3.4s, %10.s[1] \n" | |||
| float32x4_t _r00 = _r0.val[0];// 0 2 4 6 | |||
| float32x4_t _r01 = _r0.val[1];// 1 3 5 7 | |||
| float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8 | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2] \n" | |||
| "ext v1.16b, v2.16b, v8.16b, #4 \n" | |||
| _outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2); | |||
| "fmul v11.4s, v1.4s, %10.s[2] \n" | |||
| float32x4x2_t _r1 = vld2q_f32(r1); | |||
| float32x4x2_t _r1n = vld2q_f32(r1+8); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%3], #32 \n" | |||
| float32x4_t _r10 = _r1.val[0]; | |||
| float32x4_t _r11 = _r1.val[1]; | |||
| float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1); | |||
| "fmla v0.4s, v2.4s, %11.s[0] \n" | |||
| "fmla v10.4s, v3.4s, %11.s[1] \n" | |||
| _outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%3] \n" | |||
| "ext v1.16b, v2.16b, v8.16b, #4 \n" | |||
| float32x4x2_t _r2 = vld2q_f32(r2); | |||
| float32x4x2_t _r2n = vld2q_f32(r2+8); | |||
| "fmla v11.4s, v1.4s, %11.s[2] \n" | |||
| float32x4_t _r20 = _r2.val[0]; | |||
| float32x4_t _r21 = _r2.val[1]; | |||
| float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1); | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%4], #32 \n" | |||
| _outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2); | |||
| "fmla v0.4s, v2.4s, %12.s[0] \n" | |||
| "fmla v10.4s, v3.4s, %12.s[1] \n" | |||
| vst1q_f32(outptr, _outp); | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%4] \n" | |||
| "ext v1.16b, v2.16b, v8.16b, #4 \n" | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| outptr += 4; | |||
| "fmla v11.4s, v1.4s, %12.s[2] \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "fadd v0.4s, v0.4s, v10.4s \n" | |||
| "fadd v0.4s, v0.4s, v11.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2) // %4 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "w"(_k0123), // %10 | |||
| "w"(_k3456), // %11 | |||
| "w"(_k6789) // %12 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -75,63 +75,107 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r01 = vld1q_f32(r0 + 4); | |||
| float32x4_t _r11 = vld1q_f32(r1 + 4); | |||
| float32x4_t _r21 = vld1q_f32(r2 + 4); | |||
| float32x4_t _r31 = vld1q_f32(r3 + 4); | |||
| float32x4_t _r02 = vld1q_f32(r0 + 8); | |||
| float32x4_t _r12 = vld1q_f32(r1 + 8); | |||
| float32x4_t _r22 = vld1q_f32(r2 + 8); | |||
| float32x4_t _r32 = vld1q_f32(r3 + 8); | |||
| float32x4_t _r03 = vld1q_f32(r0 + 12); | |||
| float32x4_t _r13 = vld1q_f32(r1 + 12); | |||
| float32x4_t _r23 = vld1q_f32(r2 + 12); | |||
| float32x4_t _r33 = vld1q_f32(r3 + 12); | |||
| float32x4_t _sum0 = vmulq_f32(_r00, _k0123); | |||
| float32x4_t _sum1 = vmulq_f32(_r01, _k0123); | |||
| float32x4_t _sum2 = vmulq_f32(_r02, _k0123); | |||
| float32x4_t _sum3 = vmulq_f32(_r03, _k0123); | |||
| _sum0 = vfmaq_f32(_sum0, _r10, _k4567); | |||
| _sum1 = vfmaq_f32(_sum1, _r11, _k4567); | |||
| _sum2 = vfmaq_f32(_sum2, _r12, _k4567); | |||
| _sum3 = vfmaq_f32(_sum3, _r13, _k4567); | |||
| _sum0 = vfmaq_f32(_sum0, _r20, _k891011); | |||
| _sum1 = vfmaq_f32(_sum1, _r21, _k891011); | |||
| _sum2 = vfmaq_f32(_sum2, _r22, _k891011); | |||
| _sum3 = vfmaq_f32(_sum3, _r23, _k891011); | |||
| _sum0 = vfmaq_f32(_sum0, _r30, _k12131415); | |||
| _sum1 = vfmaq_f32(_sum1, _r31, _k12131415); | |||
| _sum2 = vfmaq_f32(_sum2, _r32, _k12131415); | |||
| _sum3 = vfmaq_f32(_sum3, _r33, _k12131415); | |||
| float32x4_t _s01 = vpaddq_f32(_sum0, _sum1); | |||
| float32x4_t _s23 = vpaddq_f32(_sum2, _sum3); | |||
| float32x4_t _sum = vpaddq_f32(_s01, _s23); | |||
| float32x4_t _outp = vld1q_f32(outptr); | |||
| _outp = vaddq_f32(_outp, _sum); | |||
| vst1q_f32(outptr, _outp); | |||
| r0 += 16; | |||
| r1 += 16; | |||
| r2 += 16; | |||
| r3 += 16; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%2, #512] \n" | |||
| "prfm pldl1keep, [%3, #512] \n" | |||
| "ld1 {v7.4s}, [%1] \n" // v7 = outptr | |||
| "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 | |||
| "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 | |||
| "prfm pldl1keep, [%4, #512] \n" | |||
| "prfm pldl1keep, [%5, #512] \n" | |||
| "fmul v12.4s, v8.4s, %12.4s \n" | |||
| "fmul v13.4s, v9.4s, %13.4s \n" | |||
| "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 | |||
| "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 | |||
| "fmla v12.4s, v10.4s, %14.4s \n" | |||
| "fmla v13.4s, v11.4s, %15.4s \n" | |||
| "fadd v5.4s, v12.4s, v13.4s \n" | |||
| "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 | |||
| "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 | |||
| "fmul v12.4s, v8.4s, %12.4s \n" | |||
| "fmul v13.4s, v9.4s, %13.4s \n" | |||
| "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 | |||
| "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 | |||
| "fmla v12.4s, v10.4s, %14.4s \n" | |||
| "fmla v13.4s, v11.4s, %15.4s \n" | |||
| "fadd v6.4s, v12.4s, v13.4s \n" | |||
| "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 | |||
| "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 | |||
| "fmul v12.4s, v8.4s, %12.4s \n" | |||
| "fmul v13.4s, v9.4s, %13.4s \n" | |||
| "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 | |||
| "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 | |||
| "fmla v12.4s, v10.4s, %14.4s \n" | |||
| "fmla v13.4s, v11.4s, %15.4s \n" | |||
| "fadd v14.4s, v12.4s, v13.4s \n" | |||
| "faddp v5.4s, v5.4s, v6.4s \n" // Move to here to enhance ILP | |||
| "ld1 {v8.4s}, [%2], #16 \n"// v8 = r0 | |||
| "ld1 {v9.4s}, [%3], #16 \n"// v9 = r1 | |||
| "fmul v12.4s, v8.4s, %12.4s \n" | |||
| "fmul v13.4s, v9.4s, %13.4s \n" | |||
| "ld1 {v10.4s}, [%4], #16 \n"// v10 = r2 | |||
| "ld1 {v11.4s}, [%5], #16 \n"// v11 = r3 | |||
| "fmla v12.4s, v10.4s, %14.4s \n" | |||
| "fmla v13.4s, v11.4s, %15.4s \n" | |||
| "fadd v15.4s, v12.4s, v13.4s \n" | |||
| // "faddp v5.4s , v5.4s, v6.4s \n" // Move this line upward. | |||
| "faddp v14.4s, v14.4s, v15.4s \n" | |||
| "faddp v5.4s , v5.4s, v14.4s \n" | |||
| "fadd v7.4s, v7.4s, v5.4s \n" | |||
| "st1 {v7.4s}, [%1], #16 \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3) // %5 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "w"(_k0123), // %12 | |||
| "w"(_k4567), // %13 | |||
| "w"(_k891011), // %14 | |||
| "w"(_k12131415) // %15 | |||
| : "cc", "memory", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -247,22 +291,42 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| { | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _r0 = vld1q_f32(r0); | |||
| float32x4_t _r1 = vld1q_f32(r1); | |||
| float32x4_t _r2 = vld1q_f32(r2); | |||
| float32x4_t _r3 = vld1q_f32(r3); | |||
| float sum = 0.f; | |||
| float32x4_t _sum = vmulq_f32(_r0, _k0123); | |||
| _sum = vmlaq_f32(_sum, _r1, _k4567); | |||
| _sum = vmlaq_f32(_sum, _r2, _k891011); | |||
| _sum = vmlaq_f32(_sum, _r3, _k12131415); | |||
| asm volatile( | |||
| "ld1 {v8.4s}, [%0], #16 \n"// v8 = r0 | |||
| "ld1 {v9.4s}, [%1], #16 \n"// v9 = r1 | |||
| *outptr += vaddvq_f32(_sum); | |||
| "fmul v12.4s, v8.4s, %9.4s \n" | |||
| "fmul v13.4s, v9.4s, %10.4s \n" | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| "ld1 {v10.4s}, [%2], #16 \n"// v10 = r2 | |||
| "ld1 {v11.4s}, [%3], #16 \n"// v11 = r3 | |||
| "fmla v12.4s, v10.4s, %11.4s \n" | |||
| "fmla v13.4s, v11.4s, %12.4s \n" | |||
| "fadd v5.4s, v12.4s, v13.4s \n" | |||
| "faddp v5.4s, v5.4s, v5.4s \n" | |||
| "faddp s5, v5.2s \n" | |||
| "fmov %w4, s5 \n" | |||
| : "=r"(r0), // %0 | |||
| "=r"(r1), // %1 | |||
| "=r"(r2), // %2 | |||
| "=r"(r3), // %3 | |||
| "=r"(sum) // %4 | |||
| : "0"(r0), | |||
| "1"(r1), | |||
| "2"(r2), | |||
| "3"(r3), | |||
| "w"(_k0123), // %9 | |||
| "w"(_k4567), // %10 | |||
| "w"(_k891011), // %11 | |||
| "w"(_k12131415) // %12 | |||
| : "cc", "memory", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13" | |||
| ); | |||
| *outptr += sum; | |||
| #else | |||
| float sum = 0.f; | |||
| @@ -83,118 +83,193 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4_t _sum2 = vld1q_f32(outptr2); | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r04 = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r04, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r04, 2); | |||
| float32x4_t _r03 = vextq_f32(_r00, _r04, 3); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r14 = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r14, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r14, 2); | |||
| float32x4_t _r13 = vextq_f32(_r10, _r14, 3); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r24 = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r24, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r24, 2); | |||
| float32x4_t _r23 = vextq_f32(_r20, _r24, 3); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r34 = vld1q_f32(r3 + 4); | |||
| float32x4_t _r31 = vextq_f32(_r30, _r34, 1); | |||
| float32x4_t _r32 = vextq_f32(_r30, _r34, 2); | |||
| float32x4_t _r33 = vextq_f32(_r30, _r34, 3); | |||
| float32x4_t _r40 = vld1q_f32(r4); | |||
| float32x4_t _r44 = vld1q_f32(r4 + 4); | |||
| float32x4_t _r41 = vextq_f32(_r40, _r44, 1); | |||
| float32x4_t _r42 = vextq_f32(_r40, _r44, 2); | |||
| float32x4_t _r43 = vextq_f32(_r40, _r44, 3); | |||
| float32x4_t _r50 = vld1q_f32(r5); | |||
| float32x4_t _r54 = vld1q_f32(r5 + 4); | |||
| float32x4_t _r51 = vextq_f32(_r50, _r54, 1); | |||
| float32x4_t _r52 = vextq_f32(_r50, _r54, 2); | |||
| float32x4_t _r53 = vextq_f32(_r50, _r54, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0); | |||
| vst1q_f32(outptr, _sum); | |||
| vst1q_f32(outptr2, _sum2); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| r4 += 4; | |||
| r5 += 4; | |||
| outptr += 4; | |||
| outptr2 += 4; | |||
| asm volatile( | |||
| // v11 = rx1 / rx3 | |||
| // v12 = rx2 | |||
| // v13 v14 = intermediate sum register | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v7.4s}, [%1] \n"// v7 = out | |||
| "0: \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v8.4s}, [%2] \n"// v8 = out2 | |||
| // r1 | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld1 {v9.4s, v10.4s}, [%4] \n"// v9 v10 = r10 r14 | |||
| "add %4, %4, #16 \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" //r11 | |||
| "fmul v13.4s, v9.4s, %19.s[1] \n" | |||
| "fmla v8.4s, v9.4s, %18.s[0] \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" //r12 | |||
| "fmla v7.4s, v11.4s, %19.s[2] \n" | |||
| "fmul v14.4s, v11.4s, %18.s[1] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #12 \n" //r13 | |||
| "fmla v13.4s, v12.4s, %19.s[3] \n" | |||
| "fmla v8.4s, v12.4s, %18.s[2] \n" | |||
| "fmla v7.4s, v11.4s, %20.s[0] \n" | |||
| "fmla v14.4s, v11.4s, %18.s[3] \n" | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "fmla v13.4s, v10.4s, %20.s[1] \n" | |||
| "fmla v8.4s, v10.4s, %19.s[0] \n" | |||
| // r2 | |||
| "ld1 {v9.4s, v10.4s}, [%5] \n"// v9 v10 = r20 r24 | |||
| "add %5, %5, #16 \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" //r21 | |||
| "fmla v7.4s, v9.4s, %20.s[2] \n" | |||
| "fmla v14.4s, v9.4s, %19.s[1] \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" //r22 | |||
| "fmla v13.4s, v11.4s, %20.s[3] \n" | |||
| "fmla v8.4s, v11.4s, %19.s[2] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #12 \n" //r23 | |||
| "fmla v7.4s, v12.4s, %21.s[0] \n" | |||
| "fmla v14.4s, v12.4s, %19.s[3] \n" | |||
| "fmla v13.4s, v11.4s, %21.s[1] \n" | |||
| "fmla v8.4s, v11.4s, %20.s[0] \n" | |||
| "prfm pldl1keep, [%6, #256] \n" | |||
| "fmla v7.4s, v10.4s, %21.s[2] \n" | |||
| "fmla v14.4s, v10.4s, %20.s[1] \n" | |||
| // r3 | |||
| "ld1 {v9.4s, v10.4s}, [%6] \n"// v9 v10 = r30 r34 | |||
| "add %6, %6, #16 \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" //r31 | |||
| "fmla v13.4s, v9.4s, %21.s[3] \n" | |||
| "fmla v8.4s, v9.4s, %20.s[2] \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" //r32 | |||
| "fmla v7.4s, v11.4s, %22.s[0] \n" | |||
| "fmla v14.4s, v11.4s, %20.s[3] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #12 \n" //r33 | |||
| "fmla v13.4s, v12.4s, %22.s[1] \n" | |||
| "fmla v8.4s, v12.4s, %21.s[0] \n" | |||
| "fmla v7.4s, v11.4s, %22.s[2] \n" | |||
| "fmla v14.4s, v11.4s, %21.s[1] \n" | |||
| "prfm pldl1keep, [%7, #256] \n" | |||
| "fmla v13.4s, v10.4s, %22.s[3] \n" | |||
| "fmla v8.4s, v10.4s, %21.s[2] \n" | |||
| // r4 | |||
| "ld1 {v9.4s, v10.4s}, [%7] \n"// v9 v10 = r40 r44 | |||
| "add %7, %7, #16 \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" //r41 | |||
| "fmla v7.4s, v9.4s, %23.s[0] \n" | |||
| "fmla v14.4s, v9.4s, %21.s[3] \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" //r41 | |||
| "fmla v13.4s, v11.4s, %23.s[1] \n" | |||
| "fmla v8.4s, v11.4s, %22.s[0] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #12 \n" //r41 | |||
| "fmla v7.4s, v12.4s, %23.s[2] \n" | |||
| "fmla v14.4s, v12.4s, %22.s[1] \n" | |||
| "fmla v13.4s, v11.4s, %23.s[3] \n" | |||
| "fmla v8.4s, v11.4s, %22.s[2] \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "fmla v7.4s, v10.4s, %24.s[0] \n" | |||
| "fmla v14.4s, v10.4s, %22.s[3] \n" | |||
| // r0 and r5 | |||
| "ld1 {v9.4s, v10.4s}, [%3] \n"// v9 v10 = r00 r04 | |||
| "add %3, %3, #16 \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" //r01 | |||
| "fmla v13.4s, v11.4s, %18.s[1] \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" //r02 | |||
| "fmla v7.4s, v12.4s, %18.s[2] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #12 \n" //r03 | |||
| "prfm pldl1keep, [%8, #256] \n" | |||
| "fmla v13.4s, v11.4s, %18.s[3] \n" | |||
| // r5 | |||
| "ld1 {v11.4s, v12.4s}, [%8] \n"// v11 v12 = r50 r54 | |||
| "add %8, %8, #16 \n" | |||
| "fmla v8.4s, v11.4s, %23.s[0] \n" | |||
| "fmla v14.4s, v12.4s, %24.s[0] \n" | |||
| "fmla v7.4s, v9.4s, %18.s[0] \n" | |||
| "fmla v13.4s, v10.4s, %19.s[0] \n" | |||
| "ext v9.16b, v11.16b, v12.16b, #4 \n" //r51 | |||
| "ext v10.16b, v11.16b, v12.16b, #8 \n" //r52 | |||
| "fmla v14.4s, v9.4s, %23.s[1] \n" | |||
| "ext v9.16b, v11.16b, v12.16b, #12 \n" //r53 | |||
| "fmla v8.4s, v10.4s, %23.s[2] \n" | |||
| "fmla v14.4s, v9.4s, %23.s[3] \n" | |||
| "fadd v7.4s, v7.4s, v13.4s \n" | |||
| "st1 {v7.4s}, [%1], #16 \n" | |||
| "fadd v8.4s, v8.4s, v14.4s \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v7.4s}, [%1] \n"// v7 = out | |||
| "st1 {v8.4s}, [%2], #16 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(outptr2), // %2 | |||
| "=r"(r0), // %3 | |||
| "=r"(r1), // %4 | |||
| "=r"(r2), // %5 | |||
| "=r"(r3), // %6 | |||
| "=r"(r4), // %7 | |||
| "=r"(r5) // %8 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(outptr2), | |||
| "3"(r0), | |||
| "4"(r1), | |||
| "5"(r2), | |||
| "6"(r3), | |||
| "7"(r4), | |||
| "8"(r5), | |||
| "w"(_k0123), // %18 | |||
| "w"(_k4567), // %19 | |||
| "w"(_k891011), // %20 | |||
| "w"(_k12131415), // %21 | |||
| "w"(_k16171819), // %22 | |||
| "w"(_k20212223), // %23 | |||
| "w"(_k24242424) // %24 | |||
| : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -555,78 +630,130 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r04 = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r04, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r04, 2); | |||
| float32x4_t _r03 = vextq_f32(_r00, _r04, 3); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r14 = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r14, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r14, 2); | |||
| float32x4_t _r13 = vextq_f32(_r10, _r14, 3); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r24 = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r24, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r24, 2); | |||
| float32x4_t _r23 = vextq_f32(_r20, _r24, 3); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r34 = vld1q_f32(r3 + 4); | |||
| float32x4_t _r31 = vextq_f32(_r30, _r34, 1); | |||
| float32x4_t _r32 = vextq_f32(_r30, _r34, 2); | |||
| float32x4_t _r33 = vextq_f32(_r30, _r34, 3); | |||
| float32x4_t _r40 = vld1q_f32(r4); | |||
| float32x4_t _r44 = vld1q_f32(r4 + 4); | |||
| float32x4_t _r41 = vextq_f32(_r40, _r44, 1); | |||
| float32x4_t _r42 = vextq_f32(_r40, _r44, 2); | |||
| float32x4_t _r43 = vextq_f32(_r40, _r44, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); | |||
| vst1q_f32(outptr, _sum); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| r4 += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%2] \n"// _r00 = vld1q_f32(r0+j); | |||
| "add %2, %2, #16 \n" | |||
| "0: \n" | |||
| "ld1 {v7.4s}, [%1] \n"// _sum = vld1q_f32(outptr+j); | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r01 | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r02 | |||
| "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r03 | |||
| "fmla v7.4s, v8.4s, %14.s[0] \n" | |||
| "fmul v13.4s, v10.4s, %14.s[1] \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "fmul v14.4s, v11.4s, %14.s[2] \n" | |||
| "fmul v15.4s, v12.4s, %14.s[3] \n" | |||
| "fmla v7.4s, v9.4s, %15.s[0] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%3] \n" | |||
| "add %3, %3, #16 \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r11 | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r12 | |||
| "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r13 | |||
| "fmla v7.4s, v8.4s, %15.s[1] \n" | |||
| "fmla v13.4s, v10.4s, %15.s[2] \n" | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "fmla v14.4s, v11.4s, %15.s[3] \n" | |||
| "fmla v15.4s, v12.4s, %16.s[0] \n" | |||
| "fmla v7.4s, v9.4s, %16.s[1] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%4] \n" | |||
| "add %4, %4, #16 \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r21 | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r22 | |||
| "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r23 | |||
| "fmla v7.4s, v8.4s, %16.s[2] \n" | |||
| "fmla v13.4s, v10.4s, %16.s[3] \n" | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "fmla v14.4s, v11.4s, %17.s[0] \n" | |||
| "fmla v15.4s, v12.4s, %17.s[1] \n" | |||
| "fmla v7.4s, v9.4s, %17.s[2] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%5] \n" | |||
| "add %5, %5, #16 \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r31 | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r32 | |||
| "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r33 | |||
| "fmla v7.4s, v8.4s, %17.s[3] \n" | |||
| "fmla v13.4s, v10.4s, %18.s[0] \n" | |||
| "prfm pldl1keep, [%6, #256] \n" | |||
| "fmla v14.4s, v11.4s, %18.s[1] \n" | |||
| "fmla v15.4s, v12.4s, %18.s[2] \n" | |||
| "fmla v7.4s, v9.4s, %18.s[3] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%6] \n" | |||
| "add %6, %6, #16 \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" //_r41 | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" //_r42 | |||
| "ext v12.16b, v8.16b, v9.16b, #12 \n" //_r43 | |||
| "fmla v7.4s, v8.4s, %19.s[0] \n" | |||
| "fmla v13.4s, v10.4s, %19.s[1] \n" | |||
| "fmla v14.4s, v11.4s, %19.s[2] \n" | |||
| "fmla v15.4s, v12.4s, %19.s[3] \n" | |||
| "fmla v7.4s, v9.4s, %20.s[0] \n" | |||
| "fadd v14.4s, v14.4s, v15.4s \n" | |||
| "fadd v7.4s, v7.4s, v13.4s \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "fadd v7.4s, v7.4s, v14.4s \n" | |||
| "ld1 {v8.4s, v9.4s}, [%2] \n" | |||
| "add %2, %2, #16 \n" | |||
| "st1 {v7.4s}, [%1], #16 \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3), // %5 | |||
| "=r"(r4) // %6 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "6"(r4), | |||
| "w"(_k0123), // %14 | |||
| "w"(_k4567), // %15 | |||
| "w"(_k891011), // %16 | |||
| "w"(_k12131415), // %17 | |||
| "w"(_k16171819), // %18 | |||
| "w"(_k20212223), // %19 | |||
| "w"(_k24242424) // %20 | |||
| : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -920,99 +1047,147 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4x2_t _r00_02461357 = vld2q_f32(r0); | |||
| float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8); | |||
| float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14 | |||
| float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15 | |||
| float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6 | |||
| float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7 | |||
| float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8 | |||
| float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9 | |||
| float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10 | |||
| float32x4x2_t _r10_02461357 = vld2q_f32(r1); | |||
| float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8); | |||
| float32x4_t _r1_8101214 = _r10nx2.val[0]; | |||
| float32x4_t _r1_9111315 = _r10nx2.val[1]; | |||
| float32x4_t _r10 = _r10_02461357.val[0]; | |||
| float32x4_t _r11 = _r10_02461357.val[1]; | |||
| float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1); | |||
| float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1); | |||
| float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2); | |||
| float32x4x2_t _r20_02461357 = vld2q_f32(r2); | |||
| float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8); | |||
| float32x4_t _r2_8101214 = _r20nx2.val[0]; | |||
| float32x4_t _r2_9111315 = _r20nx2.val[1]; | |||
| float32x4_t _r20 = _r20_02461357.val[0]; | |||
| float32x4_t _r21 = _r20_02461357.val[1]; | |||
| float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1); | |||
| float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1); | |||
| float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2); | |||
| float32x4x2_t _r30_02461357 = vld2q_f32(r3); | |||
| float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8); | |||
| float32x4_t _r3_8101214 = _r30nx2.val[0]; | |||
| float32x4_t _r3_9111315 = _r30nx2.val[1]; | |||
| float32x4_t _r30 = _r30_02461357.val[0]; | |||
| float32x4_t _r31 = _r30_02461357.val[1]; | |||
| float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1); | |||
| float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1); | |||
| float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2); | |||
| float32x4x2_t _r40_02461357 = vld2q_f32(r4); | |||
| float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8); | |||
| float32x4_t _r4_8101214 = _r40nx2.val[0]; | |||
| float32x4_t _r4_9111315 = _r40nx2.val[1]; | |||
| float32x4_t _r40 = _r40_02461357.val[0]; | |||
| float32x4_t _r41 = _r40_02461357.val[1]; | |||
| float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1); | |||
| float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1); | |||
| float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); | |||
| _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); | |||
| _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); | |||
| _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); | |||
| vst1q_f32(outptr, _sum); | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| r3 += 8; | |||
| r4 += 8; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n"// v8 = 0 2 4 6 q9 = 1 3 5 7 | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v10.4s, v11.4s}, [%2] \n"// v10 = 8 10 12 14 v11 = 9 11 13 15 | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "0: \n" | |||
| "ld1 {v7.4s}, [%1] \n" // v7 = outptr | |||
| "ext v12.16b, v8.16b, v10.16b, #4 \n" // v12 = 2 4 6 8 | |||
| "ext v11.16b, v9.16b, v11.16b, #4 \n" // v11 = 3 5 7 9 | |||
| "ext v10.16b, v8.16b, v10.16b, #8 \n" // v10 = 4 6 8 10 | |||
| "fmla v7.4s, v8.4s, %14.s[0] \n" | |||
| "fmul v13.4s, v9.4s, %14.s[1] \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "fmul v14.4s, v12.4s, %14.s[2] \n" | |||
| "fmul v15.4s, v11.4s, %14.s[3] \n" | |||
| "fmla v7.4s, v10.4s, %15.s[0] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%3], #32 \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v10.4s, v11.4s}, [%3] \n" | |||
| "ext v12.16b, v8.16b, v10.16b, #4 \n" | |||
| "ext v11.16b, v9.16b, v11.16b, #4 \n" | |||
| "ext v10.16b, v8.16b, v10.16b, #8 \n" | |||
| "fmla v7.4s, v8.4s, %15.s[1] \n" | |||
| "fmla v13.4s, v9.4s, %15.s[2] \n" | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "fmla v14.4s, v12.4s, %15.s[3] \n" | |||
| "fmla v15.4s, v11.4s, %16.s[0] \n" | |||
| "fmla v7.4s, v10.4s, %16.s[1] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%4], #32 \n" | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld2 {v10.4s, v11.4s}, [%4] \n" | |||
| "ext v12.16b, v8.16b, v10.16b, #4 \n" | |||
| "ext v11.16b, v9.16b, v11.16b, #4 \n" | |||
| "ext v10.16b, v8.16b, v10.16b, #8 \n" | |||
| "fmla v7.4s, v8.4s, %16.s[2] \n" | |||
| "fmla v13.4s, v9.4s, %16.s[3] \n" | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "fmla v14.4s, v12.4s, %17.s[0] \n" | |||
| "fmla v15.4s, v11.4s, %17.s[1] \n" | |||
| "fmla v7.4s, v10.4s, %17.s[2] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%5], #32 \n" | |||
| "prfm pldl1keep, [%5, #256] \n" | |||
| "ld2 {v10.4s, v11.4s}, [%5] \n" | |||
| "ext v12.16b, v8.16b, v10.16b, #4 \n" | |||
| "ext v11.16b, v9.16b, v11.16b, #4 \n" | |||
| "ext v10.16b, v8.16b, v10.16b, #8 \n" | |||
| "fmla v7.4s, v8.4s, %17.s[3] \n" | |||
| "fmla v13.4s, v9.4s, %18.s[0] \n" | |||
| "prfm pldl1keep, [%6, #256] \n" | |||
| "fmla v14.4s, v12.4s, %18.s[1] \n" | |||
| "fmla v15.4s, v11.4s, %18.s[2] \n" | |||
| "fmla v7.4s, v10.4s, %18.s[3] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%6], #32 \n" | |||
| "prfm pldl1keep, [%6, #256] \n" | |||
| "ld2 {v10.4s, v11.4s}, [%6] \n" | |||
| "ext v12.16b, v8.16b, v10.16b, #4 \n" | |||
| "ext v11.16b, v9.16b, v11.16b, #4 \n" | |||
| "ext v10.16b, v8.16b, v10.16b, #8 \n" | |||
| "fmla v7.4s, v8.4s, %19.s[0] \n" | |||
| "fmla v13.4s, v9.4s, %19.s[1] \n" | |||
| "fmla v14.4s, v12.4s, %19.s[2] \n" | |||
| "fmla v15.4s, v11.4s, %19.s[3] \n" | |||
| "fmla v7.4s, v10.4s, %20.s[0] \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n" | |||
| "fadd v14.4s, v14.4s, v15.4s \n" | |||
| "fadd v7.4s, v7.4s, v13.4s \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "fadd v7.4s, v7.4s, v14.4s \n" | |||
| "ld2 {v10.4s, v11.4s}, [%2] \n" | |||
| "st1 {v7.4s}, [%1], #16 \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3), // %5 | |||
| "=r"(r4) // %6 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "6"(r4), | |||
| "w"(_k0123), // %14 | |||
| "w"(_k4567), // %15 | |||
| "w"(_k891011), // %16 | |||
| "w"(_k12131415), // %17 | |||
| "w"(_k16171819), // %18 | |||
| "w"(_k20212223), // %19 | |||
| "w"(_k24242424) // %20 | |||
| : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| @@ -75,13 +75,198 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _k0123 = vld1q_f32(k0); | |||
| float32x4_t _k4567 = vld1q_f32(k0 + 4); | |||
| float32x4_t _k78910 = vld1q_f32(k1); | |||
| float32x4_t _k11121314 = vld1q_f32(k1 + 4); | |||
| float32x4_t _k14151617 = vld1q_f32(k2); | |||
| float32x4_t _k18192021 = vld1q_f32(k2 + 4); | |||
| float32x4_t _k21222324 = vld1q_f32(k3); | |||
| float32x4_t _k25262728 = vld1q_f32(k3 + 4); | |||
| float32x4_t _k28293031 = vld1q_f32(k4); | |||
| float32x4_t _k32333435 = vld1q_f32(k4 + 4); | |||
| float32x4_t _k35363738 = vld1q_f32(k5); | |||
| float32x4_t _k39404142 = vld1q_f32(k5 + 4); | |||
| float32x4_t _k42434445 = vld1q_f32(k6); | |||
| float32x4_t _k46474849 = vld1q_f32(k6 + 4); | |||
| #ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__ | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| // v0: input / final output | |||
| // v1 v2 v3: = ri0 ri4 ri0n , i <- 1-7 | |||
| // v4 = ri1 / ri3 / ri6 | |||
| // v5 = ri2 / ri5 | |||
| // v9 = intermediate sum register | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| //i = 1 | |||
| "prfm pldl1keep, [%2, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%2] \n" | |||
| "add %2, %2, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" | |||
| "fmul v9.4s, v1.4s, %18.s[0] \n" | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" | |||
| "fmla v0.4s, v4.4s, %18.s[1] \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" | |||
| "fmla v9.4s, v5.4s, %18.s[2] \n" | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v4.4s, %18.s[3] \n" | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" | |||
| "fmla v9.4s, v2.4s, %19.s[0] \n" | |||
| "fmla v0.4s, v5.4s, %19.s[1] \n" | |||
| "fmla v9.4s, v4.4s, %19.s[2] \n" | |||
| //i = 2 | |||
| "prfm pldl1keep, [%3, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%3] \n" // v1 v2 v3: = r20 r24 r20n | |||
| "add %3, %3, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" // v4 = r21 | |||
| "fmla v9.4s, v1.4s, %20.s[0] \n" // *+ r10 | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" // v5 = r22 | |||
| "fmla v0.4s, v4.4s, %20.s[1] \n" // *+ r11 | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" // v4 = r23 | |||
| "fmla v9.4s, v5.4s, %20.s[2] \n" // *+ r1 | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" // v5 = r25 | |||
| "fmla v0.4s, v4.4s, %20.s[3] \n" // *+ r13 | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" // v4 = r26 | |||
| "fmla v9.4s, v2.4s, %21.s[0] \n" // *+ r14 | |||
| "fmla v0.4s, v5.4s, %21.s[1] \n" // *+ r15 | |||
| "fmla v9.4s, v4.4s, %21.s[2] \n" // *+ r16 | |||
| //i = 3 | |||
| "prfm pldl1keep, [%4, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%4] \n" | |||
| "add %4, %4, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" | |||
| "fmla v9.4s, v1.4s, %22.s[0] \n" | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" | |||
| "fmla v0.4s, v4.4s, %22.s[1] \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" | |||
| "fmla v9.4s, v5.4s, %22.s[2] \n" | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v4.4s, %22.s[3] \n" | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" | |||
| "fmla v9.4s, v2.4s, %23.s[0] \n" | |||
| "fmla v0.4s, v5.4s, %23.s[1] \n" | |||
| "fmla v9.4s, v4.4s, %23.s[2] \n" | |||
| //i = 4 | |||
| "prfm pldl1keep, [%5, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%5] \n" | |||
| "add %5, %5, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" | |||
| "fmla v9.4s, v1.4s, %24.s[0] \n" | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" | |||
| "fmla v0.4s, v4.4s, %24.s[1] \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" | |||
| "fmla v9.4s, v5.4s, %24.s[2] \n" | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v4.4s, %24.s[3] \n" | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" | |||
| "fmla v9.4s, v2.4s, %25.s[0] \n" | |||
| "fmla v0.4s, v5.4s, %25.s[1] \n" | |||
| "fmla v9.4s, v4.4s, %25.s[2] \n" | |||
| //i = 5 | |||
| "prfm pldl1keep, [%6, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%6] \n" | |||
| "add %6, %6, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" | |||
| "fmla v9.4s, v1.4s, %26.s[0] \n" | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" | |||
| "fmla v0.4s, v4.4s, %26.s[1] \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" | |||
| "fmla v9.4s, v5.4s, %26.s[2] \n" | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v4.4s, %26.s[3] \n" | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" | |||
| "fmla v9.4s, v2.4s, %27.s[0] \n" | |||
| "fmla v0.4s, v5.4s, %27.s[1] \n" | |||
| "fmla v9.4s, v4.4s, %27.s[2] \n" | |||
| //i = 6 | |||
| "prfm pldl1keep, [%7, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%7] \n" | |||
| "add %7, %7, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" | |||
| "fmla v9.4s, v1.4s, %28.s[0] \n" | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" | |||
| "fmla v0.4s, v4.4s, %28.s[1] \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" | |||
| "fmla v9.4s, v5.4s, %28.s[2] \n" | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v4.4s, %28.s[3] \n" | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" | |||
| "fmla v9.4s, v2.4s, %29.s[0] \n" | |||
| "fmla v0.4s, v5.4s, %29.s[1] \n" | |||
| "fmla v9.4s, v4.4s, %29.s[2] \n" | |||
| //i = 7 | |||
| "prfm pldl1keep, [%8, #384] \n" | |||
| "ld1 {v1.4s, v2.4s, v3.4s}, [%8] \n" | |||
| "add %8, %8, #16 \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #4 \n" | |||
| "fmla v9.4s, v1.4s, %30.s[0] \n" | |||
| "ext v5.16b, v1.16b, v2.16b, #8 \n" | |||
| "fmla v0.4s, v4.4s, %30.s[1] \n" | |||
| "ext v4.16b, v1.16b, v2.16b, #12 \n" | |||
| "fmla v9.4s, v5.4s, %30.s[2] \n" | |||
| "ext v5.16b, v2.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v4.4s, %30.s[3] \n" | |||
| "ext v4.16b, v2.16b, v3.16b, #8 \n" | |||
| "fmla v9.4s, v2.4s, %31.s[0] \n" | |||
| "fmla v0.4s, v5.4s, %31.s[1] \n" | |||
| "fmla v9.4s, v4.4s, %31.s[2] \n" | |||
| "fadd v0.4s, v0.4s, v9.4s \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3), // %5 | |||
| "=r"(r4), // %6 | |||
| "=r"(r5), // %7 | |||
| "=r"(r6) // %8 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "6"(r4), | |||
| "7"(r5), | |||
| "8"(r6), | |||
| "w"(_k0123), // %18 | |||
| "w"(_k4567), // %19 | |||
| "w"(_k78910), // %20 | |||
| "w"(_k11121314), // %21 | |||
| "w"(_k14151617), // %22 | |||
| "w"(_k18192021), // %23 | |||
| "w"(_k21222324), // %24 | |||
| "w"(_k25262728), // %25 | |||
| "w"(_k28293031), // %26 | |||
| "w"(_k32333435), // %27 | |||
| "w"(_k35363738), // %28 | |||
| "w"(_k39404142), // %29 | |||
| "w"(_k42434445), // %30 | |||
| "w"(_k46474849) // %31 | |||
| : "cc", "memory","v0", "v1", "v2", "v3", "v4", "v5", "v9" | |||
| ); | |||
| } | |||
| #else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined | |||
| // When compiled with gcc, gcc does not accept over 30 operands | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4_t _k0123 = vld1q_f32(k0); | |||
| float32x4_t _k4567 = vld1q_f32(k0 + 4); | |||
| float32x4_t _r00 = vld1q_f32(r0);// 0 1 2 3 | |||
| float32x4_t _r04 = vld1q_f32(r0 + 4);// 4 5 6 7 | |||
| float32x4_t _r00n = vld1q_f32(r0 + 8);// 8 9 10 11 | |||
| @@ -99,9 +284,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2); | |||
| float32x4_t _k78910 = vld1q_f32(k1); | |||
| float32x4_t _k11121314 = vld1q_f32(k1 + 4); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r14 = vld1q_f32(r1 + 4); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 8); | |||
| @@ -119,9 +301,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2); | |||
| float32x4_t _k14151617 = vld1q_f32(k2); | |||
| float32x4_t _k18192021 = vld1q_f32(k2 + 4); | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r24 = vld1q_f32(r2 + 4); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 8); | |||
| @@ -139,9 +318,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2); | |||
| float32x4_t _k21222324 = vld1q_f32(k3); | |||
| float32x4_t _k25262728 = vld1q_f32(k3 + 4); | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r34 = vld1q_f32(r3 + 4); | |||
| float32x4_t _r30n = vld1q_f32(r3 + 8); | |||
| @@ -159,9 +335,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2); | |||
| float32x4_t _k28293031 = vld1q_f32(k4); | |||
| float32x4_t _k32333435 = vld1q_f32(k4 + 4); | |||
| float32x4_t _r40 = vld1q_f32(r4); | |||
| float32x4_t _r44 = vld1q_f32(r4 + 4); | |||
| float32x4_t _r40n = vld1q_f32(r4 + 8); | |||
| @@ -179,9 +352,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2); | |||
| float32x4_t _k35363738 = vld1q_f32(k5); | |||
| float32x4_t _k39404142 = vld1q_f32(k5 + 4); | |||
| float32x4_t _r50 = vld1q_f32(r5); | |||
| float32x4_t _r54 = vld1q_f32(r5 + 4); | |||
| float32x4_t _r50n = vld1q_f32(r5 + 8); | |||
| @@ -199,9 +369,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2); | |||
| float32x4_t _k42434445 = vld1q_f32(k6); | |||
| float32x4_t _k46474849 = vld1q_f32(k6 + 4); | |||
| float32x4_t _r60 = vld1q_f32(r6); | |||
| float32x4_t _r64 = vld1q_f32(r6 + 4); | |||
| float32x4_t _r60n = vld1q_f32(r6 + 8); | |||
| @@ -230,7 +397,8 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| r6 += 4; | |||
| outptr += 4; | |||
| } | |||
| #else | |||
| #endif // __clang__ | |||
| #else //__aarch32__ | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| @@ -599,13 +767,205 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _k0123 = vld1q_f32(k0); | |||
| float32x4_t _k4567 = vld1q_f32(k0 + 4); | |||
| float32x4_t _k78910 = vld1q_f32(k1); | |||
| float32x4_t _k11121314 = vld1q_f32(k1 + 4); | |||
| float32x4_t _k14151617 = vld1q_f32(k2); | |||
| float32x4_t _k18192021 = vld1q_f32(k2 + 4); | |||
| float32x4_t _k21222324 = vld1q_f32(k3); | |||
| float32x4_t _k25262728 = vld1q_f32(k3 + 4); | |||
| float32x4_t _k28293031 = vld1q_f32(k4); | |||
| float32x4_t _k32333435 = vld1q_f32(k4 + 4); | |||
| float32x4_t _k35363738 = vld1q_f32(k5); | |||
| float32x4_t _k39404142 = vld1q_f32(k5 + 4); | |||
| float32x4_t _k42434445 = vld1q_f32(k6); | |||
| float32x4_t _k46474849 = vld1q_f32(k6 + 4); | |||
| #ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__ | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| // v0: input / final output | |||
| // v1 v2: = _ri0/_ri1 first | |||
| // v3 v4: = then _r0_8101214/_r0_9111315 | |||
| // v5 = ri2 / ri4 / ri6 | |||
| // v6 = ri3 / ri5 | |||
| // v9 = intermediate sum register | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| //i = 1 | |||
| "prfm pldl1keep, [%2, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%2] \n" // v1 v2 = _r00 _r01 | |||
| "add %2, %2, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%2] \n" // v3 v4 = _r0_8101214 / _r0_9111315 | |||
| "fmul v9.4s, v1.4s, %18.s[0] \n" // *+ _r00 | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" // v5 = _r02 | |||
| "fmla v0.4s, v2.4s, %18.s[1] \n" // *+ _r01 | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" // v6 = _r03 | |||
| "fmla v9.4s, v5.4s, %18.s[2] \n" // *+ _r02 | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" // v5 = _r04 | |||
| "fmla v0.4s, v6.4s, %18.s[3] \n" // *+ _r03 | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" // v6 = _r05 | |||
| "fmla v9.4s, v5.4s, %19.s[0] \n" // *+ _r04 | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" // v5 = _r06 | |||
| "fmla v0.4s, v6.4s, %19.s[1] \n" // *+ _r05 | |||
| "fmla v9.4s, v5.4s, %19.s[2] \n" // *+ _r06 | |||
| //i = 2 | |||
| "prfm pldl1keep, [%3, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%3] \n" | |||
| "add %3, %3, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%3] \n" | |||
| "fmla v9.4s, v1.4s, %20.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v2.4s, %20.s[1] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" | |||
| "fmla v9.4s, v5.4s, %20.s[2] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" | |||
| "fmla v0.4s, v6.4s, %20.s[3] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" | |||
| "fmla v9.4s, v5.4s, %21.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" | |||
| "fmla v0.4s, v6.4s, %21.s[1] \n" | |||
| "fmla v9.4s, v5.4s, %21.s[2] \n" | |||
| //i = 3 | |||
| "prfm pldl1keep, [%4, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%4] \n" | |||
| "add %4, %4, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%4] \n" | |||
| "fmla v9.4s, v1.4s, %22.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v2.4s, %22.s[1] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" | |||
| "fmla v9.4s, v5.4s, %22.s[2] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" | |||
| "fmla v0.4s, v6.4s, %22.s[3] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" | |||
| "fmla v9.4s, v5.4s, %23.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" | |||
| "fmla v0.4s, v6.4s, %23.s[1] \n" | |||
| "fmla v9.4s, v5.4s, %23.s[2] \n" | |||
| //i = 4 | |||
| "prfm pldl1keep, [%5, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%5] \n" | |||
| "add %5, %5, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%5] \n" | |||
| "fmla v9.4s, v1.4s, %24.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v2.4s, %24.s[1] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" | |||
| "fmla v9.4s, v5.4s, %24.s[2] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" | |||
| "fmla v0.4s, v6.4s, %24.s[3] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" | |||
| "fmla v9.4s, v5.4s, %25.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" | |||
| "fmla v0.4s, v6.4s, %25.s[1] \n" | |||
| "fmla v9.4s, v5.4s, %25.s[2] \n" | |||
| //i = 5 | |||
| "prfm pldl1keep, [%6, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%6] \n" | |||
| "add %6, %6, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%6] \n" | |||
| "fmla v9.4s, v1.4s, %26.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v2.4s, %26.s[1] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" | |||
| "fmla v9.4s, v5.4s, %26.s[2] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" | |||
| "fmla v0.4s, v6.4s, %26.s[3] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" | |||
| "fmla v9.4s, v5.4s, %27.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" | |||
| "fmla v0.4s, v6.4s, %27.s[1] \n" | |||
| "fmla v9.4s, v5.4s, %27.s[2] \n" | |||
| //i = 6 | |||
| "prfm pldl1keep, [%7, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%7] \n" | |||
| "add %7, %7, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%7] \n" | |||
| "fmla v9.4s, v1.4s, %28.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v2.4s, %28.s[1] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" | |||
| "fmla v9.4s, v5.4s, %28.s[2] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" | |||
| "fmla v0.4s, v6.4s, %28.s[3] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" | |||
| "fmla v9.4s, v5.4s, %29.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" | |||
| "fmla v0.4s, v6.4s, %29.s[1] \n" | |||
| "fmla v9.4s, v5.4s, %29.s[2] \n" | |||
| //i = 7 | |||
| "prfm pldl1keep, [%8, #512] \n" | |||
| "ld2 {v1.4s, v2.4s}, [%8] \n" | |||
| "add %8, %8, #32 \n" | |||
| "ld2 {v3.4s, v4.4s}, [%8] \n" | |||
| "fmla v9.4s, v1.4s, %30.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #4 \n" | |||
| "fmla v0.4s, v2.4s, %30.s[1] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #4 \n" | |||
| "fmla v9.4s, v5.4s, %30.s[2] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #8 \n" | |||
| "fmla v0.4s, v6.4s, %30.s[3] \n" | |||
| "ext v6.16b, v2.16b, v4.16b, #8 \n" | |||
| "fmla v9.4s, v5.4s, %31.s[0] \n" | |||
| "ext v5.16b, v1.16b, v3.16b, #12 \n" | |||
| "fmla v0.4s, v6.4s, %31.s[1] \n" | |||
| "fmla v9.4s, v5.4s, %31.s[2] \n" | |||
| "fadd v0.4s, v0.4s, v9.4s \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2), // %4 | |||
| "=r"(r3), // %5 | |||
| "=r"(r4), // %6 | |||
| "=r"(r5), // %7 | |||
| "=r"(r6) // %8 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "5"(r3), | |||
| "6"(r4), | |||
| "7"(r5), | |||
| "8"(r6), | |||
| "w"(_k0123), // %18 | |||
| "w"(_k4567), // %19 | |||
| "w"(_k78910), // %20 | |||
| "w"(_k11121314), // %21 | |||
| "w"(_k14151617), // %22 | |||
| "w"(_k18192021), // %23 | |||
| "w"(_k21222324), // %24 | |||
| "w"(_k25262728), // %25 | |||
| "w"(_k28293031), // %26 | |||
| "w"(_k32333435), // %27 | |||
| "w"(_k35363738), // %28 | |||
| "w"(_k39404142), // %29 | |||
| "w"(_k42434445), // %30 | |||
| "w"(_k46474849) // %31 | |||
| : "cc", "memory","v0", "v1", "v2", "v3", "v4", "v5", "v6", "v9" | |||
| ); | |||
| } | |||
| #else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined | |||
| // When compiled with gcc, gcc does not accept over 30 operands | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _sum = vld1q_f32(outptr); | |||
| float32x4_t _k0123 = vld1q_f32(k0); | |||
| float32x4_t _k4567 = vld1q_f32(k0 + 4); | |||
| float32x4x2_t _r00_02461357 = vld2q_f32(r0); | |||
| float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8); | |||
| float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14 | |||
| @@ -626,9 +986,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2); | |||
| float32x4_t _k78910 = vld1q_f32(k1); | |||
| float32x4_t _k11121314 = vld1q_f32(k1 + 4); | |||
| float32x4x2_t _r10_02461357 = vld2q_f32(r1); | |||
| float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8); | |||
| float32x4_t _r1_8101214 = _r10nx2.val[0]; | |||
| @@ -649,9 +1006,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2); | |||
| float32x4_t _k14151617 = vld1q_f32(k2); | |||
| float32x4_t _k18192021 = vld1q_f32(k2 + 4); | |||
| float32x4x2_t _r20_02461357 = vld2q_f32(r2); | |||
| float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8); | |||
| float32x4_t _r2_8101214 = _r20nx2.val[0]; | |||
| @@ -672,9 +1026,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2); | |||
| float32x4_t _k21222324 = vld1q_f32(k3); | |||
| float32x4_t _k25262728 = vld1q_f32(k3 + 4); | |||
| float32x4x2_t _r30_02461357 = vld2q_f32(r3); | |||
| float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8); | |||
| float32x4_t _r3_8101214 = _r30nx2.val[0]; | |||
| @@ -695,9 +1046,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2); | |||
| float32x4_t _k28293031 = vld1q_f32(k4); | |||
| float32x4_t _k32333435 = vld1q_f32(k4 + 4); | |||
| float32x4x2_t _r40_02461357 = vld2q_f32(r4); | |||
| float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8); | |||
| float32x4_t _r4_8101214 = _r40nx2.val[0]; | |||
| @@ -718,9 +1066,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2); | |||
| float32x4_t _k35363738 = vld1q_f32(k5); | |||
| float32x4_t _k39404142 = vld1q_f32(k5 + 4); | |||
| float32x4x2_t _r50_02461357 = vld2q_f32(r5); | |||
| float32x4x2_t _r50nx2 = vld2q_f32(r5 + 8); | |||
| float32x4_t _r5_8101214 = _r50nx2.val[0]; | |||
| @@ -741,9 +1086,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1); | |||
| _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2); | |||
| float32x4_t _k42434445 = vld1q_f32(k6); | |||
| float32x4_t _k46474849 = vld1q_f32(k6 + 4); | |||
| float32x4x2_t _r60_02461357 = vld2q_f32(r6); | |||
| float32x4x2_t _r60nx2 = vld2q_f32(r6 + 8); | |||
| float32x4_t _r6_8101214 = _r60nx2.val[0]; | |||
| @@ -775,6 +1117,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke | |||
| r6 += 8; | |||
| outptr += 4; | |||
| } | |||
| #endif // __clang__ | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| @@ -77,60 +77,111 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r00n = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%3, #192] \n" | |||
| "ld1 {v9.4s, v10.4s}, [%3] \n" //r0 | |||
| "add %3, %3, #16 \n" | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); | |||
| "0: \n" | |||
| float32x4_t _r30 = vld1q_f32(r3); | |||
| float32x4_t _r30n = vld1q_f32(r3 + 4); | |||
| float32x4_t _r31 = vextq_f32(_r30, _r30n, 1); | |||
| float32x4_t _r32 = vextq_f32(_r30, _r30n, 2); | |||
| float32x4_t _sum1 = vmulq_laneq_f32(_r00, _k012x, 0); | |||
| float32x4_t _sum2 = vfmaq_laneq_f32(_bias0, _r01, _k012x, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k012x, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k345x, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k345x, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k345x, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k678x, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k678x, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k678x, 2); | |||
| float32x4_t _sum3 = vmulq_laneq_f32(_r10, _k012x, 0); | |||
| float32x4_t _sum4 = vfmaq_laneq_f32(_bias0, _r11, _k012x, 1); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r12, _k012x, 2); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r20, _k345x, 0); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r21, _k345x, 1); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r22, _k345x, 2); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r30, _k678x, 0); | |||
| _sum4 = vfmaq_laneq_f32(_sum4, _r31, _k678x, 1); | |||
| _sum3 = vfmaq_laneq_f32(_sum3, _r32, _k678x, 2); | |||
| _sum1 = vaddq_f32(_sum1, _sum2); | |||
| _sum3 = vaddq_f32(_sum3, _sum4); | |||
| vst1q_f32(outptr, _sum1); | |||
| vst1q_f32(outptr2, _sum3); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| r3 += 4; | |||
| outptr += 4; | |||
| outptr2 += 4; | |||
| "fmul v7.4s, v9.4s, %14.s[0] \n" | |||
| "and v13.16b, %17.16b, %17.16b \n" // v13 = _bias0 | |||
| "fmul v6.4s, v11.4s, %14.s[1] \n" | |||
| "fmla v13.4s, v12.4s, %14.s[2] \n" | |||
| "prfm pldl1keep, [%4, #192] \n" | |||
| "ld1 {v9.4s, v10.4s}, [%4] \n" | |||
| "add %4, %4, #16 \n" | |||
| "fmla v7.4s, v9.4s, %15.s[0] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" | |||
| "fmla v6.4s, v11.4s, %15.s[1] \n" | |||
| "fmla v13.4s, v12.4s, %15.s[2] \n" | |||
| "fmul v8.4s, v9.4s, %14.s[0] \n" | |||
| "and v15.16b, %17.16b, %17.16b \n" // v15 = _bias0 | |||
| "fmul v14.4s, v11.4s, %14.s[1] \n" | |||
| "fmla v15.4s, v12.4s, %14.s[2] \n" | |||
| "prfm pldl1keep, [%5, #192] \n" | |||
| "ld1 {v9.4s, v10.4s}, [%5] \n" | |||
| "add %5, %5, #16 \n" | |||
| "fmla v7.4s, v9.4s, %16.s[0] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" | |||
| "fmla v6.4s, v11.4s, %16.s[1] \n" | |||
| "fmla v13.4s, v12.4s, %16.s[2] \n" | |||
| "fmla v8.4s, v9.4s, %15.s[0] \n" | |||
| "fmla v14.4s, v11.4s, %15.s[1] \n" | |||
| "fmla v15.4s, v12.4s, %15.s[2] \n" | |||
| "prfm pldl1keep, [%6, #192] \n" | |||
| "ld1 {v9.4s, v10.4s}, [%6] \n" | |||
| "add %6, %6, #16 \n" | |||
| "fmla v8.4s, v9.4s, %16.s[0] \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" | |||
| "fmla v14.4s, v11.4s, %16.s[1] \n" | |||
| "fmla v15.4s, v12.4s, %16.s[2] \n" | |||
| "fadd v7.4s, v7.4s, v6.4s \n" | |||
| "prfm pldl1keep, [%3, #192] \n" | |||
| "ld1 {v9.4s, v10.4s}, [%3] \n" //ro, for next loop | |||
| "fadd v8.4s, v8.4s, v14.4s \n" | |||
| "fadd v7.4s, v7.4s, v13.4s \n" | |||
| "fadd v8.4s, v8.4s, v15.4s \n" | |||
| "ext v11.16b, v9.16b, v10.16b, #4 \n" // for next loop | |||
| "ext v12.16b, v9.16b, v10.16b, #8 \n" // for next loop | |||
| "add %3, %3, #16 \n" | |||
| "st1 {v7.4s}, [%1], #16 \n" | |||
| "st1 {v8.4s}, [%2], #16 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %3, %3, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(outptr2), // %2 | |||
| "=r"(r0), // %3 | |||
| "=r"(r1), // %4 | |||
| "=r"(r2), // %5 | |||
| "=r"(r3) // %6 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(outptr2), | |||
| "3"(r0), | |||
| "4"(r1), | |||
| "5"(r2), | |||
| "6"(r3), | |||
| "w"(_k012x), // %14 | |||
| "w"(_k345x), // %15 | |||
| "w"(_k678x), // %16 | |||
| "w"(_bias0) // %17 | |||
| : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -326,41 +377,80 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r00n = vld1q_f32(r0 + 4); | |||
| float32x4_t _r01 = vextq_f32(_r00, _r00n, 1); | |||
| float32x4_t _r02 = vextq_f32(_r00, _r00n, 2); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #192] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%2] \n" //r0 | |||
| "add %2, %2, #16 \n" | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r10n = vld1q_f32(r1 + 4); | |||
| float32x4_t _r11 = vextq_f32(_r10, _r10n, 1); | |||
| float32x4_t _r12 = vextq_f32(_r10, _r10n, 2); | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" | |||
| float32x4_t _r20 = vld1q_f32(r2); | |||
| float32x4_t _r20n = vld1q_f32(r2 + 4); | |||
| float32x4_t _r21 = vextq_f32(_r20, _r20n, 1); | |||
| float32x4_t _r22 = vextq_f32(_r20, _r20n, 2); | |||
| float32x4_t _sum1 = vmulq_laneq_f32(_r00, _k012x, 0); | |||
| float32x4_t _sum2 = vfmaq_laneq_f32(_bias0, _r01, _k012x, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r02, _k012x, 2); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k345x, 0); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r11, _k345x, 1); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k345x, 2); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r20, _k678x, 0); | |||
| _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k678x, 1); | |||
| _sum1 = vfmaq_laneq_f32(_sum1, _r22, _k678x, 2); | |||
| _sum1 = vaddq_f32(_sum1, _sum2); | |||
| vst1q_f32(outptr, _sum1); | |||
| r0 += 4; | |||
| r1 += 4; | |||
| r2 += 4; | |||
| outptr += 4; | |||
| "0: \n" | |||
| "fmul v7.4s, v8.4s, %10.s[0] \n" | |||
| "and v14.16b, %13.16b, %13.16b \n" // v14 = _bias0 | |||
| "fmul v13.4s, v10.4s, %10.s[1] \n" | |||
| "fmla v14.4s, v11.4s, %10.s[2] \n" | |||
| "prfm pldl1keep, [%3, #192] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%3] \n" //r1 | |||
| "add %3, %3, #16 \n" | |||
| "fmla v7.4s, v8.4s, %11.s[0] \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" | |||
| "fmla v13.4s, v10.4s, %11.s[1] \n" | |||
| "fmla v14.4s, v11.4s, %11.s[2] \n" | |||
| "prfm pldl1keep, [%4, #192] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%4] \n" //r2 | |||
| "add %4, %4, #16 \n" | |||
| "fmla v7.4s, v8.4s, %12.s[0] \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" | |||
| "fmla v13.4s, v10.4s, %12.s[1] \n" | |||
| "fmla v14.4s, v11.4s, %12.s[2] \n" | |||
| "prfm pldl1keep, [%2, #192] \n" | |||
| "ld1 {v8.4s, v9.4s}, [%2] \n" //r0, for next loop | |||
| "add %2, %2, #16 \n" | |||
| "fadd v7.4s, v7.4s, v13.4s \n" | |||
| "fadd v7.4s, v7.4s, v14.4s \n" | |||
| "ext v10.16b, v8.16b, v9.16b, #4 \n" // for next loop | |||
| "ext v11.16b, v8.16b, v9.16b, #8 \n" // for next loop | |||
| "st1 {v7.4s}, [%1], #16 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #16 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2) // %4 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "w"(_k012x), // %10 | |||
| "w"(_k345x), // %11 | |||
| "w"(_k678x), // %12 | |||
| "w"(_bias0) // %13 | |||
| : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -547,47 +637,76 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4x2_t _r0 = vld2q_f32(r0); | |||
| float32x4x2_t _r0n = vld2q_f32(r0+8); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "and v11.16b, %13.16b, %13.16b \n" // v11 = _bias0 | |||
| float32x4_t _r00 = _r0.val[0];// 0 2 4 6 | |||
| float32x4_t _r01 = _r0.val[1];// 1 3 5 7 | |||
| float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8 | |||
| "0: \n" | |||
| "fmul v0.4s, v2.4s, %10.s[0] \n" | |||
| "fmul v10.4s, v3.4s, %10.s[1] \n" | |||
| float32x4_t _outp = vfmaq_laneq_f32(_bias0, _r00, _k012x, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r01, _k012x, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r02, _k012x, 2); | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2] \n" | |||
| "ext v1.16b, v2.16b, v8.16b, #4 \n" | |||
| float32x4x2_t _r1 = vld2q_f32(r1); | |||
| float32x4x2_t _r1n = vld2q_f32(r1+8); | |||
| "fmla v11.4s, v1.4s, %10.s[2] \n" | |||
| float32x4_t _r10 = _r1.val[0]; | |||
| float32x4_t _r11 = _r1.val[1]; | |||
| float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%3], #32 \n" | |||
| _outp = vfmaq_laneq_f32(_outp, _r10, _k345x, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r11, _k345x, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r12, _k345x, 2); | |||
| "fmla v0.4s, v2.4s, %11.s[0] \n" | |||
| "fmla v10.4s, v3.4s, %11.s[1] \n" | |||
| float32x4x2_t _r2 = vld2q_f32(r2); | |||
| float32x4x2_t _r2n = vld2q_f32(r2+8); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%3] \n" | |||
| "ext v1.16b, v2.16b, v8.16b, #4 \n" | |||
| float32x4_t _r20 = _r2.val[0]; | |||
| float32x4_t _r21 = _r2.val[1]; | |||
| float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1); | |||
| "fmla v11.4s, v1.4s, %11.s[2] \n" | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%4], #32 \n" | |||
| _outp = vfmaq_laneq_f32(_outp, _r20, _k678x, 0); | |||
| _outp = vfmaq_laneq_f32(_outp, _r21, _k678x, 1); | |||
| _outp = vfmaq_laneq_f32(_outp, _r22, _k678x, 2); | |||
| "fmla v0.4s, v2.4s, %12.s[0] \n" | |||
| "fmla v10.4s, v3.4s, %12.s[1] \n" | |||
| vst1q_f32(outptr, _outp); | |||
| "prfm pldl1keep, [%4, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%4] \n" | |||
| "ext v1.16b, v2.16b, v8.16b, #4 \n" | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| outptr += 4; | |||
| "fmla v11.4s, v1.4s, %12.s[2] \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "fadd v0.4s, v0.4s, v10.4s \n" | |||
| "fadd v0.4s, v0.4s, v11.4s \n" | |||
| "and v11.16b, %13.16b, %13.16b \n" // v11 = _bias0 | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| "sub %2, %2, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(outptr), // %1 | |||
| "=r"(r0), // %2 | |||
| "=r"(r1), // %3 | |||
| "=r"(r2) // %4 | |||
| : "0"(nn), | |||
| "1"(outptr), | |||
| "2"(r0), | |||
| "3"(r1), | |||
| "4"(r2), | |||
| "w"(_k012x), // %10 | |||
| "w"(_k345x), // %11 | |||
| "w"(_k678x), // %12 | |||
| "w"(_bias0) // %13 | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -55,16 +55,28 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vmulq_f32(_ptr, _ptr1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2], #16 \n" | |||
| "fmul v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%3], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -120,15 +132,26 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vmulq_f32(_ptr, _p); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2] \n" | |||
| "fmul v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%2], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -186,16 +209,28 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vaddq_f32(_ptr, _ptr1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2], #16 \n" | |||
| "fadd v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%3], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -251,15 +286,26 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vaddq_f32(_ptr, _p); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2] \n" | |||
| "fadd v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%2], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -321,17 +367,31 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| float32x4_t _coeff0 = vdupq_n_f32(coeff0); | |||
| float32x4_t _coeff1 = vdupq_n_f32(coeff1); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vmulq_f32(_ptr, _coeff0); | |||
| _p = vmlaq_f32(_p, _ptr1, _coeff1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2], #16 \n" | |||
| "fmul v0.4s, v0.4s, %8.4s \n" | |||
| "fmla v0.4s, v1.4s, %9.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%3], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr), | |||
| "w"(_coeff0), // %8 | |||
| "w"(_coeff1) // %9 | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -392,15 +452,27 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| float32x4_t _coeff = vdupq_n_f32(coeff); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vmlaq_f32(_p, _ptr, _coeff); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2] \n" | |||
| "fmla v1.4s, v0.4s, %6.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v1.4s}, [%2], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr), | |||
| "w"(_coeff) // %6 | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -458,16 +530,28 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _ptr1 = vld1q_f32(ptr1); | |||
| float32x4_t _p = vmaxq_f32(_ptr, _ptr1); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| ptr1 += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2], #16 \n" | |||
| "fmax v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%3], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(ptr1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(ptr1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -523,15 +607,26 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| float32x4_t _p = vld1q_f32(outptr); | |||
| _p = vmaxq_f32(_ptr, _p); | |||
| vst1q_f32(outptr, _p); | |||
| ptr += 4; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "prfm pldl1keep, [%2, #128] \n" | |||
| "ld1 {v0.4s}, [%1], #16 \n" | |||
| "ld1 {v1.4s}, [%2] \n" | |||
| "fmax v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%2], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -172,18 +172,30 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _m = vld1q_f32(m); | |||
| float32x4_t _w = vld1q_f32(w); | |||
| _sum = vfmaq_f32(_sum, _m, _w); | |||
| _m = vld1q_f32(m + 4); | |||
| _w = vld1q_f32(w + 4); | |||
| _sum2 = vfmaq_f32(_sum2, _m, _w); | |||
| m += 8; | |||
| w += 8; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "fmla %3.4s, v0.4s, v2.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "fmla %4.4s, v1.4s, v3.4s \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(m), // %1 | |||
| "=r"(w), // %2 | |||
| "=w"(_sum), // %3 | |||
| "=w"(_sum2) // %4 | |||
| : "0"(nn), | |||
| "1"(m), | |||
| "2"(w), | |||
| "3"(_sum), | |||
| "4"(_sum2) | |||
| : "cc", "memory", "v0", "v1", "v2", "v3" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -46,23 +46,30 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _r00 = vld1q_f32(r0); | |||
| float32x4_t _r10 = vld1q_f32(r1); | |||
| float32x4_t _r01 = vld1q_f32(r0 + 4); | |||
| float32x4_t _r11 = vld1q_f32(r1 + 4); | |||
| float32x4_t _max0 = vmaxq_f32(_r00, _r10); | |||
| float32x4_t _max1 = vmaxq_f32(_r01, _r11); | |||
| float32x4_t _max = vpmaxq_f32(_max0, _max1); | |||
| vst1q_f32(outptr, _max); | |||
| r0 += 8; | |||
| r1 += 8; | |||
| outptr += 4; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "ld1 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "fmax v0.4s, v0.4s, v2.4s \n" | |||
| "fmax v1.4s, v1.4s, v3.4s \n" | |||
| "fmaxp v2.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v2.4s}, [%3], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(r0), // %1 | |||
| "=r"(r1), // %2 | |||
| "=r"(outptr) // %3 | |||
| : "0"(nn), | |||
| "1"(r0), | |||
| "2"(r1), | |||
| "3"(outptr) | |||
| : "cc", "memory", "v0", "v1", "v2", "v3" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -47,39 +47,68 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob) | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4x2_t _r0 = vld2q_f32(r0); | |||
| float32x4x2_t _r1 = vld2q_f32(r1); | |||
| float32x4x2_t _r2 = vld2q_f32(r2); | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4x2_t _r0n = vld2q_f32(r0+8); | |||
| float32x4x2_t _r1n = vld2q_f32(r1+8); | |||
| float32x4x2_t _r2n = vld2q_f32(r2+8); | |||
| asm volatile( | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld2 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v2.4s, v3.4s}, [%2], #32 \n" | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v4.4s, v5.4s}, [%3], #32 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld2 {v6.4s, v7.4s}, [%1], #32 \n" | |||
| "fmax v12.4s, v0.4s, v1.4s \n" | |||
| "fmax v13.4s, v2.4s, v3.4s \n" | |||
| "prfm pldl1keep, [%2, #256] \n" | |||
| "ld2 {v8.4s, v9.4s}, [%2], #32 \n" | |||
| float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]); | |||
| float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]); | |||
| float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]); | |||
| "fmax v14.4s, v4.4s, v5.4s \n" | |||
| "ext v0.16b, v0.16b, v6.16b, #4 \n" | |||
| float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1); | |||
| float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1); | |||
| float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1); | |||
| "prfm pldl1keep, [%3, #256] \n" | |||
| "ld2 {v10.4s, v11.4s}, [%3], #32 \n" | |||
| _max0 = vmaxq_f32(_max0, _r02); | |||
| _max1 = vmaxq_f32(_max1, _r12); | |||
| _max2 = vmaxq_f32(_max2, _r22); | |||
| "ext v2.16b, v2.16b, v8.16b, #4 \n" | |||
| float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2); | |||
| "fmax v12.4s, v12.4s, v0.4s \n" | |||
| "ext v4.16b, v4.16b, v10.16b, #4 \n" | |||
| vst1q_f32(outptr, _max); | |||
| "fmax v13.4s, v13.4s, v2.4s \n" | |||
| "fmax v14.4s, v14.4s, v4.4s \n" | |||
| "fmax v12.4s, v12.4s, v13.4s \n" | |||
| _r0 = _r0n; | |||
| _r1 = _r1n; | |||
| _r2 = _r2n; | |||
| "orr v0.16b, v6.16b, v6.16b \n" | |||
| "orr v1.16b, v7.16b, v7.16b \n" | |||
| "fmax v12.4s, v12.4s, v14.4s \n" | |||
| r0 += 8; | |||
| r1 += 8; | |||
| r2 += 8; | |||
| outptr += 4; | |||
| "orr v2.16b, v8.16b, v8.16b \n" | |||
| "orr v3.16b, v9.16b, v9.16b \n" | |||
| "orr v4.16b, v10.16b, v10.16b \n" | |||
| "orr v5.16b, v11.16b, v11.16b \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v12.4s}, [%4], #16 \n" | |||
| "bne 0b \n" | |||
| "sub %1, %1, #32 \n" | |||
| "sub %2, %2, #32 \n" | |||
| "sub %3, %3, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(r0), // %1 | |||
| "=r"(r1), // %2 | |||
| "=r"(r2), // %3 | |||
| "=r"(outptr) // %4 | |||
| : "0"(nn), | |||
| "1"(r0), | |||
| "2"(r1), | |||
| "3"(r2), | |||
| "4"(outptr) | |||
| : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -57,15 +57,23 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| float32x4_t _p2 = vld1q_f32(ptr+4); | |||
| vst1q_f32(outptr, _p); | |||
| vst1q_f32(outptr+4, _p2); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| asm volatile( | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%1], #32 \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s, v1.4s}, [%2], #32 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr), // %1 | |||
| "=r"(outptr) // %2 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "2"(outptr) | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -44,13 +44,24 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _mean = vdupq_n_f32(mean); | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| _ptr = vsubq_f32(_ptr, _mean); | |||
| vst1q_f32(ptr, _ptr); | |||
| ptr += 4; | |||
| asm volatile( | |||
| "dup v1.4s, %w4 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| "fsub v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "r"(mean) // %4 | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -99,13 +110,24 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _norm = vdupq_n_f32(norm); | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| _ptr = vmulq_f32(_ptr, _norm); | |||
| vst1q_f32(ptr, _ptr); | |||
| ptr += 4; | |||
| asm volatile( | |||
| "dup v1.4s, %w4 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| "fmul v0.4s, v0.4s, v1.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "r"(norm) // %4 | |||
| : "cc", "memory", "v0", "v1" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -155,15 +177,27 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| float32x4_t _mean = vdupq_n_f32(mean); | |||
| float32x4_t _norm = vdupq_n_f32(norm); | |||
| for (; nn>0; nn--) | |||
| { | |||
| float32x4_t _ptr = vld1q_f32(ptr); | |||
| _ptr = vsubq_f32(_ptr, _mean); | |||
| _ptr = vmulq_f32(_ptr, _norm); | |||
| vst1q_f32(ptr, _ptr); | |||
| ptr += 4; | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "dup v1.4s, %w4 \n" | |||
| "dup v2.4s, %w5 \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v0.4s}, [%1] \n" | |||
| "fsub v0.4s, v0.4s, v1.4s \n" | |||
| "fmul v0.4s, v0.4s, v2.4s \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {v0.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "r"(mean), // %4 | |||
| "r"(norm) // %5 | |||
| : "cc", "memory", "v0", "v1", "v2" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| @@ -343,10 +343,20 @@ inline void Mat::fill(float _v) | |||
| #if __ARM_NEON | |||
| float32x4_t _c = vdupq_n_f32(_v); | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| if (nn > 0) | |||
| { | |||
| vst1q_f32(ptr, _c); | |||
| ptr += 4; | |||
| asm volatile ( | |||
| "0: \n" | |||
| "subs %w0, %w0, #1 \n" | |||
| "st1 {%4.4s}, [%1], #16 \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr), | |||
| "w"(_c) // %4 | |||
| : "cc", "memory" | |||
| ); | |||
| } | |||
| #else | |||
| if (nn > 0) | |||