Browse Source

Use aarch64 assembly to replace arm intrinsics

tags/20180427
dong nihui 8 years ago
parent
commit
6ea09ebf2c
16 changed files with 2110 additions and 1005 deletions
  1. +15
    -6
      src/layer/arm/absval_arm.cpp
  2. +20
    -9
      src/layer/arm/batchnorm_arm.cpp
  3. +353
    -263
      src/layer/arm/convolution_1x1.h
  4. +120
    -46
      src/layer/arm/convolution_2x2.h
  5. +56
    -31
      src/layer/arm/convolution_3x3.h
  6. +133
    -69
      src/layer/arm/convolution_4x4.h
  7. +448
    -273
      src/layer/arm/convolution_5x5.h
  8. +386
    -43
      src/layer/arm/convolution_7x7.h
  9. +231
    -112
      src/layer/arm/convolutiondepthwise_3x3.h
  10. +164
    -69
      src/layer/arm/eltwise_arm.cpp
  11. +23
    -11
      src/layer/arm/innerproduct_arm.cpp
  12. +23
    -16
      src/layer/arm/pooling_2x2.h
  13. +54
    -25
      src/layer/arm/pooling_3x3.h
  14. +16
    -8
      src/layer/arm/slice_arm.cpp
  15. +55
    -21
      src/mat.cpp
  16. +13
    -3
      src/mat.h

+ 15
- 6
src/layer/arm/absval_arm.cpp View File

@@ -43,13 +43,22 @@ int AbsVal_arm::forward_inplace(Mat& bottom_top_blob) const

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _p = vld1q_f32(ptr);
_p = vabsq_f32(_p);
vst1q_f32(ptr, _p);

ptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"
"fabs v0.4s, v0.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr)
: "cc", "memory", "v0"
);
}
#else
if (nn > 0)


+ 20
- 9
src/layer/arm/batchnorm_arm.cpp View File

@@ -51,16 +51,27 @@ int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob) const

#if __ARM_NEON
#if __aarch64__
float32x4_t _a = vdupq_n_f32(a);
float32x4_t _b = vdupq_n_f32(b);
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _outp = _a;
_outp = vfmaq_f32(_outp, _p, _b);
vst1q_f32(ptr, _outp);

ptr += 4;
asm volatile(
"dup v1.4s, %w4 \n"
"dup v2.4s, %w5 \n"
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"
"orr v3.16b, v1.16b, v1.16b \n"
"fmla v3.4s, v0.4s, v2.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v3.4s}, [%1], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(a), // %4
"r"(b) // %5
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#else
if (nn > 0)


+ 353
- 263
src/layer/arm/convolution_1x1.h View File

@@ -1027,52 +1027,62 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
float32x4_t _k2 = vdupq_n_f32(k2);
float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _p = vld1q_f32(r0);
float32x4_t _pn = vld1q_f32(r0+4);

float32x4_t _out0p = vld1q_f32(outptr0);
float32x4_t _out0pn = vld1q_f32(outptr0+4);

float32x4_t _out1p = vld1q_f32(outptr1);
float32x4_t _out1pn = vld1q_f32(outptr1+4);

float32x4_t _out2p = vld1q_f32(outptr2);
float32x4_t _out2pn = vld1q_f32(outptr2+4);

float32x4_t _out3p = vld1q_f32(outptr3);
float32x4_t _out3pn = vld1q_f32(outptr3+4);

_out0p = vfmaq_f32(_out0p, _p, _k0);
_out0pn = vfmaq_f32(_out0pn, _pn, _k0);

_out1p = vfmaq_f32(_out1p, _p, _k1);
_out1pn = vfmaq_f32(_out1pn, _pn, _k1);

_out2p = vfmaq_f32(_out2p, _p, _k2);
_out2pn = vfmaq_f32(_out2pn, _pn, _k2);

_out3p = vfmaq_f32(_out3p, _p, _k3);
_out3pn = vfmaq_f32(_out3pn, _pn, _k3);

vst1q_f32(outptr0, _out0p);
vst1q_f32(outptr0+4, _out0pn);

vst1q_f32(outptr1, _out1p);
vst1q_f32(outptr1+4, _out1pn);

vst1q_f32(outptr2, _out2p);
vst1q_f32(outptr2+4, _out2pn);

vst1q_f32(outptr3, _out3p);
vst1q_f32(outptr3+4, _out3pn);

r0 += 8;
outptr0 += 8;
outptr1 += 8;
outptr2 += 8;
outptr3 += 8;
asm volatile(
"prfm pldl1keep, [%5, #256] \n"
"ld1 {v6.4s, v7.4s}, [%5], #32 \n"
"0: \n"
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v8.4s, v9.4s}, [%1] \n"
"fmla v8.4s, v6.4s, %12.4s \n"
"fmla v9.4s, v7.4s, %12.4s \n"
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v10.4s, v11.4s}, [%2] \n"
"fmla v10.4s, v6.4s, %13.4s \n"
"fmla v11.4s, v7.4s, %13.4s \n"

"st1 {v8.4s, v9.4s}, [%1], #32 \n"

"prfm pldl1keep, [%3, #256] \n"
"ld1 {v12.4s, v13.4s}, [%3] \n"
"fmla v12.4s, v6.4s, %14.4s \n"
"fmla v13.4s, v7.4s, %14.4s \n"

"st1 {v10.4s, v11.4s}, [%2], #32 \n"

"prfm pldl1keep, [%4, #256] \n"
"ld1 {v14.4s, v15.4s}, [%4] \n"
"fmla v14.4s, v6.4s, %15.4s \n"
"fmla v15.4s, v7.4s, %15.4s \n"

"st1 {v12.4s, v13.4s}, [%3], #32 \n"

"prfm pldl1keep, [%5, #256] \n"
"ld1 {v6.4s, v7.4s}, [%5], #32 \n"
"subs %w0, %w0, #1 \n"
"st1 {v14.4s, v15.4s}, [%4], #32 \n"
"bne 0b \n"
"sub %5, %5, #32 \n"
: "=r"(nn), // %0
"=r"(outptr0),// %1
"=r"(outptr1),// %2
"=r"(outptr2),// %3
"=r"(outptr3),// %4
"=r"(r0) // %5
: "0"(nn),
"1"(outptr0),
"2"(outptr1),
"3"(outptr2),
"4"(outptr3),
"5"(r0),
"w"(_k0), // %12
"w"(_k1), // %13
"w"(_k2), // %14
"w"(_k3) // %15
: "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -1202,43 +1212,56 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
float32x4_t _k2 = vdupq_n_f32(k2);
float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _p = vld1q_f32(r0);
float32x4_t _pn = vld1q_f32(r0+4);

float32x4_t _outp = vld1q_f32(outptr);
float32x4_t _outpn = vld1q_f32(outptr+4);

_outp = vfmaq_f32(_outp, _p, _k0);
_outpn = vfmaq_f32(_outpn, _pn, _k0);

float32x4_t _p1 = vld1q_f32(r1);
float32x4_t _p1n = vld1q_f32(r1+4);

_outp = vfmaq_f32(_outp, _p1, _k1);
_outpn = vfmaq_f32(_outpn, _p1n, _k1);

float32x4_t _p2 = vld1q_f32(r2);
float32x4_t _p2n = vld1q_f32(r2+4);

_outp = vfmaq_f32(_outp, _p2, _k2);
_outpn = vfmaq_f32(_outpn, _p2n, _k2);

float32x4_t _p3 = vld1q_f32(r3);
float32x4_t _p3n = vld1q_f32(r3+4);

_outp = vfmaq_f32(_outp, _p3, _k3);
_outpn = vfmaq_f32(_outpn, _p3n, _k3);

vst1q_f32(outptr, _outp);
vst1q_f32(outptr+4, _outpn);

r0 += 8;
r1 += 8;
r2 += 8;
r3 += 8;
outptr += 8;
asm volatile(
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v2.4s, v3.4s}, [%2], #32 \n"
"0: \n"
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1] \n"
"fmla v0.4s, v2.4s, %12.4s \n"
"fmla v1.4s, v3.4s, %12.4s \n"

"prfm pldl1keep, [%3, #256] \n"
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v0.4s, v2.4s, %13.4s \n"
"fmla v1.4s, v3.4s, %13.4s \n"

"prfm pldl1keep, [%4, #256] \n"
"ld1 {v2.4s, v3.4s}, [%4], #32 \n"
"fmla v0.4s, v2.4s, %14.4s \n"
"fmla v1.4s, v3.4s, %14.4s \n"

"prfm pldl1keep, [%5, #256] \n"
"ld1 {v2.4s, v3.4s}, [%5], #32 \n"
"fmla v0.4s, v2.4s, %15.4s \n"
"fmla v1.4s, v3.4s, %15.4s \n"

"prfm pldl1keep, [%2, #256] \n"
"ld1 {v2.4s, v3.4s}, [%2], #32 \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s, v1.4s}, [%1], #32 \n"
"bne 0b \n"
"sub %2, %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3) // %5
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"w"(_k0), // %12
"w"(_k1), // %13
"w"(_k2), // %14
"w"(_k3) // %15
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#else
if (nn > 0)
@@ -1331,22 +1354,31 @@ static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
#if __ARM_NEON
float32x4_t _k0 = vdupq_n_f32(k0);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _p = vld1q_f32(r0);
float32x4_t _outp = vld1q_f32(outptr);

float32x4_t _pn = vld1q_f32(r0+4);
float32x4_t _outpn = vld1q_f32(outptr+4);

_outp = vfmaq_f32(_outp, _p, _k0);
_outpn = vfmaq_f32(_outpn, _pn, _k0);

vst1q_f32(outptr, _outp);
vst1q_f32(outptr+4, _outpn);

r0 += 8;
outptr += 8;
asm volatile(
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v2.4s, v3.4s}, [%2], #32 \n"
"0: \n"
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1] \n"
"fmla v0.4s, v2.4s, %6.4s \n"
"fmla v1.4s, v3.4s, %6.4s \n"
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v2.4s, v3.4s}, [%2], #32 \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s, v1.4s}, [%1], #32 \n"
"bne 0b \n"
"sub %2, %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0) // %2
: "0"(nn),
"1"(outptr),
"2"(r0),
"w"(_k0) // %6
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#else
if (nn > 0)
@@ -1470,108 +1502,128 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
float32x4_t _k2 = vld1q_f32(kernel2);
float32x4_t _k3 = vld1q_f32(kernel3);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4x2_t _px2 = vld2q_f32(r0);
float32x4_t _p = _px2.val[0];
float32x4x2_t _pnx2 = vld2q_f32(r0+8);
float32x4_t _pn = _pnx2.val[0];
asm volatile(
"0: \n"

"prfm pldl1keep, [%5, #512] \n"
"ld2 {v4.4s, v5.4s}, [%5], #32 \n"
"ld2 {v6.4s, v7.4s}, [%5], #32 \n"
"and v5.16b, v6.16b, v6.16b \n"// v4 v5

float32x4_t _out0p = vld1q_f32(outptr0);
float32x4_t _out0pn = vld1q_f32(outptr0+4);
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v8.4s, v9.4s}, [%1] \n"

float32x4_t _out1p = vld1q_f32(outptr1);
float32x4_t _out1pn = vld1q_f32(outptr1+4);
"fmla v8.4s, v4.4s, %18.s[0] \n"
"fmla v9.4s, v5.4s, %18.s[0] \n"

float32x4_t _out2p = vld1q_f32(outptr2);
float32x4_t _out2pn = vld1q_f32(outptr2+4);
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v10.4s, v11.4s}, [%2] \n"

float32x4_t _out3p = vld1q_f32(outptr3);
float32x4_t _out3pn = vld1q_f32(outptr3+4);
"fmla v10.4s, v4.4s, %19.s[0] \n"
"fmla v11.4s, v5.4s, %19.s[0] \n"

_out0p = vfmaq_laneq_f32(_out0p, _p, _k0, 0);
_out0pn = vfmaq_laneq_f32(_out0pn, _pn, _k0, 0);
"prfm pldl1keep, [%3, #256] \n"
"ld1 {v12.4s, v13.4s}, [%3] \n"

_out1p = vfmaq_laneq_f32(_out1p, _p, _k1, 0);
_out1pn = vfmaq_laneq_f32(_out1pn, _pn, _k1, 0);
"fmla v12.4s, v4.4s, %20.s[0] \n"
"fmla v13.4s, v5.4s, %20.s[0] \n"

_out2p = vfmaq_laneq_f32(_out2p, _p, _k2, 0);
_out2pn = vfmaq_laneq_f32(_out2pn, _pn, _k2, 0);
"prfm pldl1keep, [%4, #256] \n"
"ld1 {v14.4s, v15.4s}, [%4] \n"

_out3p = vfmaq_laneq_f32(_out3p, _p, _k3, 0);
_out3pn = vfmaq_laneq_f32(_out3pn, _pn, _k3, 0);
"prfm pldl1keep, [%6, #512] \n"
"ld2 {v6.4s, v7.4s}, [%6], #32 \n"

float32x4x2_t _p1x2 = vld2q_f32(r1);
float32x4_t _p1 = _p1x2.val[0];
float32x4x2_t _p1nx2 = vld2q_f32(r1+8);
float32x4_t _p1n = _p1nx2.val[0];
"fmla v14.4s, v4.4s, %21.s[0] \n"
"fmla v15.4s, v5.4s, %21.s[0] \n"

_out0p = vfmaq_laneq_f32(_out0p, _p1, _k0, 1);
_out0pn = vfmaq_laneq_f32(_out0pn, _p1n, _k0, 1);
"ld2 {v4.4s, v5.4s}, [%6], #32 \n"
"and v7.16b, v4.16b, v4.16b \n"// v6 v7

_out1p = vfmaq_laneq_f32(_out1p, _p1, _k1, 1);
_out1pn = vfmaq_laneq_f32(_out1pn, _p1n, _k1, 1);
"fmla v8.4s, v6.4s, %18.s[1] \n"
"fmla v9.4s, v7.4s, %18.s[1] \n"

_out2p = vfmaq_laneq_f32(_out2p, _p1, _k2, 1);
_out2pn = vfmaq_laneq_f32(_out2pn, _p1n, _k2, 1);
"fmla v10.4s, v6.4s, %19.s[1] \n"
"fmla v11.4s, v7.4s, %19.s[1] \n"

_out3p = vfmaq_laneq_f32(_out3p, _p1, _k3, 1);
_out3pn = vfmaq_laneq_f32(_out3pn, _p1n, _k3, 1);
"fmla v12.4s, v6.4s, %20.s[1] \n"
"fmla v13.4s, v7.4s, %20.s[1] \n"

float32x4x2_t _p2x2 = vld2q_f32(r2);
float32x4_t _p2 = _p2x2.val[0];
float32x4x2_t _p2nx2 = vld2q_f32(r2+8);
float32x4_t _p2n = _p2nx2.val[0];
"prfm pldl1keep, [%7, #512] \n"
"ld2 {v4.4s, v5.4s}, [%7], #32 \n"

_out0p = vfmaq_laneq_f32(_out0p, _p2, _k0, 2);
_out0pn = vfmaq_laneq_f32(_out0pn, _p2n, _k0, 2);
"fmla v14.4s, v6.4s, %21.s[1] \n"
"fmla v15.4s, v7.4s, %21.s[1] \n"

_out1p = vfmaq_laneq_f32(_out1p, _p2, _k1, 2);
_out1pn = vfmaq_laneq_f32(_out1pn, _p2n, _k1, 2);
"ld2 {v6.4s, v7.4s}, [%7], #32 \n"
"and v5.16b, v6.16b, v6.16b \n"// v4 v5

_out2p = vfmaq_laneq_f32(_out2p, _p2, _k2, 2);
_out2pn = vfmaq_laneq_f32(_out2pn, _p2n, _k2, 2);
"fmla v8.4s, v4.4s, %18.s[2] \n"
"fmla v9.4s, v5.4s, %18.s[2] \n"

_out3p = vfmaq_laneq_f32(_out3p, _p2, _k3, 2);
_out3pn = vfmaq_laneq_f32(_out3pn, _p2n, _k3, 2);
"fmla v10.4s, v4.4s, %19.s[2] \n"
"fmla v11.4s, v5.4s, %19.s[2] \n"

float32x4x2_t _p3x2 = vld2q_f32(r3);
float32x4_t _p3 = _p3x2.val[0];
float32x4x2_t _p3nx2 = vld2q_f32(r3+8);
float32x4_t _p3n = _p3nx2.val[0];
"fmla v12.4s, v4.4s, %20.s[2] \n"
"fmla v13.4s, v5.4s, %20.s[2] \n"

_out0p = vfmaq_laneq_f32(_out0p, _p3, _k0, 3);
_out0pn = vfmaq_laneq_f32(_out0pn, _p3n, _k0, 3);
"prfm pldl1keep, [%8, #512] \n"
"ld2 {v6.4s, v7.4s}, [%8], #32 \n"

_out1p = vfmaq_laneq_f32(_out1p, _p3, _k1, 3);
_out1pn = vfmaq_laneq_f32(_out1pn, _p3n, _k1, 3);
"fmla v14.4s, v4.4s, %21.s[2] \n"
"fmla v15.4s, v5.4s, %21.s[2] \n"

_out2p = vfmaq_laneq_f32(_out2p, _p3, _k2, 3);
_out2pn = vfmaq_laneq_f32(_out2pn, _p3n, _k2, 3);
"ld2 {v4.4s, v5.4s}, [%8], #32 \n"
"and v7.16b, v4.16b, v4.16b \n"// v6 v7

_out3p = vfmaq_laneq_f32(_out3p, _p3, _k3, 3);
_out3pn = vfmaq_laneq_f32(_out3pn, _p3n, _k3, 3);
"fmla v8.4s, v6.4s, %18.s[3] \n"
"fmla v9.4s, v7.4s, %18.s[3] \n"

vst1q_f32(outptr0, _out0p);
vst1q_f32(outptr0+4, _out0pn);
"fmla v10.4s, v6.4s, %19.s[3] \n"
"fmla v11.4s, v7.4s, %19.s[3] \n"

vst1q_f32(outptr1, _out1p);
vst1q_f32(outptr1+4, _out1pn);
"st1 {v8.4s, v9.4s}, [%1], #32 \n"

vst1q_f32(outptr2, _out2p);
vst1q_f32(outptr2+4, _out2pn);
"fmla v12.4s, v6.4s, %20.s[3] \n"
"fmla v13.4s, v7.4s, %20.s[3] \n"

vst1q_f32(outptr3, _out3p);
vst1q_f32(outptr3+4, _out3pn);
"st1 {v10.4s, v11.4s}, [%2], #32 \n"

r0 += 16;
r1 += 16;
r2 += 16;
r3 += 16;
outptr0 += 8;
outptr1 += 8;
outptr2 += 8;
outptr3 += 8;
"fmla v14.4s, v6.4s, %21.s[3] \n"
"fmla v15.4s, v7.4s, %21.s[3] \n"

"st1 {v12.4s, v13.4s}, [%3], #32 \n"

"subs %w0, %w0, #1 \n"
"st1 {v14.4s, v15.4s}, [%4], #32 \n"

"bne 0b \n"
: "=r"(nn), // %0
"=r"(outptr0),// %1
"=r"(outptr1),// %2
"=r"(outptr2),// %3
"=r"(outptr3),// %4
"=r"(r0), // %5
"=r"(r1), // %6
"=r"(r2), // %7
"=r"(r3) // %8
: "0"(nn),
"1"(outptr0),
"2"(outptr1),
"3"(outptr2),
"4"(outptr3),
"5"(r0),
"6"(r1),
"7"(r2),
"8"(r3),
"w"(_k0), // %18
"w"(_k1), // %19
"w"(_k2), // %20
"w"(_k3) // %21
: "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -1767,54 +1819,67 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
float32x4_t _k2 = vdupq_n_f32(k2);
float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4x2_t _px2 = vld2q_f32(r0);
float32x4_t _p = _px2.val[0];
float32x4x2_t _pnx2 = vld2q_f32(r0+8);
float32x4_t _pn = _pnx2.val[0];

float32x4_t _out0p = vld1q_f32(outptr0);
float32x4_t _out0pn = vld1q_f32(outptr0+4);

float32x4_t _out1p = vld1q_f32(outptr1);
float32x4_t _out1pn = vld1q_f32(outptr1+4);

float32x4_t _out2p = vld1q_f32(outptr2);
float32x4_t _out2pn = vld1q_f32(outptr2+4);
asm volatile(
"0: \n"

float32x4_t _out3p = vld1q_f32(outptr3);
float32x4_t _out3pn = vld1q_f32(outptr3+4);
"prfm pldl1keep, [%5, #512] \n"
"ld2 {v4.4s, v5.4s}, [%5], #32 \n"
"ld2 {v6.4s, v7.4s}, [%5], #32 \n"
"and v5.16b, v6.16b, v6.16b \n"
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v8.4s, v9.4s}, [%1] \n"

_out0p = vfmaq_f32(_out0p, _p, _k0);
_out0pn = vfmaq_f32(_out0pn, _pn, _k0);
"fmla v8.4s, v4.4s, %12.4s \n"
"fmla v9.4s, v5.4s, %12.4s \n"

_out1p = vfmaq_f32(_out1p, _p, _k1);
_out1pn = vfmaq_f32(_out1pn, _pn, _k1);
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v10.4s, v11.4s}, [%2] \n"

_out2p = vfmaq_f32(_out2p, _p, _k2);
_out2pn = vfmaq_f32(_out2pn, _pn, _k2);
"fmla v10.4s, v4.4s, %13.4s \n"
"fmla v11.4s, v5.4s, %13.4s \n"

_out3p = vfmaq_f32(_out3p, _p, _k3);
_out3pn = vfmaq_f32(_out3pn, _pn, _k3);
"prfm pldl1keep, [%3, #256] \n"
"ld1 {v12.4s, v13.4s}, [%3] \n"

vst1q_f32(outptr0, _out0p);
vst1q_f32(outptr0+4, _out0pn);
"st1 {v8.4s, v9.4s}, [%1], #32 \n"

vst1q_f32(outptr1, _out1p);
vst1q_f32(outptr1+4, _out1pn);
"fmla v12.4s, v4.4s, %14.4s \n"
"fmla v13.4s, v5.4s, %14.4s \n"

vst1q_f32(outptr2, _out2p);
vst1q_f32(outptr2+4, _out2pn);
"prfm pldl1keep, [%4, #256] \n"
"ld1 {v14.4s, v15.4s}, [%4] \n"

vst1q_f32(outptr3, _out3p);
vst1q_f32(outptr3+4, _out3pn);
"st1 {v10.4s, v11.4s}, [%2], #32 \n"
"fmla v14.4s, v4.4s, %15.4s \n"
"fmla v15.4s, v5.4s, %15.4s \n"

r0 += 16;
outptr0 += 8;
outptr1 += 8;
outptr2 += 8;
outptr3 += 8;
"st1 {v12.4s, v13.4s}, [%3], #32 \n"
"subs %w0, %w0, #1 \n"
"st1 {v14.4s, v15.4s}, [%4], #32 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(outptr0),// %1
"=r"(outptr1),// %2
"=r"(outptr2),// %3
"=r"(outptr3),// %4
"=r"(r0) // %5
: "0"(nn),
"1"(outptr0),
"2"(outptr1),
"3"(outptr2),
"4"(outptr3),
"5"(r0),
"w"(_k0), // %12
"w"(_k1), // %13
"w"(_k2), // %14
"w"(_k3) // %15
: "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -1951,51 +2016,63 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
float32x4_t _k2 = vdupq_n_f32(k2);
float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4x2_t _px2 = vld2q_f32(r0);
float32x4_t _p = _px2.val[0];
float32x4_t _outp = vld1q_f32(outptr);

float32x4x2_t _pnx2 = vld2q_f32(r0+8);
float32x4_t _pn = _pnx2.val[0];
float32x4_t _outpn = vld1q_f32(outptr+4);

_outp = vmlaq_f32(_outp, _p, _k0);
_outpn = vmlaq_f32(_outpn, _pn, _k0);

float32x4x2_t _p1x2 = vld2q_f32(r1);
float32x4_t _p1 = _p1x2.val[0];
float32x4x2_t _p1nx2 = vld2q_f32(r1+8);
float32x4_t _p1n = _p1nx2.val[0];

_outp = vmlaq_f32(_outp, _p1, _k1);
_outpn = vmlaq_f32(_outpn, _p1n, _k1);

float32x4x2_t _p2x2 = vld2q_f32(r2);
float32x4_t _p2 = _p2x2.val[0];
float32x4x2_t _p2nx2 = vld2q_f32(r2+8);
float32x4_t _p2n = _p2nx2.val[0];

_outp = vmlaq_f32(_outp, _p2, _k2);
_outpn = vmlaq_f32(_outpn, _p2n, _k2);

float32x4x2_t _p3x2 = vld2q_f32(r3);
float32x4_t _p3 = _p3x2.val[0];
float32x4x2_t _p3nx2 = vld2q_f32(r3+8);
float32x4_t _p3n = _p3nx2.val[0];

_outp = vmlaq_f32(_outp, _p3, _k3);
_outpn = vmlaq_f32(_outpn, _p3n, _k3);

vst1q_f32(outptr, _outp);
vst1q_f32(outptr+4, _outpn);

r0 += 16;
r1 += 16;
r2 += 16;
r3 += 16;
outptr += 8;
asm volatile(
"prfm pldl1keep, [%2, #512] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"
"ld2 {v8.4s, v9.4s}, [%2], #32 \n"
"0: \n"

"prfm pldl1keep, [%1, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1] \n"
"fmla v0.4s, v2.4s, %12.4s \n"
"fmla v1.4s, v8.4s, %12.4s \n"

"prfm pldl1keep, [%3, #512] \n"
"ld2 {v2.4s, v3.4s}, [%3], #32 \n"
"ld2 {v8.4s, v9.4s}, [%3], #32 \n"
"fmla v0.4s, v2.4s, %13.4s \n"
"fmla v1.4s, v8.4s, %13.4s \n"

"prfm pldl1keep, [%4, #512] \n"
"ld2 {v2.4s, v3.4s}, [%4], #32 \n"
"ld2 {v8.4s, v9.4s}, [%4], #32 \n"
"fmla v0.4s, v2.4s, %14.4s \n"
"fmla v1.4s, v8.4s, %14.4s \n"

"prfm pldl1keep, [%5, #512] \n"
"ld2 {v2.4s, v3.4s}, [%5], #32 \n"
"ld2 {v8.4s, v9.4s}, [%5], #32 \n"
"fmla v0.4s, v2.4s, %15.4s \n"
"fmla v1.4s, v8.4s, %15.4s \n"

"prfm pldl1keep, [%2, #512] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"
"ld2 {v8.4s, v9.4s}, [%2], #32 \n"

"subs %w0, %w0, #1 \n"
"st1 {v0.4s, v1.4s}, [%1], #32 \n"
"bne 0b \n"
"sub %2, %2, #64 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3) // %5
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"w"(_k0), // %12
"w"(_k1), // %13
"w"(_k2), // %14
"w"(_k3) // %15
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9"
);
}
#else
if (nn > 0)
@@ -2099,24 +2176,37 @@ static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
#if __ARM_NEON
float32x4_t _k0 = vdupq_n_f32(k0);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4x2_t _px2 = vld2q_f32(r0);
float32x4_t _p = _px2.val[0];
float32x4_t _outp = vld1q_f32(outptr);
asm volatile(
"prfm pldl1keep, [%2, #512] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"
"ld2 {v8.4s, v9.4s}, [%2], #32 \n"

float32x4x2_t _pnx2 = vld2q_f32(r0+8);
float32x4_t _pn = _pnx2.val[0];
float32x4_t _outpn = vld1q_f32(outptr+4);
"0: \n"

_outp = vmlaq_f32(_outp, _p, _k0);
_outpn = vmlaq_f32(_outpn, _pn, _k0);
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1] \n"
"fmla v0.4s, v2.4s, %6.4s \n"
"fmla v1.4s, v8.4s, %6.4s \n"

vst1q_f32(outptr, _outp);
vst1q_f32(outptr+4, _outpn);
"prfm pldl1keep, [%2, #512] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"
"ld2 {v8.4s, v9.4s}, [%2], #32 \n"

r0 += 16;
outptr += 8;
"subs %w0, %w0, #1 \n"
"st1 {v0.4s, v1.4s}, [%1], #32 \n"
"bne 0b \n"
"sub %2, %2, #64 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0) // %2
: "0"(nn),
"1"(outptr),
"2"(r0),
"w"(_k0) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9"
);
}
#else
if (nn > 0)


+ 120
- 46
src/layer/arm/convolution_2x2.h View File

@@ -71,37 +71,83 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _r000 = vld1q_f32(r00);
float32x4_t _r010 = vld1q_f32(r01);
float32x4_t _r001 = vld1q_f32(r00 + 1);
float32x4_t _r011 = vld1q_f32(r01 + 1);

float32x4_t _r100 = vld1q_f32(r10);
float32x4_t _r110 = vld1q_f32(r11);
float32x4_t _r101 = vld1q_f32(r10 + 1);
float32x4_t _r111 = vld1q_f32(r11 + 1);

float32x4_t _sum = vld1q_f32(outptr);

_sum = vmlaq_lane_f32(_sum, _r000, vget_low_f32(_k0), 0);
_sum = vmlaq_lane_f32(_sum, _r001, vget_low_f32(_k0), 1);
_sum = vmlaq_lane_f32(_sum, _r010, vget_high_f32(_k0), 0);
_sum = vmlaq_lane_f32(_sum, _r011, vget_high_f32(_k0), 1);

_sum = vmlaq_lane_f32(_sum, _r100, vget_low_f32(_k1), 0);
_sum = vmlaq_lane_f32(_sum, _r101, vget_low_f32(_k1), 1);
_sum = vmlaq_lane_f32(_sum, _r110, vget_high_f32(_k1), 0);
_sum = vmlaq_lane_f32(_sum, _r111, vget_high_f32(_k1), 1);

vst1q_f32(outptr, _sum);

r00 += 4;
r01 += 4;
r10 += 4;
r11 += 4;
outptr += 4;
asm volatile(
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v2.4s}, [%2], #16 \n"
"prfm pldl1keep, [%3, #128] \n"
"ld1 {v12.4s}, [%3], #16 \n"
"prfm pldl1keep, [%4, #128] \n"
"ld1 {v14.4s}, [%4], #16 \n"

"0: \n"
"prfm pldl1keep, [%5, #128] \n"
"ld1 {v9.4s}, [%5] \n"

"fmul v8.4s, v0.4s, %12.s[0] \n"
"fmla v9.4s, v2.4s, %12.s[2] \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v1.4s}, [%1], #16 \n"

"prfm pldl1keep, [%2, #128] \n"
"ld1 {v3.4s}, [%2], #16 \n"

"ext v10.16b, v0.16b, v1.16b, #4 \n"
"ext v11.16b, v2.16b, v3.16b, #4 \n"

"fmla v8.4s, v12.4s, %13.s[0] \n"
"fmla v9.4s, v14.4s, %13.s[2] \n"

"prfm pldl1keep, [%3, #128] \n"
"ld1 {v13.4s}, [%3], #16 \n"

"prfm pldl1keep, [%4, #128] \n"
"ld1 {v15.4s}, [%4], #16 \n"

"fmla v8.4s, v10.4s, %12.s[1] \n"
"fmla v9.4s, v11.4s, %12.s[3] \n"

"ext v10.16b, v12.16b, v13.16b, #4 \n"
"ext v11.16b, v14.16b, v15.16b, #4 \n"

"fmla v8.4s, v10.4s, %13.s[1] \n"
"fmla v9.4s, v11.4s, %13.s[3] \n"

"orr v0.16b, v1.16b, v1.16b \n"
"orr v2.16b, v3.16b, v3.16b \n"

"fadd v8.4s, v8.4s, v9.4s \n"

"orr v12.16b, v13.16b, v13.16b \n"
"orr v14.16b, v15.16b, v15.16b \n"

"subs %w0, %w0, #1 \n"
"st1 {v8.4s}, [%5], #16 \n"
"bne 0b \n"
"sub %1, %1, #16 \n"
"sub %2, %2, #16 \n"
"sub %3, %3, #16 \n"
"sub %4, %4, #16 \n"
: "=r"(nn), // %0
"=r"(r00), // %1
"=r"(r01), // %2
"=r"(r10), // %3
"=r"(r11), // %4
"=r"(outptr) // %5
: "0"(nn),
"1"(r00),
"2"(r01),
"3"(r10),
"4"(r11),
"5"(outptr),
"w"(_k0), // %12
"w"(_k1) // %13
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -263,28 +309,56 @@ static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r01 = vld1q_f32(r0 + 1);
float32x4_t _r11 = vld1q_f32(r1 + 1);
asm volatile(
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v2.4s}, [%2], #16 \n"

"0: \n"
"prfm pldl1keep, [%3, #128] \n"
"ld1 {v9.4s}, [%3] \n"

float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _sum2;
"fmul v8.4s, v0.4s, %8.4s \n"
"fmla v9.4s, v2.4s, %10.4s \n"

_sum = vmlaq_f32(_sum, _r00, _k0);
_sum2 = vmulq_f32(_r01, _k1);
_sum = vmlaq_f32(_sum, _r10, _k2);
_sum2 = vmlaq_f32(_sum2, _r11, _k3);
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v1.4s}, [%1], #16 \n"
"ext v10.16b, v0.16b, v1.16b, #4 \n"

_sum = vaddq_f32(_sum, _sum2);
"fmla v8.4s, v10.4s, %9.4s \n"

vst1q_f32(outptr, _sum);
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v3.4s}, [%2], #16 \n"
"ext v11.16b, v2.16b, v3.16b, #4 \n"

r0 += 4;
r1 += 4;
outptr += 4;
"fmla v9.4s, v11.4s, %11.4s \n"

"orr v0.16b, v1.16b, v1.16b \n"
"fadd v8.4s, v8.4s, v9.4s \n"
"orr v2.16b, v3.16b, v3.16b \n"

"subs %w0, %w0, #1 \n"
"st1 {v8.4s}, [%3], #16 \n"
"bne 0b \n"
"sub %1, %1, #16 \n"
"sub %2, %2, #16 \n"
: "=r"(nn), // %0
"=r"(r0), // %1
"=r"(r1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(r0),
"2"(r1),
"3"(outptr),
"w"(_k0), // %8
"w"(_k1), // %9
"w"(_k2), // %10
"w"(_k3) // %11
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11"
);
}
#else
if (nn > 0)


+ 56
- 31
src/layer/arm/convolution_3x3.h View File

@@ -6858,49 +6858,74 @@ static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _outp = vld1q_f32(outptr);
asm volatile(
"prfm pldl1keep, [%2, #256] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"

float32x4x2_t _r0 = vld2q_f32(r0);
float32x4x2_t _r0n = vld2q_f32(r0+8);
"fmla v0.4s, v2.4s, %10.s[0] \n"
"fmul v10.4s, v3.4s, %10.s[1] \n"

float32x4_t _r00 = _r0.val[0];// 0 2 4 6
float32x4_t _r01 = _r0.val[1];// 1 3 5 7
float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8
"prfm pldl1keep, [%2, #256] \n"
"ld2 {v8.4s, v9.4s}, [%2] \n"
"ext v1.16b, v2.16b, v8.16b, #4 \n"

_outp = vfmaq_laneq_f32(_outp, _r00, _k0123, 0);
_outp = vfmaq_laneq_f32(_outp, _r01, _k0123, 1);
_outp = vfmaq_laneq_f32(_outp, _r02, _k0123, 2);
"fmul v11.4s, v1.4s, %10.s[2] \n"

float32x4x2_t _r1 = vld2q_f32(r1);
float32x4x2_t _r1n = vld2q_f32(r1+8);
"prfm pldl1keep, [%3, #256] \n"
"ld2 {v2.4s, v3.4s}, [%3], #32 \n"

float32x4_t _r10 = _r1.val[0];
float32x4_t _r11 = _r1.val[1];
float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1);
"fmla v0.4s, v2.4s, %11.s[0] \n"
"fmla v10.4s, v3.4s, %11.s[1] \n"

_outp = vfmaq_laneq_f32(_outp, _r10, _k3456, 0);
_outp = vfmaq_laneq_f32(_outp, _r11, _k3456, 1);
_outp = vfmaq_laneq_f32(_outp, _r12, _k3456, 2);
"prfm pldl1keep, [%3, #256] \n"
"ld2 {v8.4s, v9.4s}, [%3] \n"
"ext v1.16b, v2.16b, v8.16b, #4 \n"

float32x4x2_t _r2 = vld2q_f32(r2);
float32x4x2_t _r2n = vld2q_f32(r2+8);
"fmla v11.4s, v1.4s, %11.s[2] \n"

float32x4_t _r20 = _r2.val[0];
float32x4_t _r21 = _r2.val[1];
float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1);
"prfm pldl1keep, [%4, #256] \n"
"ld2 {v2.4s, v3.4s}, [%4], #32 \n"

_outp = vfmaq_laneq_f32(_outp, _r20, _k6789, 0);
_outp = vfmaq_laneq_f32(_outp, _r21, _k6789, 1);
_outp = vfmaq_laneq_f32(_outp, _r22, _k6789, 2);
"fmla v0.4s, v2.4s, %12.s[0] \n"
"fmla v10.4s, v3.4s, %12.s[1] \n"

vst1q_f32(outptr, _outp);
"prfm pldl1keep, [%4, #256] \n"
"ld2 {v8.4s, v9.4s}, [%4] \n"
"ext v1.16b, v2.16b, v8.16b, #4 \n"

r0 += 8;
r1 += 8;
r2 += 8;
outptr += 4;
"fmla v11.4s, v1.4s, %12.s[2] \n"

"prfm pldl1keep, [%2, #256] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"

"fadd v0.4s, v0.4s, v10.4s \n"
"fadd v0.4s, v0.4s, v11.4s \n"

"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
"sub %2, %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2) // %4
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"w"(_k0123), // %10
"w"(_k3456), // %11
"w"(_k6789) // %12
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)


+ 133
- 69
src/layer/arm/convolution_4x4.h View File

@@ -75,63 +75,107 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r30 = vld1q_f32(r3);

float32x4_t _r01 = vld1q_f32(r0 + 4);
float32x4_t _r11 = vld1q_f32(r1 + 4);
float32x4_t _r21 = vld1q_f32(r2 + 4);
float32x4_t _r31 = vld1q_f32(r3 + 4);

float32x4_t _r02 = vld1q_f32(r0 + 8);
float32x4_t _r12 = vld1q_f32(r1 + 8);
float32x4_t _r22 = vld1q_f32(r2 + 8);
float32x4_t _r32 = vld1q_f32(r3 + 8);

float32x4_t _r03 = vld1q_f32(r0 + 12);
float32x4_t _r13 = vld1q_f32(r1 + 12);
float32x4_t _r23 = vld1q_f32(r2 + 12);
float32x4_t _r33 = vld1q_f32(r3 + 12);

float32x4_t _sum0 = vmulq_f32(_r00, _k0123);
float32x4_t _sum1 = vmulq_f32(_r01, _k0123);
float32x4_t _sum2 = vmulq_f32(_r02, _k0123);
float32x4_t _sum3 = vmulq_f32(_r03, _k0123);

_sum0 = vfmaq_f32(_sum0, _r10, _k4567);
_sum1 = vfmaq_f32(_sum1, _r11, _k4567);
_sum2 = vfmaq_f32(_sum2, _r12, _k4567);
_sum3 = vfmaq_f32(_sum3, _r13, _k4567);

_sum0 = vfmaq_f32(_sum0, _r20, _k891011);
_sum1 = vfmaq_f32(_sum1, _r21, _k891011);
_sum2 = vfmaq_f32(_sum2, _r22, _k891011);
_sum3 = vfmaq_f32(_sum3, _r23, _k891011);

_sum0 = vfmaq_f32(_sum0, _r30, _k12131415);
_sum1 = vfmaq_f32(_sum1, _r31, _k12131415);
_sum2 = vfmaq_f32(_sum2, _r32, _k12131415);
_sum3 = vfmaq_f32(_sum3, _r33, _k12131415);

float32x4_t _s01 = vpaddq_f32(_sum0, _sum1);
float32x4_t _s23 = vpaddq_f32(_sum2, _sum3);
float32x4_t _sum = vpaddq_f32(_s01, _s23);

float32x4_t _outp = vld1q_f32(outptr);

_outp = vaddq_f32(_outp, _sum);

vst1q_f32(outptr, _outp);

r0 += 16;
r1 += 16;
r2 += 16;
r3 += 16;
outptr += 4;
asm volatile(
"prfm pldl1keep, [%1, #128] \n"
"0: \n"

"prfm pldl1keep, [%2, #512] \n"
"prfm pldl1keep, [%3, #512] \n"

"ld1 {v7.4s}, [%1] \n" // v7 = outptr

"ld1 {v8.4s}, [%2], #16 \n"// v8 = r0
"ld1 {v9.4s}, [%3], #16 \n"// v9 = r1

"prfm pldl1keep, [%4, #512] \n"
"prfm pldl1keep, [%5, #512] \n"

"fmul v12.4s, v8.4s, %12.4s \n"
"fmul v13.4s, v9.4s, %13.4s \n"

"ld1 {v10.4s}, [%4], #16 \n"// v10 = r2
"ld1 {v11.4s}, [%5], #16 \n"// v11 = r3

"fmla v12.4s, v10.4s, %14.4s \n"
"fmla v13.4s, v11.4s, %15.4s \n"

"fadd v5.4s, v12.4s, v13.4s \n"

"ld1 {v8.4s}, [%2], #16 \n"// v8 = r0
"ld1 {v9.4s}, [%3], #16 \n"// v9 = r1

"fmul v12.4s, v8.4s, %12.4s \n"
"fmul v13.4s, v9.4s, %13.4s \n"

"ld1 {v10.4s}, [%4], #16 \n"// v10 = r2
"ld1 {v11.4s}, [%5], #16 \n"// v11 = r3
"fmla v12.4s, v10.4s, %14.4s \n"
"fmla v13.4s, v11.4s, %15.4s \n"

"fadd v6.4s, v12.4s, v13.4s \n"

"ld1 {v8.4s}, [%2], #16 \n"// v8 = r0
"ld1 {v9.4s}, [%3], #16 \n"// v9 = r1

"fmul v12.4s, v8.4s, %12.4s \n"
"fmul v13.4s, v9.4s, %13.4s \n"

"ld1 {v10.4s}, [%4], #16 \n"// v10 = r2
"ld1 {v11.4s}, [%5], #16 \n"// v11 = r3

"fmla v12.4s, v10.4s, %14.4s \n"
"fmla v13.4s, v11.4s, %15.4s \n"

"fadd v14.4s, v12.4s, v13.4s \n"
"faddp v5.4s, v5.4s, v6.4s \n" // Move to here to enhance ILP

"ld1 {v8.4s}, [%2], #16 \n"// v8 = r0
"ld1 {v9.4s}, [%3], #16 \n"// v9 = r1

"fmul v12.4s, v8.4s, %12.4s \n"
"fmul v13.4s, v9.4s, %13.4s \n"

"ld1 {v10.4s}, [%4], #16 \n"// v10 = r2
"ld1 {v11.4s}, [%5], #16 \n"// v11 = r3

"fmla v12.4s, v10.4s, %14.4s \n"
"fmla v13.4s, v11.4s, %15.4s \n"

"fadd v15.4s, v12.4s, v13.4s \n"

// "faddp v5.4s , v5.4s, v6.4s \n" // Move this line upward.
"faddp v14.4s, v14.4s, v15.4s \n"
"faddp v5.4s , v5.4s, v14.4s \n"
"fadd v7.4s, v7.4s, v5.4s \n"

"st1 {v7.4s}, [%1], #16 \n"

"prfm pldl1keep, [%1, #128] \n"

"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3) // %5
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"w"(_k0123), // %12
"w"(_k4567), // %13
"w"(_k891011), // %14
"w"(_k12131415) // %15
: "cc", "memory", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -247,22 +291,42 @@ static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
{
#if __ARM_NEON
#if __aarch64__
float32x4_t _r0 = vld1q_f32(r0);
float32x4_t _r1 = vld1q_f32(r1);
float32x4_t _r2 = vld1q_f32(r2);
float32x4_t _r3 = vld1q_f32(r3);
float sum = 0.f;

float32x4_t _sum = vmulq_f32(_r0, _k0123);
_sum = vmlaq_f32(_sum, _r1, _k4567);
_sum = vmlaq_f32(_sum, _r2, _k891011);
_sum = vmlaq_f32(_sum, _r3, _k12131415);
asm volatile(
"ld1 {v8.4s}, [%0], #16 \n"// v8 = r0
"ld1 {v9.4s}, [%1], #16 \n"// v9 = r1

*outptr += vaddvq_f32(_sum);
"fmul v12.4s, v8.4s, %9.4s \n"
"fmul v13.4s, v9.4s, %10.4s \n"

r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
"ld1 {v10.4s}, [%2], #16 \n"// v10 = r2
"ld1 {v11.4s}, [%3], #16 \n"// v11 = r3

"fmla v12.4s, v10.4s, %11.4s \n"
"fmla v13.4s, v11.4s, %12.4s \n"

"fadd v5.4s, v12.4s, v13.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
"faddp s5, v5.2s \n"
"fmov %w4, s5 \n"
: "=r"(r0), // %0
"=r"(r1), // %1
"=r"(r2), // %2
"=r"(r3), // %3
"=r"(sum) // %4
: "0"(r0),
"1"(r1),
"2"(r2),
"3"(r3),
"w"(_k0123), // %9
"w"(_k4567), // %10
"w"(_k891011), // %11
"w"(_k12131415) // %12
: "cc", "memory", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13"
);
*outptr += sum;
#else
float sum = 0.f;



+ 448
- 273
src/layer/arm/convolution_5x5.h View File

@@ -83,118 +83,193 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _sum2 = vld1q_f32(outptr2);

float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r04 = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
float32x4_t _r03 = vextq_f32(_r00, _r04, 3);

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
float32x4_t _r13 = vextq_f32(_r10, _r14, 3);

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
float32x4_t _r23 = vextq_f32(_r20, _r24, 3);

float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
float32x4_t _r33 = vextq_f32(_r30, _r34, 3);

float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
float32x4_t _r43 = vextq_f32(_r40, _r44, 3);

float32x4_t _r50 = vld1q_f32(r5);
float32x4_t _r54 = vld1q_f32(r5 + 4);
float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
float32x4_t _r53 = vextq_f32(_r50, _r54, 3);

_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);

_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);

_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);

_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);

_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);

_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0);

_sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1);

_sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2);

_sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3);

_sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0);

vst1q_f32(outptr, _sum);
vst1q_f32(outptr2, _sum2);

r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
r5 += 4;
outptr += 4;
outptr2 += 4;
asm volatile(
// v11 = rx1 / rx3
// v12 = rx2
// v13 v14 = intermediate sum register
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v7.4s}, [%1] \n"// v7 = out

"0: \n"

"prfm pldl1keep, [%2, #128] \n"
"ld1 {v8.4s}, [%2] \n"// v8 = out2

// r1
"prfm pldl1keep, [%4, #256] \n"
"ld1 {v9.4s, v10.4s}, [%4] \n"// v9 v10 = r10 r14
"add %4, %4, #16 \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n" //r11
"fmul v13.4s, v9.4s, %19.s[1] \n"
"fmla v8.4s, v9.4s, %18.s[0] \n"

"ext v12.16b, v9.16b, v10.16b, #8 \n" //r12
"fmla v7.4s, v11.4s, %19.s[2] \n"
"fmul v14.4s, v11.4s, %18.s[1] \n"

"ext v11.16b, v9.16b, v10.16b, #12 \n" //r13
"fmla v13.4s, v12.4s, %19.s[3] \n"
"fmla v8.4s, v12.4s, %18.s[2] \n"

"fmla v7.4s, v11.4s, %20.s[0] \n"
"fmla v14.4s, v11.4s, %18.s[3] \n"

"prfm pldl1keep, [%5, #256] \n"

"fmla v13.4s, v10.4s, %20.s[1] \n"
"fmla v8.4s, v10.4s, %19.s[0] \n"

// r2
"ld1 {v9.4s, v10.4s}, [%5] \n"// v9 v10 = r20 r24
"add %5, %5, #16 \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n" //r21
"fmla v7.4s, v9.4s, %20.s[2] \n"
"fmla v14.4s, v9.4s, %19.s[1] \n"

"ext v12.16b, v9.16b, v10.16b, #8 \n" //r22
"fmla v13.4s, v11.4s, %20.s[3] \n"
"fmla v8.4s, v11.4s, %19.s[2] \n"
"ext v11.16b, v9.16b, v10.16b, #12 \n" //r23
"fmla v7.4s, v12.4s, %21.s[0] \n"
"fmla v14.4s, v12.4s, %19.s[3] \n"

"fmla v13.4s, v11.4s, %21.s[1] \n"
"fmla v8.4s, v11.4s, %20.s[0] \n"

"prfm pldl1keep, [%6, #256] \n"

"fmla v7.4s, v10.4s, %21.s[2] \n"
"fmla v14.4s, v10.4s, %20.s[1] \n"
// r3
"ld1 {v9.4s, v10.4s}, [%6] \n"// v9 v10 = r30 r34
"add %6, %6, #16 \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n" //r31
"fmla v13.4s, v9.4s, %21.s[3] \n"
"fmla v8.4s, v9.4s, %20.s[2] \n"

"ext v12.16b, v9.16b, v10.16b, #8 \n" //r32
"fmla v7.4s, v11.4s, %22.s[0] \n"
"fmla v14.4s, v11.4s, %20.s[3] \n"

"ext v11.16b, v9.16b, v10.16b, #12 \n" //r33
"fmla v13.4s, v12.4s, %22.s[1] \n"
"fmla v8.4s, v12.4s, %21.s[0] \n"

"fmla v7.4s, v11.4s, %22.s[2] \n"
"fmla v14.4s, v11.4s, %21.s[1] \n"

"prfm pldl1keep, [%7, #256] \n"

"fmla v13.4s, v10.4s, %22.s[3] \n"
"fmla v8.4s, v10.4s, %21.s[2] \n"

// r4
"ld1 {v9.4s, v10.4s}, [%7] \n"// v9 v10 = r40 r44
"add %7, %7, #16 \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n" //r41
"fmla v7.4s, v9.4s, %23.s[0] \n"
"fmla v14.4s, v9.4s, %21.s[3] \n"

"ext v12.16b, v9.16b, v10.16b, #8 \n" //r41
"fmla v13.4s, v11.4s, %23.s[1] \n"
"fmla v8.4s, v11.4s, %22.s[0] \n"

"ext v11.16b, v9.16b, v10.16b, #12 \n" //r41
"fmla v7.4s, v12.4s, %23.s[2] \n"
"fmla v14.4s, v12.4s, %22.s[1] \n"

"fmla v13.4s, v11.4s, %23.s[3] \n"
"fmla v8.4s, v11.4s, %22.s[2] \n"

"prfm pldl1keep, [%3, #256] \n"

"fmla v7.4s, v10.4s, %24.s[0] \n"
"fmla v14.4s, v10.4s, %22.s[3] \n"

// r0 and r5
"ld1 {v9.4s, v10.4s}, [%3] \n"// v9 v10 = r00 r04
"add %3, %3, #16 \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n" //r01
"fmla v13.4s, v11.4s, %18.s[1] \n"

"ext v12.16b, v9.16b, v10.16b, #8 \n" //r02
"fmla v7.4s, v12.4s, %18.s[2] \n"

"ext v11.16b, v9.16b, v10.16b, #12 \n" //r03

"prfm pldl1keep, [%8, #256] \n"

"fmla v13.4s, v11.4s, %18.s[3] \n"

// r5
"ld1 {v11.4s, v12.4s}, [%8] \n"// v11 v12 = r50 r54
"add %8, %8, #16 \n"

"fmla v8.4s, v11.4s, %23.s[0] \n"
"fmla v14.4s, v12.4s, %24.s[0] \n"

"fmla v7.4s, v9.4s, %18.s[0] \n"
"fmla v13.4s, v10.4s, %19.s[0] \n"

"ext v9.16b, v11.16b, v12.16b, #4 \n" //r51
"ext v10.16b, v11.16b, v12.16b, #8 \n" //r52

"fmla v14.4s, v9.4s, %23.s[1] \n"

"ext v9.16b, v11.16b, v12.16b, #12 \n" //r53
"fmla v8.4s, v10.4s, %23.s[2] \n"

"fmla v14.4s, v9.4s, %23.s[3] \n"

"fadd v7.4s, v7.4s, v13.4s \n"

"st1 {v7.4s}, [%1], #16 \n"

"fadd v8.4s, v8.4s, v14.4s \n"

"prfm pldl1keep, [%1, #128] \n"
"ld1 {v7.4s}, [%1] \n"// v7 = out
"st1 {v8.4s}, [%2], #16 \n"

"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(outptr2), // %2
"=r"(r0), // %3
"=r"(r1), // %4
"=r"(r2), // %5
"=r"(r3), // %6
"=r"(r4), // %7
"=r"(r5) // %8
: "0"(nn),
"1"(outptr),
"2"(outptr2),
"3"(r0),
"4"(r1),
"5"(r2),
"6"(r3),
"7"(r4),
"8"(r5),
"w"(_k0123), // %18
"w"(_k4567), // %19
"w"(_k891011), // %20
"w"(_k12131415), // %21
"w"(_k16171819), // %22
"w"(_k20212223), // %23
"w"(_k24242424) // %24
: "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -555,78 +630,130 @@ static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _sum = vld1q_f32(outptr);

float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r04 = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
float32x4_t _r03 = vextq_f32(_r00, _r04, 3);

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
float32x4_t _r13 = vextq_f32(_r10, _r14, 3);

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
float32x4_t _r23 = vextq_f32(_r20, _r24, 3);

float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
float32x4_t _r33 = vextq_f32(_r30, _r34, 3);

float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
float32x4_t _r43 = vextq_f32(_r40, _r44, 3);

_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);

_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);

_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);

_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);

_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);

vst1q_f32(outptr, _sum);

r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
outptr += 4;
asm volatile(
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #256] \n"

"ld1 {v8.4s, v9.4s}, [%2] \n"// _r00 = vld1q_f32(r0+j);
"add %2, %2, #16 \n"

"0: \n"

"ld1 {v7.4s}, [%1] \n"// _sum = vld1q_f32(outptr+j);

"ext v10.16b, v8.16b, v9.16b, #4 \n" //_r01
"ext v11.16b, v8.16b, v9.16b, #8 \n" //_r02
"ext v12.16b, v8.16b, v9.16b, #12 \n" //_r03

"fmla v7.4s, v8.4s, %14.s[0] \n"
"fmul v13.4s, v10.4s, %14.s[1] \n"

"prfm pldl1keep, [%3, #256] \n"

"fmul v14.4s, v11.4s, %14.s[2] \n"
"fmul v15.4s, v12.4s, %14.s[3] \n"
"fmla v7.4s, v9.4s, %15.s[0] \n"

"ld1 {v8.4s, v9.4s}, [%3] \n"
"add %3, %3, #16 \n"
"ext v10.16b, v8.16b, v9.16b, #4 \n" //_r11
"ext v11.16b, v8.16b, v9.16b, #8 \n" //_r12
"ext v12.16b, v8.16b, v9.16b, #12 \n" //_r13

"fmla v7.4s, v8.4s, %15.s[1] \n"
"fmla v13.4s, v10.4s, %15.s[2] \n"

"prfm pldl1keep, [%4, #256] \n"

"fmla v14.4s, v11.4s, %15.s[3] \n"
"fmla v15.4s, v12.4s, %16.s[0] \n"
"fmla v7.4s, v9.4s, %16.s[1] \n"

"ld1 {v8.4s, v9.4s}, [%4] \n"
"add %4, %4, #16 \n"
"ext v10.16b, v8.16b, v9.16b, #4 \n" //_r21
"ext v11.16b, v8.16b, v9.16b, #8 \n" //_r22
"ext v12.16b, v8.16b, v9.16b, #12 \n" //_r23

"fmla v7.4s, v8.4s, %16.s[2] \n"
"fmla v13.4s, v10.4s, %16.s[3] \n"

"prfm pldl1keep, [%5, #256] \n"

"fmla v14.4s, v11.4s, %17.s[0] \n"
"fmla v15.4s, v12.4s, %17.s[1] \n"
"fmla v7.4s, v9.4s, %17.s[2] \n"

"ld1 {v8.4s, v9.4s}, [%5] \n"
"add %5, %5, #16 \n"
"ext v10.16b, v8.16b, v9.16b, #4 \n" //_r31
"ext v11.16b, v8.16b, v9.16b, #8 \n" //_r32
"ext v12.16b, v8.16b, v9.16b, #12 \n" //_r33

"fmla v7.4s, v8.4s, %17.s[3] \n"
"fmla v13.4s, v10.4s, %18.s[0] \n"

"prfm pldl1keep, [%6, #256] \n"

"fmla v14.4s, v11.4s, %18.s[1] \n"
"fmla v15.4s, v12.4s, %18.s[2] \n"
"fmla v7.4s, v9.4s, %18.s[3] \n"

"ld1 {v8.4s, v9.4s}, [%6] \n"
"add %6, %6, #16 \n"
"ext v10.16b, v8.16b, v9.16b, #4 \n" //_r41
"ext v11.16b, v8.16b, v9.16b, #8 \n" //_r42
"ext v12.16b, v8.16b, v9.16b, #12 \n" //_r43

"fmla v7.4s, v8.4s, %19.s[0] \n"
"fmla v13.4s, v10.4s, %19.s[1] \n"
"fmla v14.4s, v11.4s, %19.s[2] \n"
"fmla v15.4s, v12.4s, %19.s[3] \n"
"fmla v7.4s, v9.4s, %20.s[0] \n"

"fadd v14.4s, v14.4s, v15.4s \n"
"fadd v7.4s, v7.4s, v13.4s \n"

"prfm pldl1keep, [%2, #256] \n"

"fadd v7.4s, v7.4s, v14.4s \n"

"ld1 {v8.4s, v9.4s}, [%2] \n"
"add %2, %2, #16 \n"

"st1 {v7.4s}, [%1], #16 \n"

"prfm pldl1keep, [%1, #128] \n"

"subs %w0, %w0, #1 \n"
"bne 0b \n"

"sub %2, %2, #16 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3), // %5
"=r"(r4) // %6
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"6"(r4),
"w"(_k0123), // %14
"w"(_k4567), // %15
"w"(_k891011), // %16
"w"(_k12131415), // %17
"w"(_k16171819), // %18
"w"(_k20212223), // %19
"w"(_k24242424) // %20
: "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -920,99 +1047,147 @@ static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _sum = vld1q_f32(outptr);

float32x4x2_t _r00_02461357 = vld2q_f32(r0);
float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14
float32x4_t _r0_9111315 = _r00nx2.val[1];// 9 11 13 15
float32x4_t _r00 = _r00_02461357.val[0];// 0 2 4 6
float32x4_t _r01 = _r00_02461357.val[1];// 1 3 5 7
float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1);// 2 4 6 8
float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1);// 3 5 7 9
float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2);// 4 6 8 10

float32x4x2_t _r10_02461357 = vld2q_f32(r1);
float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
float32x4_t _r1_8101214 = _r10nx2.val[0];
float32x4_t _r1_9111315 = _r10nx2.val[1];
float32x4_t _r10 = _r10_02461357.val[0];
float32x4_t _r11 = _r10_02461357.val[1];
float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1);
float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1);
float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2);

float32x4x2_t _r20_02461357 = vld2q_f32(r2);
float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
float32x4_t _r2_8101214 = _r20nx2.val[0];
float32x4_t _r2_9111315 = _r20nx2.val[1];
float32x4_t _r20 = _r20_02461357.val[0];
float32x4_t _r21 = _r20_02461357.val[1];
float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1);
float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1);
float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2);

float32x4x2_t _r30_02461357 = vld2q_f32(r3);
float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
float32x4_t _r3_8101214 = _r30nx2.val[0];
float32x4_t _r3_9111315 = _r30nx2.val[1];
float32x4_t _r30 = _r30_02461357.val[0];
float32x4_t _r31 = _r30_02461357.val[1];
float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1);
float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1);
float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2);

float32x4x2_t _r40_02461357 = vld2q_f32(r4);
float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
float32x4_t _r4_8101214 = _r40nx2.val[0];
float32x4_t _r4_9111315 = _r40nx2.val[1];
float32x4_t _r40 = _r40_02461357.val[0];
float32x4_t _r41 = _r40_02461357.val[1];
float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1);
float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1);
float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2);

_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);

_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);

_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);

_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);

_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);

vst1q_f32(outptr, _sum);

r0 += 8;
r1 += 8;
r2 += 8;
r3 += 8;
r4 += 8;
outptr += 4;
asm volatile(
"prfm pldl1keep, [%2, #256] \n"
"ld2 {v8.4s, v9.4s}, [%2], #32 \n"// v8 = 0 2 4 6 q9 = 1 3 5 7

"prfm pldl1keep, [%2, #256] \n"
"ld2 {v10.4s, v11.4s}, [%2] \n"// v10 = 8 10 12 14 v11 = 9 11 13 15

"prfm pldl1keep, [%1, #128] \n"
"0: \n"

"ld1 {v7.4s}, [%1] \n" // v7 = outptr

"ext v12.16b, v8.16b, v10.16b, #4 \n" // v12 = 2 4 6 8
"ext v11.16b, v9.16b, v11.16b, #4 \n" // v11 = 3 5 7 9
"ext v10.16b, v8.16b, v10.16b, #8 \n" // v10 = 4 6 8 10

"fmla v7.4s, v8.4s, %14.s[0] \n"
"fmul v13.4s, v9.4s, %14.s[1] \n"

"prfm pldl1keep, [%3, #256] \n"

"fmul v14.4s, v12.4s, %14.s[2] \n"
"fmul v15.4s, v11.4s, %14.s[3] \n"
"fmla v7.4s, v10.4s, %15.s[0] \n"

"ld2 {v8.4s, v9.4s}, [%3], #32 \n"

"prfm pldl1keep, [%3, #256] \n"

"ld2 {v10.4s, v11.4s}, [%3] \n"
"ext v12.16b, v8.16b, v10.16b, #4 \n"
"ext v11.16b, v9.16b, v11.16b, #4 \n"
"ext v10.16b, v8.16b, v10.16b, #8 \n"

"fmla v7.4s, v8.4s, %15.s[1] \n"
"fmla v13.4s, v9.4s, %15.s[2] \n"

"prfm pldl1keep, [%4, #256] \n"

"fmla v14.4s, v12.4s, %15.s[3] \n"
"fmla v15.4s, v11.4s, %16.s[0] \n"
"fmla v7.4s, v10.4s, %16.s[1] \n"

"ld2 {v8.4s, v9.4s}, [%4], #32 \n"

"prfm pldl1keep, [%4, #256] \n"

"ld2 {v10.4s, v11.4s}, [%4] \n"
"ext v12.16b, v8.16b, v10.16b, #4 \n"
"ext v11.16b, v9.16b, v11.16b, #4 \n"
"ext v10.16b, v8.16b, v10.16b, #8 \n"

"fmla v7.4s, v8.4s, %16.s[2] \n"
"fmla v13.4s, v9.4s, %16.s[3] \n"

"prfm pldl1keep, [%5, #256] \n"

"fmla v14.4s, v12.4s, %17.s[0] \n"
"fmla v15.4s, v11.4s, %17.s[1] \n"
"fmla v7.4s, v10.4s, %17.s[2] \n"

"ld2 {v8.4s, v9.4s}, [%5], #32 \n"

"prfm pldl1keep, [%5, #256] \n"

"ld2 {v10.4s, v11.4s}, [%5] \n"
"ext v12.16b, v8.16b, v10.16b, #4 \n"
"ext v11.16b, v9.16b, v11.16b, #4 \n"
"ext v10.16b, v8.16b, v10.16b, #8 \n"

"fmla v7.4s, v8.4s, %17.s[3] \n"
"fmla v13.4s, v9.4s, %18.s[0] \n"

"prfm pldl1keep, [%6, #256] \n"

"fmla v14.4s, v12.4s, %18.s[1] \n"
"fmla v15.4s, v11.4s, %18.s[2] \n"
"fmla v7.4s, v10.4s, %18.s[3] \n"

"ld2 {v8.4s, v9.4s}, [%6], #32 \n"

"prfm pldl1keep, [%6, #256] \n"

"ld2 {v10.4s, v11.4s}, [%6] \n"
"ext v12.16b, v8.16b, v10.16b, #4 \n"
"ext v11.16b, v9.16b, v11.16b, #4 \n"
"ext v10.16b, v8.16b, v10.16b, #8 \n"

"fmla v7.4s, v8.4s, %19.s[0] \n"
"fmla v13.4s, v9.4s, %19.s[1] \n"
"fmla v14.4s, v12.4s, %19.s[2] \n"
"fmla v15.4s, v11.4s, %19.s[3] \n"
"fmla v7.4s, v10.4s, %20.s[0] \n"

"prfm pldl1keep, [%2, #256] \n"

"ld2 {v8.4s, v9.4s}, [%2], #32 \n"

"fadd v14.4s, v14.4s, v15.4s \n"
"fadd v7.4s, v7.4s, v13.4s \n"
"prfm pldl1keep, [%2, #256] \n"

"fadd v7.4s, v7.4s, v14.4s \n"

"ld2 {v10.4s, v11.4s}, [%2] \n"
"st1 {v7.4s}, [%1], #16 \n"

"prfm pldl1keep, [%1, #128] \n"

"subs %w0, %w0, #1 \n"
"bne 0b \n"

"sub %2, %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3), // %5
"=r"(r4) // %6
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"6"(r4),
"w"(_k0123), // %14
"w"(_k4567), // %15
"w"(_k891011), // %16
"w"(_k12131415), // %17
"w"(_k16171819), // %18
"w"(_k20212223), // %19
"w"(_k24242424) // %20
: "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}

#else
if (nn > 0)
{


+ 386
- 43
src/layer/arm/convolution_7x7.h View File

@@ -75,13 +75,198 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
float32x4_t _k0123 = vld1q_f32(k0);
float32x4_t _k4567 = vld1q_f32(k0 + 4);
float32x4_t _k78910 = vld1q_f32(k1);
float32x4_t _k11121314 = vld1q_f32(k1 + 4);
float32x4_t _k14151617 = vld1q_f32(k2);
float32x4_t _k18192021 = vld1q_f32(k2 + 4);
float32x4_t _k21222324 = vld1q_f32(k3);
float32x4_t _k25262728 = vld1q_f32(k3 + 4);
float32x4_t _k28293031 = vld1q_f32(k4);
float32x4_t _k32333435 = vld1q_f32(k4 + 4);
float32x4_t _k35363738 = vld1q_f32(k5);
float32x4_t _k39404142 = vld1q_f32(k5 + 4);
float32x4_t _k42434445 = vld1q_f32(k6);
float32x4_t _k46474849 = vld1q_f32(k6 + 4);
#ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__
if (nn > 0)
{
asm volatile(
// v0: input / final output
// v1 v2 v3: = ri0 ri4 ri0n , i <- 1-7
// v4 = ri1 / ri3 / ri6
// v5 = ri2 / ri5
// v9 = intermediate sum register
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"

//i = 1
"prfm pldl1keep, [%2, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%2] \n"
"add %2, %2, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n"
"fmul v9.4s, v1.4s, %18.s[0] \n"
"ext v5.16b, v1.16b, v2.16b, #8 \n"
"fmla v0.4s, v4.4s, %18.s[1] \n"
"ext v4.16b, v1.16b, v2.16b, #12 \n"
"fmla v9.4s, v5.4s, %18.s[2] \n"
"ext v5.16b, v2.16b, v3.16b, #4 \n"
"fmla v0.4s, v4.4s, %18.s[3] \n"
"ext v4.16b, v2.16b, v3.16b, #8 \n"
"fmla v9.4s, v2.4s, %19.s[0] \n"
"fmla v0.4s, v5.4s, %19.s[1] \n"
"fmla v9.4s, v4.4s, %19.s[2] \n"

//i = 2
"prfm pldl1keep, [%3, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%3] \n" // v1 v2 v3: = r20 r24 r20n
"add %3, %3, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n" // v4 = r21
"fmla v9.4s, v1.4s, %20.s[0] \n" // *+ r10
"ext v5.16b, v1.16b, v2.16b, #8 \n" // v5 = r22
"fmla v0.4s, v4.4s, %20.s[1] \n" // *+ r11
"ext v4.16b, v1.16b, v2.16b, #12 \n" // v4 = r23
"fmla v9.4s, v5.4s, %20.s[2] \n" // *+ r1
"ext v5.16b, v2.16b, v3.16b, #4 \n" // v5 = r25
"fmla v0.4s, v4.4s, %20.s[3] \n" // *+ r13
"ext v4.16b, v2.16b, v3.16b, #8 \n" // v4 = r26
"fmla v9.4s, v2.4s, %21.s[0] \n" // *+ r14
"fmla v0.4s, v5.4s, %21.s[1] \n" // *+ r15
"fmla v9.4s, v4.4s, %21.s[2] \n" // *+ r16

//i = 3
"prfm pldl1keep, [%4, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%4] \n"
"add %4, %4, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n"
"fmla v9.4s, v1.4s, %22.s[0] \n"
"ext v5.16b, v1.16b, v2.16b, #8 \n"
"fmla v0.4s, v4.4s, %22.s[1] \n"
"ext v4.16b, v1.16b, v2.16b, #12 \n"
"fmla v9.4s, v5.4s, %22.s[2] \n"
"ext v5.16b, v2.16b, v3.16b, #4 \n"
"fmla v0.4s, v4.4s, %22.s[3] \n"
"ext v4.16b, v2.16b, v3.16b, #8 \n"
"fmla v9.4s, v2.4s, %23.s[0] \n"
"fmla v0.4s, v5.4s, %23.s[1] \n"
"fmla v9.4s, v4.4s, %23.s[2] \n"

//i = 4
"prfm pldl1keep, [%5, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%5] \n"
"add %5, %5, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n"
"fmla v9.4s, v1.4s, %24.s[0] \n"
"ext v5.16b, v1.16b, v2.16b, #8 \n"
"fmla v0.4s, v4.4s, %24.s[1] \n"
"ext v4.16b, v1.16b, v2.16b, #12 \n"
"fmla v9.4s, v5.4s, %24.s[2] \n"
"ext v5.16b, v2.16b, v3.16b, #4 \n"
"fmla v0.4s, v4.4s, %24.s[3] \n"
"ext v4.16b, v2.16b, v3.16b, #8 \n"
"fmla v9.4s, v2.4s, %25.s[0] \n"
"fmla v0.4s, v5.4s, %25.s[1] \n"
"fmla v9.4s, v4.4s, %25.s[2] \n"

//i = 5
"prfm pldl1keep, [%6, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%6] \n"
"add %6, %6, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n"
"fmla v9.4s, v1.4s, %26.s[0] \n"
"ext v5.16b, v1.16b, v2.16b, #8 \n"
"fmla v0.4s, v4.4s, %26.s[1] \n"
"ext v4.16b, v1.16b, v2.16b, #12 \n"
"fmla v9.4s, v5.4s, %26.s[2] \n"
"ext v5.16b, v2.16b, v3.16b, #4 \n"
"fmla v0.4s, v4.4s, %26.s[3] \n"
"ext v4.16b, v2.16b, v3.16b, #8 \n"
"fmla v9.4s, v2.4s, %27.s[0] \n"
"fmla v0.4s, v5.4s, %27.s[1] \n"
"fmla v9.4s, v4.4s, %27.s[2] \n"

//i = 6
"prfm pldl1keep, [%7, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%7] \n"
"add %7, %7, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n"
"fmla v9.4s, v1.4s, %28.s[0] \n"
"ext v5.16b, v1.16b, v2.16b, #8 \n"
"fmla v0.4s, v4.4s, %28.s[1] \n"
"ext v4.16b, v1.16b, v2.16b, #12 \n"
"fmla v9.4s, v5.4s, %28.s[2] \n"
"ext v5.16b, v2.16b, v3.16b, #4 \n"
"fmla v0.4s, v4.4s, %28.s[3] \n"
"ext v4.16b, v2.16b, v3.16b, #8 \n"
"fmla v9.4s, v2.4s, %29.s[0] \n"
"fmla v0.4s, v5.4s, %29.s[1] \n"
"fmla v9.4s, v4.4s, %29.s[2] \n"
//i = 7
"prfm pldl1keep, [%8, #384] \n"
"ld1 {v1.4s, v2.4s, v3.4s}, [%8] \n"
"add %8, %8, #16 \n"
"ext v4.16b, v1.16b, v2.16b, #4 \n"
"fmla v9.4s, v1.4s, %30.s[0] \n"
"ext v5.16b, v1.16b, v2.16b, #8 \n"
"fmla v0.4s, v4.4s, %30.s[1] \n"
"ext v4.16b, v1.16b, v2.16b, #12 \n"
"fmla v9.4s, v5.4s, %30.s[2] \n"
"ext v5.16b, v2.16b, v3.16b, #4 \n"
"fmla v0.4s, v4.4s, %30.s[3] \n"
"ext v4.16b, v2.16b, v3.16b, #8 \n"
"fmla v9.4s, v2.4s, %31.s[0] \n"
"fmla v0.4s, v5.4s, %31.s[1] \n"
"fmla v9.4s, v4.4s, %31.s[2] \n"

"fadd v0.4s, v0.4s, v9.4s \n"
"st1 {v0.4s}, [%1], #16 \n"
"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3), // %5
"=r"(r4), // %6
"=r"(r5), // %7
"=r"(r6) // %8
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"6"(r4),
"7"(r5),
"8"(r6),
"w"(_k0123), // %18
"w"(_k4567), // %19
"w"(_k78910), // %20
"w"(_k11121314), // %21
"w"(_k14151617), // %22
"w"(_k18192021), // %23
"w"(_k21222324), // %24
"w"(_k25262728), // %25
"w"(_k28293031), // %26
"w"(_k32333435), // %27
"w"(_k35363738), // %28
"w"(_k39404142), // %29
"w"(_k42434445), // %30
"w"(_k46474849) // %31
: "cc", "memory","v0", "v1", "v2", "v3", "v4", "v5", "v9"
);
}
#else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined
// When compiled with gcc, gcc does not accept over 30 operands
for (; nn>0; nn--)
{
float32x4_t _sum = vld1q_f32(outptr);

float32x4_t _k0123 = vld1q_f32(k0);
float32x4_t _k4567 = vld1q_f32(k0 + 4);

float32x4_t _r00 = vld1q_f32(r0);// 0 1 2 3
float32x4_t _r04 = vld1q_f32(r0 + 4);// 4 5 6 7
float32x4_t _r00n = vld1q_f32(r0 + 8);// 8 9 10 11
@@ -99,9 +284,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);

float32x4_t _k78910 = vld1q_f32(k1);
float32x4_t _k11121314 = vld1q_f32(k1 + 4);

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r10n = vld1q_f32(r1 + 8);
@@ -119,9 +301,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
_sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);

float32x4_t _k14151617 = vld1q_f32(k2);
float32x4_t _k18192021 = vld1q_f32(k2 + 4);

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r20n = vld1q_f32(r2 + 8);
@@ -139,9 +318,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
_sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);

float32x4_t _k21222324 = vld1q_f32(k3);
float32x4_t _k25262728 = vld1q_f32(k3 + 4);

float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r30n = vld1q_f32(r3 + 8);
@@ -159,9 +335,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
_sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);

float32x4_t _k28293031 = vld1q_f32(k4);
float32x4_t _k32333435 = vld1q_f32(k4 + 4);

float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r40n = vld1q_f32(r4 + 8);
@@ -179,9 +352,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
_sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);

float32x4_t _k35363738 = vld1q_f32(k5);
float32x4_t _k39404142 = vld1q_f32(k5 + 4);

float32x4_t _r50 = vld1q_f32(r5);
float32x4_t _r54 = vld1q_f32(r5 + 4);
float32x4_t _r50n = vld1q_f32(r5 + 8);
@@ -199,9 +369,6 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
_sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);

float32x4_t _k42434445 = vld1q_f32(k6);
float32x4_t _k46474849 = vld1q_f32(k6 + 4);

float32x4_t _r60 = vld1q_f32(r6);
float32x4_t _r64 = vld1q_f32(r6 + 4);
float32x4_t _r60n = vld1q_f32(r6 + 8);
@@ -230,7 +397,8 @@ static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
r6 += 4;
outptr += 4;
}
#else
#endif // __clang__
#else //__aarch32__
if (nn > 0)
{
asm volatile(
@@ -599,13 +767,205 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke

#if __ARM_NEON
#if __aarch64__
float32x4_t _k0123 = vld1q_f32(k0);
float32x4_t _k4567 = vld1q_f32(k0 + 4);
float32x4_t _k78910 = vld1q_f32(k1);
float32x4_t _k11121314 = vld1q_f32(k1 + 4);
float32x4_t _k14151617 = vld1q_f32(k2);
float32x4_t _k18192021 = vld1q_f32(k2 + 4);
float32x4_t _k21222324 = vld1q_f32(k3);
float32x4_t _k25262728 = vld1q_f32(k3 + 4);
float32x4_t _k28293031 = vld1q_f32(k4);
float32x4_t _k32333435 = vld1q_f32(k4 + 4);
float32x4_t _k35363738 = vld1q_f32(k5);
float32x4_t _k39404142 = vld1q_f32(k5 + 4);
float32x4_t _k42434445 = vld1q_f32(k6);
float32x4_t _k46474849 = vld1q_f32(k6 + 4);
#ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__
if (nn > 0)
{
asm volatile(
// v0: input / final output
// v1 v2: = _ri0/_ri1 first
// v3 v4: = then _r0_8101214/_r0_9111315
// v5 = ri2 / ri4 / ri6
// v6 = ri3 / ri5
// v9 = intermediate sum register
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"

//i = 1
"prfm pldl1keep, [%2, #512] \n"
"ld2 {v1.4s, v2.4s}, [%2] \n" // v1 v2 = _r00 _r01
"add %2, %2, #32 \n"
"ld2 {v3.4s, v4.4s}, [%2] \n" // v3 v4 = _r0_8101214 / _r0_9111315
"fmul v9.4s, v1.4s, %18.s[0] \n" // *+ _r00
"ext v5.16b, v1.16b, v3.16b, #4 \n" // v5 = _r02
"fmla v0.4s, v2.4s, %18.s[1] \n" // *+ _r01
"ext v6.16b, v2.16b, v4.16b, #4 \n" // v6 = _r03
"fmla v9.4s, v5.4s, %18.s[2] \n" // *+ _r02
"ext v5.16b, v1.16b, v3.16b, #8 \n" // v5 = _r04
"fmla v0.4s, v6.4s, %18.s[3] \n" // *+ _r03
"ext v6.16b, v2.16b, v4.16b, #8 \n" // v6 = _r05
"fmla v9.4s, v5.4s, %19.s[0] \n" // *+ _r04
"ext v5.16b, v1.16b, v3.16b, #12 \n" // v5 = _r06
"fmla v0.4s, v6.4s, %19.s[1] \n" // *+ _r05
"fmla v9.4s, v5.4s, %19.s[2] \n" // *+ _r06

//i = 2
"prfm pldl1keep, [%3, #512] \n"
"ld2 {v1.4s, v2.4s}, [%3] \n"
"add %3, %3, #32 \n"
"ld2 {v3.4s, v4.4s}, [%3] \n"
"fmla v9.4s, v1.4s, %20.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #4 \n"
"fmla v0.4s, v2.4s, %20.s[1] \n"
"ext v6.16b, v2.16b, v4.16b, #4 \n"
"fmla v9.4s, v5.4s, %20.s[2] \n"
"ext v5.16b, v1.16b, v3.16b, #8 \n"
"fmla v0.4s, v6.4s, %20.s[3] \n"
"ext v6.16b, v2.16b, v4.16b, #8 \n"
"fmla v9.4s, v5.4s, %21.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #12 \n"
"fmla v0.4s, v6.4s, %21.s[1] \n"
"fmla v9.4s, v5.4s, %21.s[2] \n"

//i = 3
"prfm pldl1keep, [%4, #512] \n"
"ld2 {v1.4s, v2.4s}, [%4] \n"
"add %4, %4, #32 \n"
"ld2 {v3.4s, v4.4s}, [%4] \n"
"fmla v9.4s, v1.4s, %22.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #4 \n"
"fmla v0.4s, v2.4s, %22.s[1] \n"
"ext v6.16b, v2.16b, v4.16b, #4 \n"
"fmla v9.4s, v5.4s, %22.s[2] \n"
"ext v5.16b, v1.16b, v3.16b, #8 \n"
"fmla v0.4s, v6.4s, %22.s[3] \n"
"ext v6.16b, v2.16b, v4.16b, #8 \n"
"fmla v9.4s, v5.4s, %23.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #12 \n"
"fmla v0.4s, v6.4s, %23.s[1] \n"
"fmla v9.4s, v5.4s, %23.s[2] \n"

//i = 4
"prfm pldl1keep, [%5, #512] \n"
"ld2 {v1.4s, v2.4s}, [%5] \n"
"add %5, %5, #32 \n"
"ld2 {v3.4s, v4.4s}, [%5] \n"
"fmla v9.4s, v1.4s, %24.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #4 \n"
"fmla v0.4s, v2.4s, %24.s[1] \n"
"ext v6.16b, v2.16b, v4.16b, #4 \n"
"fmla v9.4s, v5.4s, %24.s[2] \n"
"ext v5.16b, v1.16b, v3.16b, #8 \n"
"fmla v0.4s, v6.4s, %24.s[3] \n"
"ext v6.16b, v2.16b, v4.16b, #8 \n"
"fmla v9.4s, v5.4s, %25.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #12 \n"
"fmla v0.4s, v6.4s, %25.s[1] \n"
"fmla v9.4s, v5.4s, %25.s[2] \n"

//i = 5
"prfm pldl1keep, [%6, #512] \n"
"ld2 {v1.4s, v2.4s}, [%6] \n"
"add %6, %6, #32 \n"
"ld2 {v3.4s, v4.4s}, [%6] \n"
"fmla v9.4s, v1.4s, %26.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #4 \n"
"fmla v0.4s, v2.4s, %26.s[1] \n"
"ext v6.16b, v2.16b, v4.16b, #4 \n"
"fmla v9.4s, v5.4s, %26.s[2] \n"
"ext v5.16b, v1.16b, v3.16b, #8 \n"
"fmla v0.4s, v6.4s, %26.s[3] \n"
"ext v6.16b, v2.16b, v4.16b, #8 \n"
"fmla v9.4s, v5.4s, %27.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #12 \n"
"fmla v0.4s, v6.4s, %27.s[1] \n"
"fmla v9.4s, v5.4s, %27.s[2] \n"

//i = 6
"prfm pldl1keep, [%7, #512] \n"
"ld2 {v1.4s, v2.4s}, [%7] \n"
"add %7, %7, #32 \n"
"ld2 {v3.4s, v4.4s}, [%7] \n"
"fmla v9.4s, v1.4s, %28.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #4 \n"
"fmla v0.4s, v2.4s, %28.s[1] \n"
"ext v6.16b, v2.16b, v4.16b, #4 \n"
"fmla v9.4s, v5.4s, %28.s[2] \n"
"ext v5.16b, v1.16b, v3.16b, #8 \n"
"fmla v0.4s, v6.4s, %28.s[3] \n"
"ext v6.16b, v2.16b, v4.16b, #8 \n"
"fmla v9.4s, v5.4s, %29.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #12 \n"
"fmla v0.4s, v6.4s, %29.s[1] \n"
"fmla v9.4s, v5.4s, %29.s[2] \n"

//i = 7
"prfm pldl1keep, [%8, #512] \n"
"ld2 {v1.4s, v2.4s}, [%8] \n"
"add %8, %8, #32 \n"
"ld2 {v3.4s, v4.4s}, [%8] \n"
"fmla v9.4s, v1.4s, %30.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #4 \n"
"fmla v0.4s, v2.4s, %30.s[1] \n"
"ext v6.16b, v2.16b, v4.16b, #4 \n"
"fmla v9.4s, v5.4s, %30.s[2] \n"
"ext v5.16b, v1.16b, v3.16b, #8 \n"
"fmla v0.4s, v6.4s, %30.s[3] \n"
"ext v6.16b, v2.16b, v4.16b, #8 \n"
"fmla v9.4s, v5.4s, %31.s[0] \n"
"ext v5.16b, v1.16b, v3.16b, #12 \n"
"fmla v0.4s, v6.4s, %31.s[1] \n"
"fmla v9.4s, v5.4s, %31.s[2] \n"

"fadd v0.4s, v0.4s, v9.4s \n"
"st1 {v0.4s}, [%1], #16 \n"
"subs %w0, %w0, #1 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2), // %4
"=r"(r3), // %5
"=r"(r4), // %6
"=r"(r5), // %7
"=r"(r6) // %8
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"5"(r3),
"6"(r4),
"7"(r5),
"8"(r6),
"w"(_k0123), // %18
"w"(_k4567), // %19
"w"(_k78910), // %20
"w"(_k11121314), // %21
"w"(_k14151617), // %22
"w"(_k18192021), // %23
"w"(_k21222324), // %24
"w"(_k25262728), // %25
"w"(_k28293031), // %26
"w"(_k32333435), // %27
"w"(_k35363738), // %28
"w"(_k39404142), // %29
"w"(_k42434445), // %30
"w"(_k46474849) // %31
: "cc", "memory","v0", "v1", "v2", "v3", "v4", "v5", "v6", "v9"
);
}
#else // __ARM_NEON && __aarch64__ defined, but __clang__ not defined
// When compiled with gcc, gcc does not accept over 30 operands
for (; nn>0; nn--)
{
float32x4_t _sum = vld1q_f32(outptr);

float32x4_t _k0123 = vld1q_f32(k0);
float32x4_t _k4567 = vld1q_f32(k0 + 4);

float32x4x2_t _r00_02461357 = vld2q_f32(r0);
float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
float32x4_t _r0_8101214 = _r00nx2.val[0];// 8 10 12 14
@@ -626,9 +986,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);

float32x4_t _k78910 = vld1q_f32(k1);
float32x4_t _k11121314 = vld1q_f32(k1 + 4);

float32x4x2_t _r10_02461357 = vld2q_f32(r1);
float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
float32x4_t _r1_8101214 = _r10nx2.val[0];
@@ -649,9 +1006,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
_sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);

float32x4_t _k14151617 = vld1q_f32(k2);
float32x4_t _k18192021 = vld1q_f32(k2 + 4);

float32x4x2_t _r20_02461357 = vld2q_f32(r2);
float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
float32x4_t _r2_8101214 = _r20nx2.val[0];
@@ -672,9 +1026,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
_sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);

float32x4_t _k21222324 = vld1q_f32(k3);
float32x4_t _k25262728 = vld1q_f32(k3 + 4);

float32x4x2_t _r30_02461357 = vld2q_f32(r3);
float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
float32x4_t _r3_8101214 = _r30nx2.val[0];
@@ -695,9 +1046,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
_sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);

float32x4_t _k28293031 = vld1q_f32(k4);
float32x4_t _k32333435 = vld1q_f32(k4 + 4);

float32x4x2_t _r40_02461357 = vld2q_f32(r4);
float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
float32x4_t _r4_8101214 = _r40nx2.val[0];
@@ -718,9 +1066,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
_sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);

float32x4_t _k35363738 = vld1q_f32(k5);
float32x4_t _k39404142 = vld1q_f32(k5 + 4);

float32x4x2_t _r50_02461357 = vld2q_f32(r5);
float32x4x2_t _r50nx2 = vld2q_f32(r5 + 8);
float32x4_t _r5_8101214 = _r50nx2.val[0];
@@ -741,9 +1086,6 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
_sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
_sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);

float32x4_t _k42434445 = vld1q_f32(k6);
float32x4_t _k46474849 = vld1q_f32(k6 + 4);

float32x4x2_t _r60_02461357 = vld2q_f32(r6);
float32x4x2_t _r60nx2 = vld2q_f32(r6 + 8);
float32x4_t _r6_8101214 = _r60nx2.val[0];
@@ -775,6 +1117,7 @@ static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _ke
r6 += 8;
outptr += 4;
}
#endif // __clang__
#else
if (nn > 0)
{


+ 231
- 112
src/layer/arm/convolutiondepthwise_3x3.h View File

@@ -77,60 +77,111 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r00n = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);
asm volatile(
"prfm pldl1keep, [%3, #192] \n"
"ld1 {v9.4s, v10.4s}, [%3] \n" //r0
"add %3, %3, #16 \n"

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r10n = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);
"ext v11.16b, v9.16b, v10.16b, #4 \n"
"ext v12.16b, v9.16b, v10.16b, #8 \n"

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r20n = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);
"0: \n"

float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r30n = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r30n, 1);
float32x4_t _r32 = vextq_f32(_r30, _r30n, 2);

float32x4_t _sum1 = vmulq_laneq_f32(_r00, _k012x, 0);
float32x4_t _sum2 = vfmaq_laneq_f32(_bias0, _r01, _k012x, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r02, _k012x, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k345x, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _r11, _k345x, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k345x, 2);
_sum1 = vfmaq_laneq_f32(_sum1, _r20, _k678x, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k678x, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r22, _k678x, 2);

float32x4_t _sum3 = vmulq_laneq_f32(_r10, _k012x, 0);
float32x4_t _sum4 = vfmaq_laneq_f32(_bias0, _r11, _k012x, 1);
_sum3 = vfmaq_laneq_f32(_sum3, _r12, _k012x, 2);
_sum4 = vfmaq_laneq_f32(_sum4, _r20, _k345x, 0);
_sum3 = vfmaq_laneq_f32(_sum3, _r21, _k345x, 1);
_sum4 = vfmaq_laneq_f32(_sum4, _r22, _k345x, 2);
_sum3 = vfmaq_laneq_f32(_sum3, _r30, _k678x, 0);
_sum4 = vfmaq_laneq_f32(_sum4, _r31, _k678x, 1);
_sum3 = vfmaq_laneq_f32(_sum3, _r32, _k678x, 2);

_sum1 = vaddq_f32(_sum1, _sum2);
_sum3 = vaddq_f32(_sum3, _sum4);

vst1q_f32(outptr, _sum1);
vst1q_f32(outptr2, _sum3);

r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
outptr += 4;
outptr2 += 4;
"fmul v7.4s, v9.4s, %14.s[0] \n"
"and v13.16b, %17.16b, %17.16b \n" // v13 = _bias0
"fmul v6.4s, v11.4s, %14.s[1] \n"
"fmla v13.4s, v12.4s, %14.s[2] \n"

"prfm pldl1keep, [%4, #192] \n"
"ld1 {v9.4s, v10.4s}, [%4] \n"
"add %4, %4, #16 \n"

"fmla v7.4s, v9.4s, %15.s[0] \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n"
"ext v12.16b, v9.16b, v10.16b, #8 \n"

"fmla v6.4s, v11.4s, %15.s[1] \n"
"fmla v13.4s, v12.4s, %15.s[2] \n"

"fmul v8.4s, v9.4s, %14.s[0] \n"

"and v15.16b, %17.16b, %17.16b \n" // v15 = _bias0
"fmul v14.4s, v11.4s, %14.s[1] \n"
"fmla v15.4s, v12.4s, %14.s[2] \n"

"prfm pldl1keep, [%5, #192] \n"
"ld1 {v9.4s, v10.4s}, [%5] \n"
"add %5, %5, #16 \n"

"fmla v7.4s, v9.4s, %16.s[0] \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n"
"ext v12.16b, v9.16b, v10.16b, #8 \n"

"fmla v6.4s, v11.4s, %16.s[1] \n"
"fmla v13.4s, v12.4s, %16.s[2] \n"

"fmla v8.4s, v9.4s, %15.s[0] \n"
"fmla v14.4s, v11.4s, %15.s[1] \n"
"fmla v15.4s, v12.4s, %15.s[2] \n"

"prfm pldl1keep, [%6, #192] \n"
"ld1 {v9.4s, v10.4s}, [%6] \n"
"add %6, %6, #16 \n"

"fmla v8.4s, v9.4s, %16.s[0] \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n"
"ext v12.16b, v9.16b, v10.16b, #8 \n"

"fmla v14.4s, v11.4s, %16.s[1] \n"
"fmla v15.4s, v12.4s, %16.s[2] \n"

"fadd v7.4s, v7.4s, v6.4s \n"

"prfm pldl1keep, [%3, #192] \n"
"ld1 {v9.4s, v10.4s}, [%3] \n" //ro, for next loop

"fadd v8.4s, v8.4s, v14.4s \n"
"fadd v7.4s, v7.4s, v13.4s \n"
"fadd v8.4s, v8.4s, v15.4s \n"

"ext v11.16b, v9.16b, v10.16b, #4 \n" // for next loop
"ext v12.16b, v9.16b, v10.16b, #8 \n" // for next loop

"add %3, %3, #16 \n"
"st1 {v7.4s}, [%1], #16 \n"
"st1 {v8.4s}, [%2], #16 \n"

"subs %w0, %w0, #1 \n"
"bne 0b \n"

"sub %3, %3, #16 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(outptr2), // %2
"=r"(r0), // %3
"=r"(r1), // %4
"=r"(r2), // %5
"=r"(r3) // %6
: "0"(nn),
"1"(outptr),
"2"(outptr2),
"3"(r0),
"4"(r1),
"5"(r2),
"6"(r3),
"w"(_k012x), // %14
"w"(_k345x), // %15
"w"(_k678x), // %16
"w"(_bias0) // %17
: "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -326,41 +377,80 @@ static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r00n = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r00n, 1);
float32x4_t _r02 = vextq_f32(_r00, _r00n, 2);
asm volatile(
"prfm pldl1keep, [%2, #192] \n"
"ld1 {v8.4s, v9.4s}, [%2] \n" //r0
"add %2, %2, #16 \n"

float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r10n = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r10n, 1);
float32x4_t _r12 = vextq_f32(_r10, _r10n, 2);
"ext v10.16b, v8.16b, v9.16b, #4 \n"
"ext v11.16b, v8.16b, v9.16b, #8 \n"

float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r20n = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r20n, 1);
float32x4_t _r22 = vextq_f32(_r20, _r20n, 2);

float32x4_t _sum1 = vmulq_laneq_f32(_r00, _k012x, 0);
float32x4_t _sum2 = vfmaq_laneq_f32(_bias0, _r01, _k012x, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r02, _k012x, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k345x, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _r11, _k345x, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k345x, 2);
_sum1 = vfmaq_laneq_f32(_sum1, _r20, _k678x, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k678x, 1);
_sum1 = vfmaq_laneq_f32(_sum1, _r22, _k678x, 2);

_sum1 = vaddq_f32(_sum1, _sum2);

vst1q_f32(outptr, _sum1);

r0 += 4;
r1 += 4;
r2 += 4;
outptr += 4;
"0: \n"

"fmul v7.4s, v8.4s, %10.s[0] \n"

"and v14.16b, %13.16b, %13.16b \n" // v14 = _bias0
"fmul v13.4s, v10.4s, %10.s[1] \n"
"fmla v14.4s, v11.4s, %10.s[2] \n"

"prfm pldl1keep, [%3, #192] \n"
"ld1 {v8.4s, v9.4s}, [%3] \n" //r1
"add %3, %3, #16 \n"

"fmla v7.4s, v8.4s, %11.s[0] \n"

"ext v10.16b, v8.16b, v9.16b, #4 \n"
"ext v11.16b, v8.16b, v9.16b, #8 \n"

"fmla v13.4s, v10.4s, %11.s[1] \n"
"fmla v14.4s, v11.4s, %11.s[2] \n"

"prfm pldl1keep, [%4, #192] \n"
"ld1 {v8.4s, v9.4s}, [%4] \n" //r2
"add %4, %4, #16 \n"

"fmla v7.4s, v8.4s, %12.s[0] \n"

"ext v10.16b, v8.16b, v9.16b, #4 \n"
"ext v11.16b, v8.16b, v9.16b, #8 \n"

"fmla v13.4s, v10.4s, %12.s[1] \n"
"fmla v14.4s, v11.4s, %12.s[2] \n"
"prfm pldl1keep, [%2, #192] \n"
"ld1 {v8.4s, v9.4s}, [%2] \n" //r0, for next loop
"add %2, %2, #16 \n"

"fadd v7.4s, v7.4s, v13.4s \n"
"fadd v7.4s, v7.4s, v14.4s \n"

"ext v10.16b, v8.16b, v9.16b, #4 \n" // for next loop
"ext v11.16b, v8.16b, v9.16b, #8 \n" // for next loop

"st1 {v7.4s}, [%1], #16 \n"

"subs %w0, %w0, #1 \n"
"bne 0b \n"

"sub %2, %2, #16 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2) // %4
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"w"(_k012x), // %10
"w"(_k345x), // %11
"w"(_k678x), // %12
"w"(_bias0) // %13
: "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)
@@ -547,47 +637,76 @@ static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4x2_t _r0 = vld2q_f32(r0);
float32x4x2_t _r0n = vld2q_f32(r0+8);
asm volatile(
"prfm pldl1keep, [%2, #256] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"

"and v11.16b, %13.16b, %13.16b \n" // v11 = _bias0

float32x4_t _r00 = _r0.val[0];// 0 2 4 6
float32x4_t _r01 = _r0.val[1];// 1 3 5 7
float32x4_t _r02 = vextq_f32(_r00, _r0n.val[0], 1);// 2 4 6 8
"0: \n"
"fmul v0.4s, v2.4s, %10.s[0] \n"
"fmul v10.4s, v3.4s, %10.s[1] \n"

float32x4_t _outp = vfmaq_laneq_f32(_bias0, _r00, _k012x, 0);
_outp = vfmaq_laneq_f32(_outp, _r01, _k012x, 1);
_outp = vfmaq_laneq_f32(_outp, _r02, _k012x, 2);
"prfm pldl1keep, [%2, #256] \n"
"ld2 {v8.4s, v9.4s}, [%2] \n"
"ext v1.16b, v2.16b, v8.16b, #4 \n"

float32x4x2_t _r1 = vld2q_f32(r1);
float32x4x2_t _r1n = vld2q_f32(r1+8);
"fmla v11.4s, v1.4s, %10.s[2] \n"

float32x4_t _r10 = _r1.val[0];
float32x4_t _r11 = _r1.val[1];
float32x4_t _r12 = vextq_f32(_r10, _r1n.val[0], 1);
"prfm pldl1keep, [%3, #256] \n"
"ld2 {v2.4s, v3.4s}, [%3], #32 \n"

_outp = vfmaq_laneq_f32(_outp, _r10, _k345x, 0);
_outp = vfmaq_laneq_f32(_outp, _r11, _k345x, 1);
_outp = vfmaq_laneq_f32(_outp, _r12, _k345x, 2);
"fmla v0.4s, v2.4s, %11.s[0] \n"
"fmla v10.4s, v3.4s, %11.s[1] \n"

float32x4x2_t _r2 = vld2q_f32(r2);
float32x4x2_t _r2n = vld2q_f32(r2+8);
"prfm pldl1keep, [%3, #256] \n"
"ld2 {v8.4s, v9.4s}, [%3] \n"
"ext v1.16b, v2.16b, v8.16b, #4 \n"

float32x4_t _r20 = _r2.val[0];
float32x4_t _r21 = _r2.val[1];
float32x4_t _r22 = vextq_f32(_r20, _r2n.val[0], 1);
"fmla v11.4s, v1.4s, %11.s[2] \n"
"prfm pldl1keep, [%4, #256] \n"
"ld2 {v2.4s, v3.4s}, [%4], #32 \n"

_outp = vfmaq_laneq_f32(_outp, _r20, _k678x, 0);
_outp = vfmaq_laneq_f32(_outp, _r21, _k678x, 1);
_outp = vfmaq_laneq_f32(_outp, _r22, _k678x, 2);
"fmla v0.4s, v2.4s, %12.s[0] \n"
"fmla v10.4s, v3.4s, %12.s[1] \n"

vst1q_f32(outptr, _outp);
"prfm pldl1keep, [%4, #256] \n"
"ld2 {v8.4s, v9.4s}, [%4] \n"
"ext v1.16b, v2.16b, v8.16b, #4 \n"

r0 += 8;
r1 += 8;
r2 += 8;
outptr += 4;
"fmla v11.4s, v1.4s, %12.s[2] \n"

"prfm pldl1keep, [%2, #256] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"

"fadd v0.4s, v0.4s, v10.4s \n"
"fadd v0.4s, v0.4s, v11.4s \n"

"and v11.16b, %13.16b, %13.16b \n" // v11 = _bias0

"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
"sub %2, %2, #32 \n"
: "=r"(nn), // %0
"=r"(outptr), // %1
"=r"(r0), // %2
"=r"(r1), // %3
"=r"(r2) // %4
: "0"(nn),
"1"(outptr),
"2"(r0),
"3"(r1),
"4"(r2),
"w"(_k012x), // %10
"w"(_k345x), // %11
"w"(_k678x), // %12
"w"(_bias0) // %13
: "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
);
}
#else
if (nn > 0)


+ 164
- 69
src/layer/arm/eltwise_arm.cpp View File

@@ -55,16 +55,28 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _ptr1 = vld1q_f32(ptr1);
float32x4_t _p = vmulq_f32(_ptr, _ptr1);
vst1q_f32(outptr, _p);

ptr += 4;
ptr1 += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2], #16 \n"
"fmul v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%3], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(ptr1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(ptr),
"2"(ptr1),
"3"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -120,15 +132,26 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _p = vld1q_f32(outptr);
_p = vmulq_f32(_ptr, _p);
vst1q_f32(outptr, _p);

ptr += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2] \n"
"fmul v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%2], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(outptr) // %2
: "0"(nn),
"1"(ptr),
"2"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -186,16 +209,28 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _ptr1 = vld1q_f32(ptr1);
float32x4_t _p = vaddq_f32(_ptr, _ptr1);
vst1q_f32(outptr, _p);

ptr += 4;
ptr1 += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2], #16 \n"
"fadd v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%3], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(ptr1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(ptr),
"2"(ptr1),
"3"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -251,15 +286,26 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _p = vld1q_f32(outptr);
_p = vaddq_f32(_ptr, _p);
vst1q_f32(outptr, _p);

ptr += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2] \n"
"fadd v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%2], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(outptr) // %2
: "0"(nn),
"1"(ptr),
"2"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -321,17 +367,31 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
float32x4_t _coeff0 = vdupq_n_f32(coeff0);
float32x4_t _coeff1 = vdupq_n_f32(coeff1);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _ptr1 = vld1q_f32(ptr1);
float32x4_t _p = vmulq_f32(_ptr, _coeff0);
_p = vmlaq_f32(_p, _ptr1, _coeff1);
vst1q_f32(outptr, _p);

ptr += 4;
ptr1 += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2], #16 \n"
"fmul v0.4s, v0.4s, %8.4s \n"
"fmla v0.4s, v1.4s, %9.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%3], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(ptr1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(ptr),
"2"(ptr1),
"3"(outptr),
"w"(_coeff0), // %8
"w"(_coeff1) // %9
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -392,15 +452,27 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&
#if __ARM_NEON
float32x4_t _coeff = vdupq_n_f32(coeff);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _p = vld1q_f32(outptr);
_p = vmlaq_f32(_p, _ptr, _coeff);
vst1q_f32(outptr, _p);

ptr += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2] \n"
"fmla v1.4s, v0.4s, %6.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v1.4s}, [%2], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(outptr) // %2
: "0"(nn),
"1"(ptr),
"2"(outptr),
"w"(_coeff) // %6
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -458,16 +530,28 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _ptr1 = vld1q_f32(ptr1);
float32x4_t _p = vmaxq_f32(_ptr, _ptr1);
vst1q_f32(outptr, _p);

ptr += 4;
ptr1 += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2], #16 \n"
"fmax v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%3], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(ptr1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(ptr),
"2"(ptr1),
"3"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -523,15 +607,26 @@ int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>&

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
float32x4_t _p = vld1q_f32(outptr);
_p = vmaxq_f32(_ptr, _p);
vst1q_f32(outptr, _p);

ptr += 4;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"prfm pldl1keep, [%2, #128] \n"
"ld1 {v0.4s}, [%1], #16 \n"
"ld1 {v1.4s}, [%2] \n"
"fmax v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%2], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(outptr) // %2
: "0"(nn),
"1"(ptr),
"2"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)


+ 23
- 11
src/layer/arm/innerproduct_arm.cpp View File

@@ -172,18 +172,30 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob) const

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _m = vld1q_f32(m);
float32x4_t _w = vld1q_f32(w);
_sum = vfmaq_f32(_sum, _m, _w);

_m = vld1q_f32(m + 4);
_w = vld1q_f32(w + 4);
_sum2 = vfmaq_f32(_sum2, _m, _w);

m += 8;
w += 8;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1], #32 \n"
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v2.4s, v3.4s}, [%2], #32 \n"
"fmla %3.4s, v0.4s, v2.4s \n"
"subs %w0, %w0, #1 \n"
"fmla %4.4s, v1.4s, v3.4s \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(m), // %1
"=r"(w), // %2
"=w"(_sum), // %3
"=w"(_sum2) // %4
: "0"(nn),
"1"(m),
"2"(w),
"3"(_sum),
"4"(_sum2)
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#else
if (nn > 0)


+ 23
- 16
src/layer/arm/pooling_2x2.h View File

@@ -46,23 +46,30 @@ static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob)

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r01 = vld1q_f32(r0 + 4);
float32x4_t _r11 = vld1q_f32(r1 + 4);

float32x4_t _max0 = vmaxq_f32(_r00, _r10);
float32x4_t _max1 = vmaxq_f32(_r01, _r11);

float32x4_t _max = vpmaxq_f32(_max0, _max1);

vst1q_f32(outptr, _max);

r0 += 8;
r1 += 8;
outptr += 4;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #256] \n"
"prfm pldl1keep, [%2, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1], #32 \n"
"ld1 {v2.4s, v3.4s}, [%2], #32 \n"
"fmax v0.4s, v0.4s, v2.4s \n"
"fmax v1.4s, v1.4s, v3.4s \n"
"fmaxp v2.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v2.4s}, [%3], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(r0), // %1
"=r"(r1), // %2
"=r"(outptr) // %3
: "0"(nn),
"1"(r0),
"2"(r1),
"3"(outptr)
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#else
if (nn > 0)


+ 54
- 25
src/layer/arm/pooling_3x3.h View File

@@ -47,39 +47,68 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob)

#if __ARM_NEON
#if __aarch64__
float32x4x2_t _r0 = vld2q_f32(r0);
float32x4x2_t _r1 = vld2q_f32(r1);
float32x4x2_t _r2 = vld2q_f32(r2);
for (; nn>0; nn--)
if (nn > 0)
{
float32x4x2_t _r0n = vld2q_f32(r0+8);
float32x4x2_t _r1n = vld2q_f32(r1+8);
float32x4x2_t _r2n = vld2q_f32(r2+8);
asm volatile(
"prfm pldl1keep, [%1, #256] \n"
"ld2 {v0.4s, v1.4s}, [%1], #32 \n"
"prfm pldl1keep, [%2, #256] \n"
"ld2 {v2.4s, v3.4s}, [%2], #32 \n"
"prfm pldl1keep, [%3, #256] \n"
"ld2 {v4.4s, v5.4s}, [%3], #32 \n"
"0: \n"

"prfm pldl1keep, [%1, #256] \n"
"ld2 {v6.4s, v7.4s}, [%1], #32 \n"

"fmax v12.4s, v0.4s, v1.4s \n"
"fmax v13.4s, v2.4s, v3.4s \n"

"prfm pldl1keep, [%2, #256] \n"
"ld2 {v8.4s, v9.4s}, [%2], #32 \n"

float32x4_t _max0 = vmaxq_f32(_r0.val[0], _r0.val[1]);
float32x4_t _max1 = vmaxq_f32(_r1.val[0], _r1.val[1]);
float32x4_t _max2 = vmaxq_f32(_r2.val[0], _r2.val[1]);
"fmax v14.4s, v4.4s, v5.4s \n"
"ext v0.16b, v0.16b, v6.16b, #4 \n"

float32x4_t _r02 = vextq_f32(_r0.val[0], _r0n.val[0], 1);
float32x4_t _r12 = vextq_f32(_r1.val[0], _r1n.val[0], 1);
float32x4_t _r22 = vextq_f32(_r2.val[0], _r2n.val[0], 1);
"prfm pldl1keep, [%3, #256] \n"
"ld2 {v10.4s, v11.4s}, [%3], #32 \n"

_max0 = vmaxq_f32(_max0, _r02);
_max1 = vmaxq_f32(_max1, _r12);
_max2 = vmaxq_f32(_max2, _r22);
"ext v2.16b, v2.16b, v8.16b, #4 \n"

float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);
"fmax v12.4s, v12.4s, v0.4s \n"
"ext v4.16b, v4.16b, v10.16b, #4 \n"

vst1q_f32(outptr, _max);
"fmax v13.4s, v13.4s, v2.4s \n"
"fmax v14.4s, v14.4s, v4.4s \n"
"fmax v12.4s, v12.4s, v13.4s \n"

_r0 = _r0n;
_r1 = _r1n;
_r2 = _r2n;
"orr v0.16b, v6.16b, v6.16b \n"
"orr v1.16b, v7.16b, v7.16b \n"
"fmax v12.4s, v12.4s, v14.4s \n"

r0 += 8;
r1 += 8;
r2 += 8;
outptr += 4;
"orr v2.16b, v8.16b, v8.16b \n"
"orr v3.16b, v9.16b, v9.16b \n"
"orr v4.16b, v10.16b, v10.16b \n"
"orr v5.16b, v11.16b, v11.16b \n"

"subs %w0, %w0, #1 \n"
"st1 {v12.4s}, [%4], #16 \n"
"bne 0b \n"
"sub %1, %1, #32 \n"
"sub %2, %2, #32 \n"
"sub %3, %3, #32 \n"
: "=r"(nn), // %0
"=r"(r0), // %1
"=r"(r1), // %2
"=r"(r2), // %3
"=r"(outptr) // %4
: "0"(nn),
"1"(r0),
"2"(r1),
"3"(r2),
"4"(outptr)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"
);
}
#else
if (nn > 0)


+ 16
- 8
src/layer/arm/slice_arm.cpp View File

@@ -57,15 +57,23 @@ int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t

#if __ARM_NEON
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _p2 = vld1q_f32(ptr+4);
vst1q_f32(outptr, _p);
vst1q_f32(outptr+4, _p2);

ptr += 8;
outptr += 8;
asm volatile(
"0: \n"
"prfm pldl1keep, [%1, #256] \n"
"ld1 {v0.4s, v1.4s}, [%1], #32 \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s, v1.4s}, [%2], #32 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr), // %1
"=r"(outptr) // %2
: "0"(nn),
"1"(ptr),
"2"(outptr)
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)


+ 55
- 21
src/mat.cpp View File

@@ -44,13 +44,24 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val

#if __ARM_NEON
#if __aarch64__
float32x4_t _mean = vdupq_n_f32(mean);
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
_ptr = vsubq_f32(_ptr, _mean);
vst1q_f32(ptr, _ptr);
ptr += 4;
asm volatile(
"dup v1.4s, %w4 \n"
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"
"fsub v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(mean) // %4
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -99,13 +110,24 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val

#if __ARM_NEON
#if __aarch64__
float32x4_t _norm = vdupq_n_f32(norm);
for (; nn>0; nn--)
if (nn > 0)
{
float32x4_t _ptr = vld1q_f32(ptr);
_ptr = vmulq_f32(_ptr, _norm);
vst1q_f32(ptr, _ptr);
ptr += 4;
asm volatile(
"dup v1.4s, %w4 \n"
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"
"fmul v0.4s, v0.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(norm) // %4
: "cc", "memory", "v0", "v1"
);
}
#else
if (nn > 0)
@@ -155,15 +177,27 @@ void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_val

#if __ARM_NEON
#if __aarch64__
float32x4_t _mean = vdupq_n_f32(mean);
float32x4_t _norm = vdupq_n_f32(norm);
for (; nn>0; nn--)
{
float32x4_t _ptr = vld1q_f32(ptr);
_ptr = vsubq_f32(_ptr, _mean);
_ptr = vmulq_f32(_ptr, _norm);
vst1q_f32(ptr, _ptr);
ptr += 4;
if (nn > 0)
{
asm volatile(
"dup v1.4s, %w4 \n"
"dup v2.4s, %w5 \n"
"0: \n"
"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.4s}, [%1] \n"
"fsub v0.4s, v0.4s, v1.4s \n"
"fmul v0.4s, v0.4s, v2.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v0.4s}, [%1], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"r"(mean), // %4
"r"(norm) // %5
: "cc", "memory", "v0", "v1", "v2"
);
}
#else
if (nn > 0)


+ 13
- 3
src/mat.h View File

@@ -343,10 +343,20 @@ inline void Mat::fill(float _v)
#if __ARM_NEON
float32x4_t _c = vdupq_n_f32(_v);
#if __aarch64__
for (; nn>0; nn--)
if (nn > 0)
{
vst1q_f32(ptr, _c);
ptr += 4;
asm volatile (
"0: \n"
"subs %w0, %w0, #1 \n"
"st1 {%4.4s}, [%1], #16 \n"
"bne 0b \n"
: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"w"(_c) // %4
: "cc", "memory"
);
}
#else
if (nn > 0)


Loading…
Cancel
Save