Browse Source

bind register explicitly, fix #306, fix #310, fix #312

tags/20180427
nihuini 8 years ago
parent
commit
318d3abe66
1 changed files with 72 additions and 42 deletions
  1. +72
    -42
      src/layer/arm/convolution_3x3.h

+ 72
- 42
src/layer/arm/convolution_3x3.h View File

@@ -5462,22 +5462,22 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co

for (int r=0; r<16; r++)
{
float32x4_t _k00;
float32x4_t _k01;
float32x4_t _k02;
float32x4_t _k03;
float32x4_t _k10;
float32x4_t _k11;
float32x4_t _k12;
float32x4_t _k13;
float32x4_t _k20;
float32x4_t _k21;
float32x4_t _k22;
float32x4_t _k23;
float32x4_t _k30;
float32x4_t _k31;
float32x4_t _k32;
float32x4_t _k33;
register float32x4_t _k00 asm("v0");
register float32x4_t _k01 asm("v1");
register float32x4_t _k02 asm("v2");
register float32x4_t _k03 asm("v3");
register float32x4_t _k10 asm("v4");
register float32x4_t _k11 asm("v5");
register float32x4_t _k12 asm("v6");
register float32x4_t _k13 asm("v7");
register float32x4_t _k20 asm("v8");
register float32x4_t _k21 asm("v9");
register float32x4_t _k22 asm("v10");
register float32x4_t _k23 asm("v11");
register float32x4_t _k30 asm("v12");
register float32x4_t _k31 asm("v13");
register float32x4_t _k32 asm("v14");
register float32x4_t _k33 asm("v15");
asm volatile(
"prfm pldl1keep, [%16, #512] \n"
"ld1 {%0.4s, %1.4s, %2.4s, %3.4s}, [%16], #64 \n"
@@ -6155,14 +6155,14 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
{
#if __ARM_NEON
#if __aarch64__
float32x4_t _k00;
float32x4_t _k01;
float32x4_t _k10;
float32x4_t _k11;
float32x4_t _k20;
float32x4_t _k21;
float32x4_t _k30;
float32x4_t _k31;
register float32x4_t _k00 asm("v0");
register float32x4_t _k01 asm("v1");
register float32x4_t _k10 asm("v2");
register float32x4_t _k11 asm("v3");
register float32x4_t _k20 asm("v4");
register float32x4_t _k21 asm("v5");
register float32x4_t _k30 asm("v6");
register float32x4_t _k31 asm("v7");
asm volatile(
"prfm pldl1keep, [%8, #256] \n"
"ld1 {%0.4s, %1.4s}, [%8], #32 \n"
@@ -6184,15 +6184,34 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
: "cc", "memory"
);
#else
asm volatile("pld [%0, #1024] \n" : : "r"(ktm) : );
float32x4_t _k00 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k01 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k10 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k11 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k20 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k21 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k30 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k31 = vld1q_f32(ktm); ktm += 4;
register float32x4_t _k00 asm("q0");
register float32x4_t _k01 asm("q1");
register float32x4_t _k10 asm("q2");
register float32x4_t _k11 asm("q3");
register float32x4_t _k20 asm("q4");
register float32x4_t _k21 asm("q5");
register float32x4_t _k30 asm("q6");
register float32x4_t _k31 asm("q7");
asm volatile(
"pld [%8, #256] \n"
"vld1.f32 {%e0-%f1}, [%8 :128]! \n"
"pld [%8, #256] \n"
"vld1.f32 {%e2-%f3}, [%8 :128]! \n"
"pld [%8, #256] \n"
"vld1.f32 {%e4-%f5}, [%8 :128]! \n"
"pld [%8, #256] \n"
"vld1.f32 {%e6-%f7}, [%8 :128]! \n"
: "=w"(_k00),
"=w"(_k01),
"=w"(_k10),
"=w"(_k11),
"=w"(_k20),
"=w"(_k21),
"=w"(_k30),
"=w"(_k31)
: "r"(ktm)
: "cc", "memory"
);
#endif // __aarch64__
#endif // __ARM_NEON

@@ -6767,10 +6786,10 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
{
#if __ARM_NEON
#if __aarch64__
float32x4_t _k00;
float32x4_t _k10;
float32x4_t _k20;
float32x4_t _k30;
register float32x4_t _k00 asm("v0");
register float32x4_t _k10 asm("v1");
register float32x4_t _k20 asm("v2");
register float32x4_t _k30 asm("v3");
asm volatile(
"prfm pldl1keep, [%4, #256] \n"
"ld1 {%0.4s, %1.4s}, [%4], #32 \n"
@@ -6784,11 +6803,22 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
: "cc", "memory"
);
#else
asm volatile("pld [%0, #512] \n" : : "r"(ktm) : );
float32x4_t _k00 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k10 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k20 = vld1q_f32(ktm); ktm += 4;
float32x4_t _k30 = vld1q_f32(ktm); ktm += 4;
register float32x4_t _k00 asm("q0");
register float32x4_t _k10 asm("q1");
register float32x4_t _k20 asm("q2");
register float32x4_t _k30 asm("q3");
asm volatile(
"pld [%4, #256] \n"
"vld1.f32 {%e0-%f1}, [%4 :128]! \n"
"pld [%4, #256] \n"
"vld1.f32 {%e2-%f3}, [%4 :128]! \n"
: "=w"(_k00),
"=w"(_k10),
"=w"(_k20),
"=w"(_k30)
: "r"(ktm)
: "cc", "memory"
);
#endif // __aarch64__
#endif // __ARM_NEON



Loading…
Cancel
Save