Browse Source

arm neon assembly optimization for relu pack4

tags/20191113
nihui 6 years ago
parent
commit
08a97c169f
1 changed files with 133 additions and 9 deletions
  1. +133
    -9
      src/layer/arm/relu_arm.cpp

+ 133
- 9
src/layer/arm/relu_arm.cpp View File

@@ -134,15 +134,139 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
float* ptr = bottom_top_blob.channel(q);

float32x4_t _zero = vdupq_n_f32(0.f);
for (int i=0; i<size; i++)
{
float32x4_t _p = vld1q_f32(ptr);
_p = vmaxq_f32(_p, _zero);
vst1q_f32(ptr, _p);

ptr += 4;
}
#if __aarch64__
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"

"lsr w4, %w2, #3 \n"// w4 = nn = size >> 3
"cmp w4, #0 \n"
"beq 1f \n"

"0: \n"
"prfm pldl1keep, [%0, #512] \n"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
"prfm pldl1keep, [%0, #512] \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
"fmax v0.4s, v0.4s, v16.4s \n"
"fmax v1.4s, v1.4s, v16.4s \n"
"fmax v2.4s, v2.4s, v16.4s \n"
"fmax v3.4s, v3.4s, v16.4s \n"
"sub %0, %0, #64 \n"
"fmax v4.4s, v4.4s, v16.4s \n"
"fmax v5.4s, v5.4s, v16.4s \n"
"fmax v6.4s, v6.4s, v16.4s \n"
"fmax v7.4s, v7.4s, v16.4s \n"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
"subs w4, w4, #1 \n"
"st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
"bne 0b \n"

"1: \n"

"and w4, %w2, #7 \n"// w4 = remain = size & 7

"cmp w4, #4 \n"// w4 >= 4
"blt 2f \n"
"prfm pldl1keep, [%0, #512] \n"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
"fmax v0.4s, v0.4s, v16.4s \n"
"fmax v1.4s, v1.4s, v16.4s \n"
"fmax v2.4s, v2.4s, v16.4s \n"
"fmax v3.4s, v3.4s, v16.4s \n"
"sub w4, w4, #4 \n"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
"2: \n"

"cmp w4, #2 \n"// w4 >= 2
"blt 3f \n"
"prfm pldl1keep, [%0, #256] \n"
"ld1 {v0.4s, v1.4s}, [%0] \n"
"fmax v0.4s, v0.4s, v16.4s \n"
"fmax v1.4s, v1.4s, v16.4s \n"
"sub w4, w4, #2 \n"
"st1 {v0.4s, v1.4s}, [%0], #32 \n"
"3: \n"

"cmp w4, #0 \n"// w4 > 0
"beq 4f \n"
"prfm pldl1keep, [%0, #128] \n"
"ld1 {v0.4s}, [%0] \n"
"fmax v0.4s, v0.4s, v16.4s \n"
"st1 {v0.4s}, [%0], #16 \n"
"4: \n"

: "=r"(ptr) // %0
: "0"(ptr),
"r"(size) // %2
: "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
#else // __aarch64__
asm volatile(
"veor q12, q12 \n"

"lsr r4, %2, #3 \n"// r4 = nn = size >> 3
"cmp r4, #0 \n"
"beq 1f \n"

"0: \n"
"pld [%0, #512] \n"
"vldm %0!, {d0-d7} \n"
"pld [%0, #512] \n"
"vldm %0, {d16-d23} \n"
"vmax.f32 q0, q0, q12 \n"
"vmax.f32 q1, q1, q12 \n"
"vmax.f32 q2, q2, q12 \n"
"vmax.f32 q3, q3, q12 \n"
"sub %0, %0, #64 \n"
"vmax.f32 q8, q8, q12 \n"
"vmax.f32 q9, q9, q12 \n"
"vmax.f32 q10, q10, q12 \n"
"vmax.f32 q11, q11, q12 \n"
"vstm %0!, {d0-d7} \n"
"subs r4, r4, #1 \n"
"vstm %0!, {d16-d23} \n"
"bne 0b \n"

"1: \n"

"and r4, %2, #7 \n"// r4 = remain = size & 7

"cmp r4, #4 \n"// r4 >= 4
"blt 2f \n"
"pld [%0, #512] \n"
"vldm %0, {d0-d7} \n"
"vmax.f32 q0, q0, q12 \n"
"vmax.f32 q1, q1, q12 \n"
"vmax.f32 q2, q2, q12 \n"
"vmax.f32 q3, q3, q12 \n"
"sub r4, r4, #4 \n"
"vstm %0!, {d0-d7} \n"
"2: \n"

"cmp r4, #2 \n"// r4 >= 2
"blt 3f \n"
"pld [%0, #256] \n"
"vld1.f32 {d0-d3}, [%0 :128] \n"
"vmax.f32 q0, q0, q12 \n"
"vmax.f32 q1, q1, q12 \n"
"sub r4, r4, #2 \n"
"vst1.f32 {d0-d3}, [%0 :128]! \n"
"3: \n"

"cmp r4, #0 \n"// r4 > 0
"beq 4f \n"
"pld [%0, #128] \n"
"vld1.f32 {d0-d1}, [%0 :128] \n"
"vmax.f32 q0, q0, q12 \n"
"vst1.f32 {d0-d1}, [%0 :128]! \n"
"4: \n"

: "=r"(ptr) // %0
: "0"(ptr),
"r"(size) // %2
: "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12"
);
#endif // __aarch64__
}
}
else


Loading…
Cancel
Save