From 08a97c169f91bb47be9af47519ec841dcd5c8baf Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 4 Oct 2019 12:41:48 +0800 Subject: [PATCH] arm neon assembly optimization for relu pack4 --- src/layer/arm/relu_arm.cpp | 142 ++++++++++++++++++++++++++++++++++--- 1 file changed, 133 insertions(+), 9 deletions(-) diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp index 1bb391397..4fbbec2aa 100644 --- a/src/layer/arm/relu_arm.cpp +++ b/src/layer/arm/relu_arm.cpp @@ -134,15 +134,139 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float* ptr = bottom_top_blob.channel(q); - float32x4_t _zero = vdupq_n_f32(0.f); - for (int i=0; i> 3 + "cmp w4, #0 \n" + "beq 1f \n" + + "0: \n" + "prfm pldl1keep, [%0, #512] \n" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" + "prfm pldl1keep, [%0, #512] \n" + "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n" + "fmax v0.4s, v0.4s, v16.4s \n" + "fmax v1.4s, v1.4s, v16.4s \n" + "fmax v2.4s, v2.4s, v16.4s \n" + "fmax v3.4s, v3.4s, v16.4s \n" + "sub %0, %0, #64 \n" + "fmax v4.4s, v4.4s, v16.4s \n" + "fmax v5.4s, v5.4s, v16.4s \n" + "fmax v6.4s, v6.4s, v16.4s \n" + "fmax v7.4s, v7.4s, v16.4s \n" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" + "subs w4, w4, #1 \n" + "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n" + "bne 0b \n" + + "1: \n" + + "and w4, %w2, #7 \n"// w4 = remain = size & 7 + + "cmp w4, #4 \n"// w4 >= 4 + "blt 2f \n" + "prfm pldl1keep, [%0, #512] \n" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n" + "fmax v0.4s, v0.4s, v16.4s \n" + "fmax v1.4s, v1.4s, v16.4s \n" + "fmax v2.4s, v2.4s, v16.4s \n" + "fmax v3.4s, v3.4s, v16.4s \n" + "sub w4, w4, #4 \n" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" + "2: \n" + + "cmp w4, #2 \n"// w4 >= 2 + "blt 3f \n" + "prfm pldl1keep, [%0, #256] \n" + "ld1 {v0.4s, v1.4s}, [%0] \n" + "fmax v0.4s, v0.4s, v16.4s \n" + "fmax v1.4s, v1.4s, v16.4s \n" + "sub w4, w4, #2 \n" + "st1 {v0.4s, v1.4s}, [%0], #32 \n" + "3: \n" + + "cmp w4, #0 \n"// w4 > 0 + "beq 4f \n" + "prfm pldl1keep, [%0, #128] \n" + "ld1 {v0.4s}, [%0] \n" + "fmax v0.4s, v0.4s, v16.4s \n" + "st1 {v0.4s}, [%0], #16 \n" + "4: \n" + + : "=r"(ptr) // %0 + : "0"(ptr), + "r"(size) // %2 + : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +#else // __aarch64__ + asm volatile( + "veor q12, q12 \n" + + "lsr r4, %2, #3 \n"// r4 = nn = size >> 3 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n" + "pld [%0, #512] \n" + "vldm %0!, {d0-d7} \n" + "pld [%0, #512] \n" + "vldm %0, {d16-d23} \n" + "vmax.f32 q0, q0, q12 \n" + "vmax.f32 q1, q1, q12 \n" + "vmax.f32 q2, q2, q12 \n" + "vmax.f32 q3, q3, q12 \n" + "sub %0, %0, #64 \n" + "vmax.f32 q8, q8, q12 \n" + "vmax.f32 q9, q9, q12 \n" + "vmax.f32 q10, q10, q12 \n" + "vmax.f32 q11, q11, q12 \n" + "vstm %0!, {d0-d7} \n" + "subs r4, r4, #1 \n" + "vstm %0!, {d16-d23} \n" + "bne 0b \n" + + "1: \n" + + "and r4, %2, #7 \n"// r4 = remain = size & 7 + + "cmp r4, #4 \n"// r4 >= 4 + "blt 2f \n" + "pld [%0, #512] \n" + "vldm %0, {d0-d7} \n" + "vmax.f32 q0, q0, q12 \n" + "vmax.f32 q1, q1, q12 \n" + "vmax.f32 q2, q2, q12 \n" + "vmax.f32 q3, q3, q12 \n" + "sub r4, r4, #4 \n" + "vstm %0!, {d0-d7} \n" + "2: \n" + + "cmp r4, #2 \n"// r4 >= 2 + "blt 3f \n" + "pld [%0, #256] \n" + "vld1.f32 {d0-d3}, [%0 :128] \n" + "vmax.f32 q0, q0, q12 \n" + "vmax.f32 q1, q1, q12 \n" + "sub r4, r4, #2 \n" + "vst1.f32 {d0-d3}, [%0 :128]! \n" + "3: \n" + + "cmp r4, #0 \n"// r4 > 0 + "beq 4f \n" + "pld [%0, #128] \n" + "vld1.f32 {d0-d1}, [%0 :128] \n" + "vmax.f32 q0, q0, q12 \n" + "vst1.f32 {d0-d1}, [%0 :128]! \n" + "4: \n" + + : "=r"(ptr) // %0 + : "0"(ptr), + "r"(size) // %2 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12" + ); +#endif // __aarch64__ } } else