| @@ -134,15 +134,139 @@ int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| float* ptr = bottom_top_blob.channel(q); | |||
| float32x4_t _zero = vdupq_n_f32(0.f); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| float32x4_t _p = vld1q_f32(ptr); | |||
| _p = vmaxq_f32(_p, _zero); | |||
| vst1q_f32(ptr, _p); | |||
| ptr += 4; | |||
| } | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "eor v16.16b, v16.16b, v16.16b \n" | |||
| "lsr w4, %w2, #3 \n"// w4 = nn = size >> 3 | |||
| "cmp w4, #0 \n" | |||
| "beq 1f \n" | |||
| "0: \n" | |||
| "prfm pldl1keep, [%0, #512] \n" | |||
| "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" | |||
| "prfm pldl1keep, [%0, #512] \n" | |||
| "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n" | |||
| "fmax v0.4s, v0.4s, v16.4s \n" | |||
| "fmax v1.4s, v1.4s, v16.4s \n" | |||
| "fmax v2.4s, v2.4s, v16.4s \n" | |||
| "fmax v3.4s, v3.4s, v16.4s \n" | |||
| "sub %0, %0, #64 \n" | |||
| "fmax v4.4s, v4.4s, v16.4s \n" | |||
| "fmax v5.4s, v5.4s, v16.4s \n" | |||
| "fmax v6.4s, v6.4s, v16.4s \n" | |||
| "fmax v7.4s, v7.4s, v16.4s \n" | |||
| "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" | |||
| "subs w4, w4, #1 \n" | |||
| "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n" | |||
| "bne 0b \n" | |||
| "1: \n" | |||
| "and w4, %w2, #7 \n"// w4 = remain = size & 7 | |||
| "cmp w4, #4 \n"// w4 >= 4 | |||
| "blt 2f \n" | |||
| "prfm pldl1keep, [%0, #512] \n" | |||
| "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n" | |||
| "fmax v0.4s, v0.4s, v16.4s \n" | |||
| "fmax v1.4s, v1.4s, v16.4s \n" | |||
| "fmax v2.4s, v2.4s, v16.4s \n" | |||
| "fmax v3.4s, v3.4s, v16.4s \n" | |||
| "sub w4, w4, #4 \n" | |||
| "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" | |||
| "2: \n" | |||
| "cmp w4, #2 \n"// w4 >= 2 | |||
| "blt 3f \n" | |||
| "prfm pldl1keep, [%0, #256] \n" | |||
| "ld1 {v0.4s, v1.4s}, [%0] \n" | |||
| "fmax v0.4s, v0.4s, v16.4s \n" | |||
| "fmax v1.4s, v1.4s, v16.4s \n" | |||
| "sub w4, w4, #2 \n" | |||
| "st1 {v0.4s, v1.4s}, [%0], #32 \n" | |||
| "3: \n" | |||
| "cmp w4, #0 \n"// w4 > 0 | |||
| "beq 4f \n" | |||
| "prfm pldl1keep, [%0, #128] \n" | |||
| "ld1 {v0.4s}, [%0] \n" | |||
| "fmax v0.4s, v0.4s, v16.4s \n" | |||
| "st1 {v0.4s}, [%0], #16 \n" | |||
| "4: \n" | |||
| : "=r"(ptr) // %0 | |||
| : "0"(ptr), | |||
| "r"(size) // %2 | |||
| : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | |||
| ); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "veor q12, q12 \n" | |||
| "lsr r4, %2, #3 \n"// r4 = nn = size >> 3 | |||
| "cmp r4, #0 \n" | |||
| "beq 1f \n" | |||
| "0: \n" | |||
| "pld [%0, #512] \n" | |||
| "vldm %0!, {d0-d7} \n" | |||
| "pld [%0, #512] \n" | |||
| "vldm %0, {d16-d23} \n" | |||
| "vmax.f32 q0, q0, q12 \n" | |||
| "vmax.f32 q1, q1, q12 \n" | |||
| "vmax.f32 q2, q2, q12 \n" | |||
| "vmax.f32 q3, q3, q12 \n" | |||
| "sub %0, %0, #64 \n" | |||
| "vmax.f32 q8, q8, q12 \n" | |||
| "vmax.f32 q9, q9, q12 \n" | |||
| "vmax.f32 q10, q10, q12 \n" | |||
| "vmax.f32 q11, q11, q12 \n" | |||
| "vstm %0!, {d0-d7} \n" | |||
| "subs r4, r4, #1 \n" | |||
| "vstm %0!, {d16-d23} \n" | |||
| "bne 0b \n" | |||
| "1: \n" | |||
| "and r4, %2, #7 \n"// r4 = remain = size & 7 | |||
| "cmp r4, #4 \n"// r4 >= 4 | |||
| "blt 2f \n" | |||
| "pld [%0, #512] \n" | |||
| "vldm %0, {d0-d7} \n" | |||
| "vmax.f32 q0, q0, q12 \n" | |||
| "vmax.f32 q1, q1, q12 \n" | |||
| "vmax.f32 q2, q2, q12 \n" | |||
| "vmax.f32 q3, q3, q12 \n" | |||
| "sub r4, r4, #4 \n" | |||
| "vstm %0!, {d0-d7} \n" | |||
| "2: \n" | |||
| "cmp r4, #2 \n"// r4 >= 2 | |||
| "blt 3f \n" | |||
| "pld [%0, #256] \n" | |||
| "vld1.f32 {d0-d3}, [%0 :128] \n" | |||
| "vmax.f32 q0, q0, q12 \n" | |||
| "vmax.f32 q1, q1, q12 \n" | |||
| "sub r4, r4, #2 \n" | |||
| "vst1.f32 {d0-d3}, [%0 :128]! \n" | |||
| "3: \n" | |||
| "cmp r4, #0 \n"// r4 > 0 | |||
| "beq 4f \n" | |||
| "pld [%0, #128] \n" | |||
| "vld1.f32 {d0-d1}, [%0 :128] \n" | |||
| "vmax.f32 q0, q0, q12 \n" | |||
| "vst1.f32 {d0-d1}, [%0 :128]! \n" | |||
| "4: \n" | |||
| : "=r"(ptr) // %0 | |||
| : "0"(ptr), | |||
| "r"(size) // %2 | |||
| : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12" | |||
| ); | |||
| #endif // __aarch64__ | |||
| } | |||
| } | |||
| else | |||