From 4af3ec223b4b477d72f3d6cf5402c985409ca3ee Mon Sep 17 00:00:00 2001 From: nihuini Date: Tue, 16 Oct 2018 11:29:08 +0800 Subject: [PATCH] fix clip arm --- src/layer/arm/clip_arm.cpp | 95 +++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 52 deletions(-) diff --git a/src/layer/arm/clip_arm.cpp b/src/layer/arm/clip_arm.cpp index f2aa2c45a..e42379ba7 100644 --- a/src/layer/arm/clip_arm.cpp +++ b/src/layer/arm/clip_arm.cpp @@ -29,77 +29,68 @@ int Clip_arm::forward_inplace(Mat &bottom_top_blob, const Option &opt) const int channels = bottom_top_blob.c; int size = w * h; -#if __ARM_NEON - int nn = size >> 2; - int remain = size - (nn << 2); -#else - int remian = size; -#endif + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 2; + int remain = size & 3; +#else + int remian = size; #endif - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < channels; ++i) - { - float *channel_ptr = bottom_top_blob.channel(i); #if __ARM_NEON + float32x4_t _max = vdupq_n_f32(max); + float32x4_t _min = vdupq_n_f32(min); #if __aarch64__ - for (; nn > 0; --nn) + for (; nn>0; nn--) { - float32x4_t clip_f32 = vld1q_f32(channel_ptr); - float32x4_t clip_min_f32 = vmaxq_f32(minf32, clip_f32); - float32x4_t clip_max_f32 = vminq_f32(maxf32, clip_min_f32); - vst1q_f32(channel_ptr, clip_max_f32); - channel_ptr += 4; + float32x4_t _ptr = vld1q_f32(ptr); + _ptr = vmaxq_f32(_ptr, _min); + _ptr = vminq_f32(_ptr, _max); + vst1q_f32(ptr, _ptr); + ptr += 4; } #else if (nn > 0) { - asm volatile( - "0:" - "pld [%1, #128] \n" - "vld1.f32 {d0-d1}, [%1:128] \n" - - "vmax.f32 q1, %q4, q0 \n" - "vmin.f32 q2, %q5, q1 \n" - - "subs %0, #1 \n" - "vst1.f32 {d4-d5}, [%1:128]! \n" - - "bne 0b \n" - - :"=r"(nn), //%0 - "=r"(channel_ptr) //%1 - :"0"(nn), - "1"(channel_ptr), - "w"(minf32), //%q4 - "w"(maxf32) //%q5 - :"cc", "memory", "q0", "q1", "q2" - ); - + asm volatile( + "0: \n" + "pld [%1, #128] \n" + "vld1.f32 {d0-d1}, [%1: 128] \n" + + "vmax.f32 q0, q0, %q4 \n" + "vmin.f32 q0, q0, %q5 \n" + + "subs %0, #1 \n" + "vst1.f32 {d0-d1}, [%1: 128]! \n" + + "bne 0b \n" + + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr), + "w"(_min), // %q4 + "w"(_max) // %q5 + : "cc", "memory", "q0" + ); } #endif // __aarch64__ #endif // __ARM_NEON - for (; remain > 0; --remain) + for (; remain>0; remain--) { + if (*ptr < min) + *ptr = min; - if (*channel_ptr < min) - { - *channel_ptr = min; - } - - if (*channel_ptr > max) - { - *channel_ptr = max; - } + if (*ptr > max) + *ptr = max; - ++channel_ptr; + ptr++; } - } return 0;