Browse Source

fix clip arm

tags/20181228
nihuini 7 years ago
parent
commit
4af3ec223b
1 changed files with 43 additions and 52 deletions
  1. +43
    -52
      src/layer/arm/clip_arm.cpp

+ 43
- 52
src/layer/arm/clip_arm.cpp View File

@@ -29,77 +29,68 @@ int Clip_arm::forward_inplace(Mat &bottom_top_blob, const Option &opt) const
int channels = bottom_top_blob.c;
int size = w * h;

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remian = size;
#endif
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
float32x4_t maxf32 = vmovq_n_f32(max);
float32x4_t minf32 = vmovq_n_f32(min);
int nn = size >> 2;
int remain = size & 3;
#else
int remian = size;
#endif

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < channels; ++i)
{
float *channel_ptr = bottom_top_blob.channel(i);
#if __ARM_NEON
float32x4_t _max = vdupq_n_f32(max);
float32x4_t _min = vdupq_n_f32(min);
#if __aarch64__
for (; nn > 0; --nn)
for (; nn>0; nn--)
{
float32x4_t clip_f32 = vld1q_f32(channel_ptr);
float32x4_t clip_min_f32 = vmaxq_f32(minf32, clip_f32);
float32x4_t clip_max_f32 = vminq_f32(maxf32, clip_min_f32);
vst1q_f32(channel_ptr, clip_max_f32);
channel_ptr += 4;
float32x4_t _ptr = vld1q_f32(ptr);
_ptr = vmaxq_f32(_ptr, _min);
_ptr = vminq_f32(_ptr, _max);
vst1q_f32(ptr, _ptr);
ptr += 4;
}
#else
if (nn > 0)
{
asm volatile(
"0:"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1:128] \n"

"vmax.f32 q1, %q4, q0 \n"
"vmin.f32 q2, %q5, q1 \n"

"subs %0, #1 \n"
"vst1.f32 {d4-d5}, [%1:128]! \n"

"bne 0b \n"

:"=r"(nn), //%0
"=r"(channel_ptr) //%1
:"0"(nn),
"1"(channel_ptr),
"w"(minf32), //%q4
"w"(maxf32) //%q5
:"cc", "memory", "q0", "q1", "q2"
);

asm volatile(
"0: \n"
"pld [%1, #128] \n"
"vld1.f32 {d0-d1}, [%1: 128] \n"

"vmax.f32 q0, q0, %q4 \n"
"vmin.f32 q0, q0, %q5 \n"

"subs %0, #1 \n"
"vst1.f32 {d0-d1}, [%1: 128]! \n"

"bne 0b \n"

: "=r"(nn), // %0
"=r"(ptr) // %1
: "0"(nn),
"1"(ptr),
"w"(_min), // %q4
"w"(_max) // %q5
: "cc", "memory", "q0"
);
}
#endif // __aarch64__
#endif // __ARM_NEON

for (; remain > 0; --remain)
for (; remain>0; remain--)
{
if (*ptr < min)
*ptr = min;

if (*channel_ptr < min)
{
*channel_ptr = min;
}

if (*channel_ptr > max)
{
*channel_ptr = max;
}
if (*ptr > max)
*ptr = max;

++channel_ptr;
ptr++;
}

}

return 0;


Loading…
Cancel
Save