|
|
|
@@ -22,12 +22,53 @@ namespace ncnn { |
|
|
|
|
|
|
|
DEFINE_LAYER_CREATOR(HardSwish_arm) |
|
|
|
|
|
|
|
HardSwish_arm::HardSwish_arm() |
|
|
|
{ |
|
|
|
#if __ARM_NEON |
|
|
|
support_packing = true; |
|
|
|
#endif // __ARM_NEON |
|
|
|
} |
|
|
|
|
|
|
|
int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const |
|
|
|
{ |
|
|
|
int w = bottom_top_blob.w; |
|
|
|
int h = bottom_top_blob.h; |
|
|
|
int channels = bottom_top_blob.c; |
|
|
|
int size = w * h; |
|
|
|
int elempack = bottom_top_blob.elempack; |
|
|
|
|
|
|
|
#if __ARM_NEON |
|
|
|
if (opt.use_packing_layout) |
|
|
|
{ |
|
|
|
|
|
|
|
if (elempack == 4) |
|
|
|
{ |
|
|
|
#pragma omp parallel for num_threads(opt.num_threads) |
|
|
|
for (int q=0; q<channels; q++) |
|
|
|
{ |
|
|
|
float* ptr = bottom_top_blob.channel(q); |
|
|
|
|
|
|
|
float32x4_t _zero = vdupq_n_f32(0.f); |
|
|
|
float32x4_t _one = vdupq_n_f32(1.f); |
|
|
|
for (int i=0; i<size; i++) |
|
|
|
{ |
|
|
|
float32x4_t _p = vld1q_f32(ptr); |
|
|
|
float32x4_t _ans = vdupq_n_f32(beta); |
|
|
|
_ans = vmlaq_n_f32(_ans, _p, alpha); |
|
|
|
_ans = vmaxq_f32(_ans, _zero); |
|
|
|
_ans = vminq_f32(_ans, _one); |
|
|
|
_ans = vmulq_f32(_ans, _p); |
|
|
|
vst1q_f32(ptr, _ans); |
|
|
|
|
|
|
|
ptr += 4; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
} // opt.use_packing_layout |
|
|
|
#endif // __ARM_NEON |
|
|
|
|
|
|
|
#pragma omp parallel for num_threads(opt.num_threads) |
|
|
|
for (int q=0; q<channels; q++) |
|
|
|
|