Browse Source

hardsigmoid and hardswish pack4

tags/20191113
nihui 6 years ago
parent
commit
3ac6335ba3
4 changed files with 85 additions and 0 deletions
  1. +40
    -0
      src/layer/arm/hardsigmoid_arm.cpp
  2. +2
    -0
      src/layer/arm/hardsigmoid_arm.h
  3. +41
    -0
      src/layer/arm/hardswish_arm.cpp
  4. +2
    -0
      src/layer/arm/hardswish_arm.h

+ 40
- 0
src/layer/arm/hardsigmoid_arm.cpp View File

@@ -22,12 +22,52 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(HardSigmoid_arm)

// Default constructor for the ARM HardSigmoid layer.
//
// On builds where __ARM_NEON evaluates to true, support_packing is set to
// true; forward_inplace in this file has a NEON path for elempack == 4.
// On other builds the flag is left at whatever value it already holds.
HardSigmoid_arm::HardSigmoid_arm()
{
#if __ARM_NEON
    this->support_packing = true;
#endif // __ARM_NEON
}

int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _one = vdupq_n_f32(1.f);
for (int i=0; i<size; i++)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _ans = vdupq_n_f32(beta);
_ans = vmlaq_n_f32(_ans, _p, alpha);
_ans = vmaxq_f32(_ans, _zero);
_ans = vminq_f32(_ans, _one);
vst1q_f32(ptr, _ans);

ptr += 4;
}
}

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)


+ 2
- 0
src/layer/arm/hardsigmoid_arm.h View File

@@ -22,6 +22,8 @@ namespace ncnn {
// ARM-specific variant of the HardSigmoid layer.
// Derives virtually from HardSigmoid and declares its own constructor plus
// an in-place forward pass; both are defined in hardsigmoid_arm.cpp.
class HardSigmoid_arm : virtual public HardSigmoid
{
public:
// Defined in hardsigmoid_arm.cpp; sets support_packing when __ARM_NEON is on.
HardSigmoid_arm();

// Applies the activation directly to bottom_top_blob (the blob is both input
// and output). opt supplies runtime options such as num_threads and
// use_packing_layout. The visible NEON path returns 0 on completion.
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};



+ 41
- 0
src/layer/arm/hardswish_arm.cpp View File

@@ -22,12 +22,53 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(HardSwish_arm)

// Default constructor for the ARM HardSwish layer.
//
// On builds where __ARM_NEON evaluates to true, support_packing is set to
// true; forward_inplace in this file has a NEON path for elempack == 4.
// On other builds the flag is left at whatever value it already holds.
HardSwish_arm::HardSwish_arm()
{
#if __ARM_NEON
    this->support_packing = true;
#endif // __ARM_NEON
}

int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _one = vdupq_n_f32(1.f);
for (int i=0; i<size; i++)
{
float32x4_t _p = vld1q_f32(ptr);
float32x4_t _ans = vdupq_n_f32(beta);
_ans = vmlaq_n_f32(_ans, _p, alpha);
_ans = vmaxq_f32(_ans, _zero);
_ans = vminq_f32(_ans, _one);
_ans = vmulq_f32(_ans, _p);
vst1q_f32(ptr, _ans);

ptr += 4;
}
}

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)


+ 2
- 0
src/layer/arm/hardswish_arm.h View File

@@ -22,6 +22,8 @@ namespace ncnn {
// ARM-specific variant of the HardSwish layer.
// Derives virtually from HardSwish and declares its own constructor plus
// an in-place forward pass; both are defined in hardswish_arm.cpp.
class HardSwish_arm : virtual public HardSwish
{
public:
// Defined in hardswish_arm.cpp; sets support_packing when __ARM_NEON is on.
HardSwish_arm();

// Applies the activation directly to bottom_top_blob (the blob is both input
// and output). opt supplies runtime options such as num_threads and
// use_packing_layout. The visible NEON path returns 0 on completion.
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};



Loading…
Cancel
Save