hardsigmoid and hardswish pack4

6 years ago · 3ac6335ba3
--- a/src/layer/arm/hardsigmoid_arm.cpp
+++ b/src/layer/arm/hardsigmoid_arm.cpp
@@ -22,12 +22,52 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(HardSigmoid_arm)

 // Constructs the ARM variant of the HardSigmoid layer.
 // When the translation unit is built with __ARM_NEON, support_packing is
 // set to true; otherwise the member is left untouched by this constructor.
 // NOTE(review): support_packing is declared outside this file; presumably it
 // tells the framework that this layer accepts packed (elempack > 1) blobs,
 // confirm against the base layer definition.
 HardSigmoid_arm::HardSigmoid_arm()
 {
 #if __ARM_NEON
    support_packing = true;
 #endif // __ARM_NEON
 }

 int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

 #if __ARM_NEON
    if (opt.use_packing_layout)
    {

    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _one = vdupq_n_f32(1.f);
            for (int i=0; i<size; i++)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ans = vdupq_n_f32(beta);
                _ans = vmlaq_n_f32(_ans, _p, alpha);
                _ans = vmaxq_f32(_ans, _zero);
                _ans = vminq_f32(_ans, _one);
                vst1q_f32(ptr, _ans);

                ptr += 4;
            }
        }

        return 0;
    }

    } // opt.use_packing_layout
 #endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
--- a/src/layer/arm/hardsigmoid_arm.h
+++ b/src/layer/arm/hardsigmoid_arm.h
@@ -22,6 +22,8 @@ namespace ncnn {
 // ARM-specific specialization of the HardSigmoid layer.
 // Virtually inherits from HardSigmoid and overrides the in-place forward pass.
 class HardSigmoid_arm : virtual public HardSigmoid
 {
 public:
    // Enables packed-layout support when compiled with __ARM_NEON.
    HardSigmoid_arm();

    // Applies the activation in place on bottom_top_blob.
    // With NEON, opt.use_packing_layout set and elempack == 4, each group of
    // four floats is replaced by min(max(alpha * x + beta, 0), 1) and the
    // function returns 0. Other layouts take a fallback path whose body is
    // not fully visible in this excerpt.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };

--- a/src/layer/arm/hardswish_arm.cpp
+++ b/src/layer/arm/hardswish_arm.cpp
@@ -22,12 +22,53 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(HardSwish_arm)

 // Constructs the ARM variant of the HardSwish layer.
 // When the translation unit is built with __ARM_NEON, support_packing is
 // set to true; otherwise the member is left untouched by this constructor.
 // NOTE(review): support_packing is declared outside this file; presumably it
 // tells the framework that this layer accepts packed (elempack > 1) blobs,
 // confirm against the base layer definition.
 HardSwish_arm::HardSwish_arm()
 {
 #if __ARM_NEON
    support_packing = true;
 #endif // __ARM_NEON
 }

 int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

 #if __ARM_NEON
    if (opt.use_packing_layout)
    {

    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _one = vdupq_n_f32(1.f);
            for (int i=0; i<size; i++)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ans = vdupq_n_f32(beta);
                _ans = vmlaq_n_f32(_ans, _p, alpha);
                _ans = vmaxq_f32(_ans, _zero);
                _ans = vminq_f32(_ans, _one);
                _ans = vmulq_f32(_ans, _p);
                vst1q_f32(ptr, _ans);

                ptr += 4;
            }
        }

        return 0;
    }

    } // opt.use_packing_layout
 #endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
--- a/src/layer/arm/hardswish_arm.h
+++ b/src/layer/arm/hardswish_arm.h
@@ -22,6 +22,8 @@ namespace ncnn {
 // ARM-specific specialization of the HardSwish layer.
 // Virtually inherits from HardSwish and overrides the in-place forward pass.
 class HardSwish_arm : virtual public HardSwish
 {
 public:
    // Enables packed-layout support when compiled with __ARM_NEON.
    HardSwish_arm();

    // Applies the activation in place on bottom_top_blob.
    // With NEON, opt.use_packing_layout set and elempack == 4, each group of
    // four floats is replaced by x * min(max(alpha * x + beta, 0), 1) and the
    // function returns 0. Other layouts take a fallback path whose body is
    // not fully visible in this excerpt.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
 };