Browse Source

drop armv7 vaddvq_f32 hack

tags/20180704
nihuini 8 years ago
parent
commit
2b20bf940c
1 changed file with 16 additions and 10 deletions
  1. +16
    -10
      src/layer/arm/convolution_3x3.h

+ 16
- 10
src/layer/arm/convolution_3x3.h View File

@@ -8325,16 +8325,6 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w);
}

#if !__aarch64__
// TODO drop it
// Fallback for builds where __aarch64__ is not set: returns the sum of the
// four float lanes of _v under the same name as the vaddvq_f32 intrinsic
// that the __aarch64__ code path calls.
static inline float vaddvq_f32(float32x4_t _v)
{
    // fold the upper half onto the lower half: (v0 + v2, v1 + v3)
    float32x2_t _halves = vadd_f32(vget_low_f32(_v), vget_high_f32(_v));
    // pairwise add leaves the full four-lane sum in lane 0
    return vget_lane_f32(vpadd_f32(_halves, _halves), 0);
}
#endif // !__aarch64__

static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias)
{
int w = bottom_blob.w;
@@ -10268,10 +10258,21 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
}

// TODO transpose and acc
#if __aarch64__
float sum0 = vaddvq_f32(_sum0);
float sum1 = vaddvq_f32(_sum1);
float sum2 = vaddvq_f32(_sum2);
float sum3 = vaddvq_f32(_sum3);
#else
float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
float32x2_t _ss3 = vadd_f32(vget_low_f32(_sum3), vget_high_f32(_sum3));
float sum0 = vget_lane_f32(vpadd_f32(_ss0, _ss0), 0);
float sum1 = vget_lane_f32(vpadd_f32(_ss1, _ss1), 0);
float sum2 = vget_lane_f32(vpadd_f32(_ss2, _ss2), 0);
float sum3 = vget_lane_f32(vpadd_f32(_ss3, _ss3), 0);
#endif // __aarch64__

for (; q<inch; q++)
{
@@ -10610,7 +10611,12 @@ static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, co
_sum0 = vmlaq_f32(_sum0, _bb2p0, _ktm0);
}

#if __aarch64__
float sum0 = vaddvq_f32(_sum0);
#else
float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
float sum0 = vget_lane_f32(vpadd_f32(_ss0, _ss0), 0);
#endif // __aarch64__

for (; q<inch; q++)
{


Loading…
Cancel
Save