Browse Source

fix winograd neon2 for aarch64

tags/20171225
nihui 8 years ago
parent
commit
f2f7ecd2ec
1 changed file with 12 additions and 15 deletions
  1. +12
    -15
      src/layer/arm/convolution_3x3.h

+ 12
- 15
src/layer/arm/convolution_3x3.h View File

@@ -1571,10 +1571,10 @@ static void conv3x3s1_winograd64_neon2(const Mat& bottom_blob, Mat& top_blob, co
r0 += 8;
r1 += 8;

-_output0_tm = vmlaq_f32(_output0_tm, _r0, _k0nn);
-_output0_tmn = vmlaq_f32(_output0_tmn, _r0n, _k0nnn);
-_output0_tm = vmlaq_f32(_output0_tm, _r1, _k1nn);
-_output0_tmn = vmlaq_f32(_output0_tmn, _r1n, _k1nnn);
+_output0_tm = vmlaq_f32(_output0_tm, _r0, _k0);
+_output0_tmn = vmlaq_f32(_output0_tmn, _r0n, _k0n);
+_output0_tm = vmlaq_f32(_output0_tm, _r1, _k1);
+_output0_tmn = vmlaq_f32(_output0_tmn, _r1n, _k1n);

vst1q_f32(output0_tm, _output0_tm);
vst1q_f32(output0_tm+4, _output0_tmn);
@@ -1613,10 +1613,10 @@ static void conv3x3s1_winograd64_neon2(const Mat& bottom_blob, Mat& top_blob, co
r0 += 8;
r1 += 8;

-_output0_tm = vmlaq_f32(_output0_tm, _r0, _k0nn);
-_output0_tmn = vmlaq_f32(_output0_tmn, _r0n, _k0nnn);
-_output0_tm = vmlaq_f32(_output0_tm, _r1, _k1nn);
-_output0_tmn = vmlaq_f32(_output0_tmn, _r1n, _k1nnn);
+_output0_tm = vmlaq_f32(_output0_tm, _r0, _k0);
+_output0_tmn = vmlaq_f32(_output0_tmn, _r0n, _k0n);
+_output0_tm = vmlaq_f32(_output0_tm, _r1, _k1);
+_output0_tmn = vmlaq_f32(_output0_tmn, _r1n, _k1n);

vst1q_f32(output0_tm, _output0_tm);
vst1q_f32(output0_tm+4, _output0_tmn);
@@ -1655,10 +1655,10 @@ static void conv3x3s1_winograd64_neon2(const Mat& bottom_blob, Mat& top_blob, co
r0 += 8;
r1 += 8;

-_output0_tm = vmlaq_f32(_output0_tm, _r0, _k0nn);
-_output0_tmn = vmlaq_f32(_output0_tmn, _r0n, _k0nnn);
-_output0_tm = vmlaq_f32(_output0_tm, _r1, _k1nn);
-_output0_tmn = vmlaq_f32(_output0_tmn, _r1n, _k1nnn);
+_output0_tm = vmlaq_f32(_output0_tm, _r0, _k0);
+_output0_tmn = vmlaq_f32(_output0_tmn, _r0n, _k0n);
+_output0_tm = vmlaq_f32(_output0_tm, _r1, _k1);
+_output0_tmn = vmlaq_f32(_output0_tmn, _r1n, _k1n);

vst1q_f32(output0_tm, _output0_tm);
vst1q_f32(output0_tm+4, _output0_tmn);
@@ -1689,7 +1689,6 @@ static void conv3x3s1_winograd64_neon2(const Mat& bottom_blob, Mat& top_blob, co
#else
if (nn > 0)
{
-#if 1
asm volatile(
"mov r4, %1 \n"

@@ -1868,8 +1867,6 @@ static void conv3x3s1_winograd64_neon2(const Mat& bottom_blob, Mat& top_blob, co
: "cc", "memory", "r4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
-#endif
-
#endif // __aarch64__
#endif // __ARM_NEON
for (; remain>0; remain--)


Loading…
Cancel
Save