diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h index 9462f67d7..8fceaedb8 100644 --- a/src/layer/arm/convolution_3x3.h +++ b/src/layer/arm/convolution_3x3.h @@ -5320,6 +5320,15 @@ static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, co // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) +#if __ARM_NEON + const float coeff[8] = { + 0.25f, 0.5f, -1.25f, 2.f, + -2.5f, 4.f, 4.25f, 5.25f + }; + float32x4_t _coeff0 = vld1q_f32(coeff); + float32x4_t _coeff1 = vld1q_f32(coeff+4); +#endif // __ARM_NEON + #pragma omp parallel for for (int q = 0; q