diff --git a/src/layer/x86/convolution_3x3.h b/src/layer/x86/convolution_3x3.h index 8387ba710..aeafae037 100644 --- a/src/layer/x86/convolution_3x3.h +++ b/src/layer/x86/convolution_3x3.h @@ -1224,6 +1224,7 @@ static void conv3x3s1_winograd43_sse(const Mat& bottom_blob, Mat& top_blob, cons { const float* kptr = kernel_tm_test[r].channel(p/8); const float* r0 = bottom_blob_tm.channel(tiles*r+i); +#if __AVX__ || __SSE__ #if __AVX__ float zero_val = 0.f; __m128 _sum0 = _mm_broadcast_ss(&zero_val); @@ -1234,9 +1235,17 @@ static void conv3x3s1_winograd43_sse(const Mat& bottom_blob, Mat& top_blob, cons __m128 _sum5 = _mm_broadcast_ss(&zero_val); __m128 _sum6 = _mm_broadcast_ss(&zero_val); __m128 _sum7 = _mm_broadcast_ss(&zero_val); - +#else + __m128 _sum0 = _mm_set1_ps(0.f); + __m128 _sum1 = _mm_set1_ps(0.f); + __m128 _sum2 = _mm_set1_ps(0.f); + __m128 _sum3 = _mm_set1_ps(0.f); + __m128 _sum4 = _mm_set1_ps(0.f); + __m128 _sum5 = _mm_set1_ps(0.f); + __m128 _sum6 = _mm_set1_ps(0.f); + __m128 _sum7 = _mm_set1_ps(0.f); +#endif int q=0; - for (; q+3> 2; + remain_outch_start = nn_outch << 2; + + for (int pp=0; pp> 2; + int remain_size_start = nn_size << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii=0; ii> 2; + remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp +#endif +#if __AVX__ #include #endif