diff --git a/src/layer/arm/convolution_3x3.h b/src/layer/arm/convolution_3x3.h index afd938e6e..95f5b8fbd 100644 --- a/src/layer/arm/convolution_3x3.h +++ b/src/layer/arm/convolution_3x3.h @@ -683,6 +683,7 @@ static void conv3x3s1_winograd64_neon(const Mat& bottom_blob, Mat& top_blob, con } } + bottom_blob_bordered = Mat(); // END transform input // BEGIN dot @@ -692,8 +693,528 @@ static void conv3x3s1_winograd64_neon(const Mat& bottom_blob, Mat& top_blob, con int h_tm = outh / 6 * 8; top_blob_tm.create(8*8, w_tm/8 * h_tm/8, outch); + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + #pragma omp parallel for - for (int p = 0; p