diff --git a/src/layer/arm/convolution_3x3_pack1to4.h b/src/layer/arm/convolution_3x3_pack1to4.h index f515cc1d3..977766d87 100644 --- a/src/layer/arm/convolution_3x3_pack1to4.h +++ b/src/layer/arm/convolution_3x3_pack1to4.h @@ -21,8 +21,359 @@ static void conv3x3s1_pack1to4_neon(const Mat& bottom_blob, Mat& top_blob, const const float* bias = _bias; + int nn_outch = 0; + int remain_outch_start = 0; + +#if __ARM_NEON && __aarch64__ + nn_outch = outch >> 1; + remain_outch_start = nn_outch << 1; + #pragma omp parallel for num_threads(opt.num_threads) - for (int p=0; p