From e19b7097df953952f5bdad67d7d39cfec11983cc Mon Sep 17 00:00:00 2001 From: nihui Date: Sun, 6 Oct 2019 17:19:37 +0800 Subject: [PATCH] arm neon assembly optimization for conv3x3s1 pack1to4 --- src/layer/arm/convolution_3x3_pack1to4.h | 539 +++++++++++++++++++++++ src/layer/arm/convolution_arm.cpp | 12 + 2 files changed, 551 insertions(+) diff --git a/src/layer/arm/convolution_3x3_pack1to4.h b/src/layer/arm/convolution_3x3_pack1to4.h index 333b66abd..f515cc1d3 100644 --- a/src/layer/arm/convolution_3x3_pack1to4.h +++ b/src/layer/arm/convolution_3x3_pack1to4.h @@ -12,6 +12,545 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. +static void conv3x3s1_pack1to4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pforward_inplace(top_blob, opt); + } + + return 0; + } + if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1) { conv3x3s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_pack1to4, bias_data, opt);