diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 961f81a13..e3017868e 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -29,6 +29,7 @@ namespace ncnn { #include "convolution_4x4.h" #include "convolution_5x5.h" #include "convolution_7x7.h" +#include "convolution_sgemm.h" #include "convolution_sgemm_int8.h" #include "convolution_1x1_int8.h" #include "convolution_3x3_int8.h" @@ -79,7 +80,7 @@ int Convolution_arm::create_pipeline(const Option& opt) if (activation) { Option opt_cpu = opt; - opt_cpu.vulkan_compute = false; + opt_cpu.use_vulkan_compute = false; activation->create_pipeline(opt_cpu); } @@ -155,6 +156,13 @@ int Convolution_arm::create_pipeline(const Option& opt) conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output); } + { + int kernel_size = kernel_w * kernel_h; + int num_input = weight_data_size / kernel_size / num_output; + + conv_im2col_sgemm_transform_kernel_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_size); + } + return 0; } @@ -163,7 +171,7 @@ int Convolution_arm::destroy_pipeline(const Option& opt) if (activation) { Option opt_cpu = opt; - opt_cpu.vulkan_compute = false; + opt_cpu.use_vulkan_compute = false; activation->destroy_pipeline(opt_cpu); delete activation; activation = 0; @@ -581,10 +589,17 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option { conv1x1s1_sgemm_neon(bottom_blob_bordered, top_blob, weight_1x1_sgemm_data, bias_data, opt); } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt); + conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt); } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + if (outw >=8 && outh >=8) + conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt); + else + conv_im2col_sgemm_neon(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt); + } else conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h index 32e52f6ee..91219f08e 100644 --- a/src/layer/arm/convolution_arm.h +++ b/src/layer/arm/convolution_arm.h @@ -43,6 +43,7 @@ public: Mat weight_1x1s1_sgemm_int8_data; Mat weight_3x3_winograd23_data; Mat weight_sgemm_int8_data; + Mat weight_sgemm_data; std::vector weight_3x3_winograd23_int8_data; }; diff --git a/src/layer/arm/convolution_sgemm.h b/src/layer/arm/convolution_sgemm.h new file mode 100644 index 000000000..9f0249be2 --- /dev/null +++ b/src/layer/arm/convolution_sgemm.h @@ -0,0 +1,1532 @@ +// BUG1989 is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2019 BUG1989. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv_im2col_sgemm_transform_kernel_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_size) +{ + + const float* kernel = _kernel; + +#if __ARM_NEON && __aarch64__ + // kernel memory packed 8 x 8 + kernel_tm.create(8*kernel_size, inch, outch/8 + (outch%8)/4 + outch%4); +#else + // kernel memory packed 4 x 8 + kernel_tm.create(4*kernel_size, inch, outch/4 + outch%4); +#endif + + int nn_outch = 0; + int remain_outch_start = 0; + +#if __ARM_NEON && __aarch64__ + nn_outch = outch >> 3; + remain_outch_start = nn_outch << 3; + + for (int pp=0; pp> 2; + + for (int pp=0; pp> 3; + int remain_size_start = nn_size << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii=0; ii> 3; + remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2 + "cmp w4, #0 \n" + "beq 1f \n" + + "0: \n"// for (; k+3> 2 + "cmp w4, #0 \n" + "beq 1f \n" + + "0: \n"// for (; k+3> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2 + "cmp w4, #0 \n" + "beq 1f \n" + + "0: \n"// for (; k+3> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%5, #512] \n" + "vldm %5!, {d0-d7} \n"// kernel + "pld [%4, #512] \n" + "vldm %4!, {d8-d15} \n"// data + + "vmla.f32 q8, q4, d0[0] \n"// sum0 = (a00-a07) * k00 + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n"// sum1 = (a00-a07) * k10 + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n"// sum2 = (a00-a07) * k20 + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n"// sum3 = (a00-a07) * k30 + "vmla.f32 q15, q5, d1[1] \n" + + "vmla.f32 q8, q6, d2[0] \n"// sum0 += (a10-a17) * k01 + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n"// sum1 += (a10-a17) * k11 + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n"// sum2 += (a10-a17) * k21 + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n"// sum3 += (a10-a17) * k31 + "vmla.f32 q15, q7, d3[1] \n" + + "pld [%4, #512] \n" + "vldm %4!, {d8-d15} \n"// data + + "vmla.f32 q8, q4, d4[0] \n"// sum0 += (a20-a27) * k02 + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n"// sum1 += (a20-a27) * k12 + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n"// sum2 += (a20-a27) * k22 + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n"// sum3 += (a20-a27) * k32 + "vmla.f32 q15, q5, d5[1] \n" + + "vmla.f32 q8, q6, d6[0] \n"// sum0 += (a30-a37) * k03 + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n"// sum1 += (a30-a37) * k13 + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n"// sum2 += (a30-a37) * k23 + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n"// sum3 += (a30-a37) * k33 + "vmla.f32 q15, q7, d7[1] \n" + + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "1: \n" + // remain loop + "and r4, %12, #3 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" + + "2: \n"// for(; remain != 0; remain--) + + "pld [%5, #128] \n" + "vld1.f32 {d0-d1}, [%5]! \n" + "pld [%4, #256] \n" + "vld1.f32 {d8-d11}, [%4]! \n" + + "vmla.f32 q8, q4, d0[0] \n"// sum0 += (a00-a70) * k00 + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n"// sum1 += (a00-a70) * k10 + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n"// sum2 += (a00-a70) * k20 + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n"// sum3 += (a00-a70) * k30 + "vmla.f32 q15, q5, d1[1] \n" + + "subs r4, r4, #1 \n" + "bne 2b \n" + + "3: \n"// store the result to memory + "vst1.f32 {d16-d19}, [%0] \n" + "vst1.f32 {d20-d23}, [%1] \n" + "vst1.f32 {d24-d27}, [%2] \n" + "vst1.f32 {d28-d31}, [%3] \n" + + : "=r"(output0), // %0 + "=r"(output1), // %1 + "=r"(output2), // %2 + "=r"(output3), // %3 + "=r"(vb), // %4 + "=r"(va) // %5 + : "0"(output0), + "1"(output1), + "2"(output2), + "3"(output3), + "4"(vb), + "5"(va), + "r"(L), // %12 + "r"(biasptr) // %13 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif // __aarch64__ +#else + float sum0[8] = {0}; + float sum1[8] = {0}; + float sum2[8] = {0}; + float sum3[8] = {0}; + + int k=0; + for (; k+7> 2 + "cmp w4, #0 \n" + "beq 1f \n" + + "eor v16.16b, v16.16b, v16.16b \n" // sum0 + "eor v17.16b, v17.16b, v17.16b \n" // sum1 + "eor v18.16b, v18.16b, v18.16b \n" // sum2 + "eor v19.16b, v19.16b, v19.16b \n" // sum3 + + "0: \n"// for (; k+3> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "veor q8, q8, q8 \n" + "veor q9, q9, q9 \n" + "veor q10, q10, q10 \n" + "veor q11, q11, q11 \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%5, #512] \n" + "vldm %5!, {d0-d7} \n"// kernel + "pld [%4, #128] \n" + "vld1.f32 {d8-d9}, [%4]! \n"// data + + "vmla.f32 q8, q0, d8[0] \n"// (k00-k30) * a00 + "vmla.f32 q9, q1, d8[1] \n"// (k01-k31) * a01 + "vmla.f32 q10, q2, d9[0] \n"// (k02-k32) * a02 + "vmla.f32 q11, q3, d9[1] \n"// (k03-k33) * a03 + + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "vadd.f32 q8, q8, q9 \n" + "vadd.f32 q10, q10, q11 \n" + "vadd.f32 q8, q8, q10 \n" + "vadd.f32 q12, q12, q8 \n" + + "1: \n" + // remain loop + "and r4, %12, #3 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" + + "2: \n"// for(; remain != 0; remain--) + "pld [%5, #128] \n" + "vld1.f32 {d0-d1}, [%5]! \n" + "pld [%4, #32] \n" + "vld1.f32 {d8[],d9[]}, [%4]! \n" + + "subs r4, r4, #1 \n" + + "vmla.f32 q12, q0, q4 \n" + "bne 2b \n" + + "3: \n"// store the result to memory + "vst1.f32 {d24[0]}, [%0] \n" + "vst1.f32 {d24[1]}, [%1] \n" + "vst1.f32 {d25[0]}, [%2] \n" + "vst1.f32 {d25[1]}, [%3] \n" + + : "=r"(output0), // %0 + "=r"(output1), // %1 + "=r"(output2), // %2 + "=r"(output3), // %3 + "=r"(vb), // %4 + "=r"(va) // %5 + : "0"(output0), + "1"(output1), + "2"(output2), + "3"(output3), + "4"(vb), + "5"(va), + "r"(L), // %12 + "r"(biasptr) // %13 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12" + ); +#endif // __aarch64__ +#else + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + + for (int k=0; k> 2 + "cmp w4, #0 \n" + "beq 1f \n" + + "0: \n"// for (; k+3> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n" + + "pld [%1, #512] \n" + "vldm %1!, {d8-d15} \n" + "pld [%2, #128] \n" + "vld1.f32 {d0-d1}, [%2]! \n" + + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + + "pld [%1, #512] \n" + "vldm %1!, {d24-d31} \n" + + "vmla.f32 q8, q6, d0[1] \n" + "vmla.f32 q9, q7, d0[1] \n" + + "subs r4, r4, #1 \n" + + "vmla.f32 q8, q12, d1[0] \n" + "vmla.f32 q9, q13, d1[0] \n" + "vmla.f32 q8, q14, d1[1] \n" + "vmla.f32 q9, q15, d1[1] \n" + + "bne 0b \n" + + "1: \n" + // remain loop + "and r4, %6, #3 \n"// r4 = remain = inch & 3; + "cmp r4, #0 \n" + "beq 3f \n" + + "2: \n" + "pld [%1, #256] \n" + "vld1.f32 {d8-d11}, [%1]! \n" + "pld [%2, #32] \n" + "vld1.f32 {d0[],d1[]}, [%2]! \n" + + "subs r4, r4, #1 \n" + + "vmla.f32 q8, q4, q0 \n" + "vmla.f32 q9, q5, q0 \n" + "bne 2b \n" + + "3: \n" + "vst1.f32 {d16-d19}, [%0] \n" + + : "=r"(output), // %0 + "=r"(vb), // %1 + "=r"(va) // %2 + : "0"(output), + "1"(vb), + "2"(va), + "r"(L), // %6 + "r"(bias0) // %7 + : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q12", "q13", "q14", "q15" + ); +#endif // __aarch64__ +#else + float sum[8] = {0}; + + int k=0; + for (; k+7create_pipeline(opt_cpu); } @@ -80,7 +80,7 @@ int Deconvolution_arm::destroy_pipeline(const Option& opt) if (activation) { Option opt_cpu = opt; - opt_cpu.vulkan_compute = false; + opt_cpu.use_vulkan_compute = false; activation->destroy_pipeline(opt_cpu); delete activation; activation = 0; diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index de2329157..8a11248d6 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -65,7 +65,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) if (activation) { Option opt_cpu = opt; - opt_cpu.vulkan_compute = false; + opt_cpu.use_vulkan_compute = false; activation->create_pipeline(opt_cpu); } @@ -77,7 +77,7 @@ int DeconvolutionDepthWise_arm::destroy_pipeline(const Option& opt) if (activation) { Option opt_cpu = opt; - opt_cpu.vulkan_compute = false; + opt_cpu.use_vulkan_compute = false; activation->destroy_pipeline(opt_cpu); delete activation; activation = 0; diff --git a/src/net.cpp b/src/net.cpp index 4c6237ffd..23d3ac205 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -832,18 +832,38 @@ int Net::load_model(const unsigned char* _mem) return mem - _mem; } -void Net::fuse_network() +int Net::fuse_network() { // set the int8 op fusion:requantize #if NCNN_STRING && NCNN_REQUANT // fprintf(stderr, "Test op fusion to int8 implement:\n"); + // parse the network whether is a quantization model + bool net_quantized = false; + for (size_t i=0; itype == "Convolution" || layer->type == "ConvolutionDepthWise") + { + if (layer->type == "Convolution" && (((Convolution*)layer)->use_int8_inference == false)) + continue; + if (layer->type == "ConvolutionDepthWise" && (((ConvolutionDepthWise*)layer)->use_int8_inference == false)) + continue; + net_quantized = true; + } + } + + if (net_quantized == false) + return 0; + for (size_t i=0; itype == "Convolution" || layer->type == "ConvolutionDepthWise") { - if (((Convolution*)layer)->use_int8_inference == false) + if (layer->type == "Convolution" && (((Convolution*)layer)->use_int8_inference == false)) + continue; + if (layer->type == "ConvolutionDepthWise" && (((ConvolutionDepthWise*)layer)->use_int8_inference == false)) continue; for (size_t n=0; ntops[0]].consumers.size(); n++) @@ -858,6 +878,11 @@ void Net::fuse_network() if (layer_next_2->type == "Convolution" || layer_next_2->type == "ConvolutionDepthWise") { + if (layer_next_2->type == "Convolution" && ((Convolution*)layer_next_2)->use_int8_inference == false) + continue; + if (layer_next_2->type == "ConvolutionDepthWise" && ((ConvolutionDepthWise*)layer_next_2)->use_int8_inference == false) + continue; + // fprintf(stderr, "%s, %s, %s\n", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); if (layer->type == "Convolution" && layer_next_2->type == "Convolution") { @@ -934,6 +959,7 @@ void Net::fuse_network() } } #endif + return 0; } void Net::clear() diff --git a/src/net.h b/src/net.h index 5d5df01fc..5e89f2291 100644 --- a/src/net.h +++ b/src/net.h @@ -96,7 +96,7 @@ public: protected: // parse the structure of network // fuse int8 op dequantize and quantize by requantize - void fuse_network(); + int fuse_network(); #if NCNN_VULKAN