diff --git a/src/layer/arm/convolution_1x1_int8.h b/src/layer/arm/convolution_1x1_int8.h index c5985295e..0d135a51b 100644 --- a/src/layer/arm/convolution_1x1_int8.h +++ b/src/layer/arm/convolution_1x1_int8.h @@ -16,6 +16,91 @@ #include #endif // __ARM_NEON +#if __aarch64__ +static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + int q = 0; + + for (; q+7 0; remain--) + { + //ToDo Neon + int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; + + *outptr0 += sum0; + + r0++; + r1++; + r2++; + r3++; + r4++; + r5++; + r6++; + r7++; + outptr0++; + } + } + + for (; q 0; remain--) + { + int sum0 = (int)(*r0) * (int)k0; + + *outptr0 += sum0; + + r0++; + outptr0++; + } + } + } +} +#else // __aarch64__ /* * Convolution 1x1 quantized with int8,unroll 8 x 4 */ @@ -1366,7 +1451,7 @@ static void conv1x1s1_neon_s8_left4(const Mat& bottom_blob, Mat& top_blob, const } } -static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) +static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) { int size = top_blob.h * top_blob.w; int remain = size & 7; @@ -1391,3 +1476,105 @@ static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const return; } +#endif // __aarch64__ + +static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2*outw + w; + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + int q = 0; + + for (; q+7 0; remain--) + { + //ToDo Neon + int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; + + *outptr0 += sum0; + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + r4 += 2; + r5 += 2; + r6 += 2; + r7 += 2; + outptr0++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + r5 += tailstep; + r6 += tailstep; + r7 += tailstep; + } + } + + for (; q 0; remain--) + { + //ToDo Neon + int sum0 = (int)*r0 * (int)kernel0[0]; + + *outptr0 += sum0; + + r0 += 2; + outptr0++; + } + + r0 += tailstep; + } + } + } +} diff --git a/src/layer/arm/convolution_3x3_int8.h b/src/layer/arm/convolution_3x3_int8.h new file mode 100644 index 000000000..974931fb9 --- /dev/null +++ b/src/layer/arm/convolution_3x3_int8.h @@ -0,0 +1,149 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * inch * 9; + + for (int q = 0; q < inch; q++) + { + int *outptr0 = out0; + + const signed char *img0 = bottom_blob.channel(q); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + for (int i = 0; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + int sum0 = 0; + + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; + + *outptr0 += sum0; + + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + kernel0 += 9; + } + } +} + +static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + out0.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * inch * 9; + + for (int q = 0; q < inch; q++) + { + int *outptr0 = out0; + + const signed char *img0 = bottom_blob.channel(q); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + for (int i = 0; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + int sum0 = 0; + + sum0 += (int)r0[0] * (int)kernel0[0]; + sum0 += (int)r0[1] * (int)kernel0[1]; + sum0 += (int)r0[2] * (int)kernel0[2]; + sum0 += (int)r1[0] * (int)kernel0[3]; + sum0 += (int)r1[1] * (int)kernel0[4]; + sum0 += (int)r1[2] * (int)kernel0[5]; + sum0 += (int)r2[0] * (int)kernel0[6]; + sum0 += (int)r2[1] * (int)kernel0[7]; + sum0 += (int)r2[2] * (int)kernel0[8]; + + *outptr0 += sum0; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr0++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + + kernel0 += 9; + } + } +} diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index f241092d1..4e6e339f2 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -23,9 +23,8 @@ namespace ncnn { #include "convolution_5x5.h" #include "convolution_7x7.h" -#if !__aarch64__ #include "convolution_1x1_int8.h" -#endif // !__aarch64__ +#include "convolution_3x3_int8.h" DEFINE_LAYER_CREATOR(Convolution_arm) @@ -221,19 +220,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option return Convolution::forward(bottom_blob, top_blob, opt); } -#if __aarch64__ - if (use_int8_inference) - { - // TODO - return Convolution::forward(bottom_blob, top_blob, opt); - } -#else - if (use_int8_inference && (kernel_size != 1 || stride != 1)) - { - return Convolution::forward(bottom_blob, top_blob, opt); - } -#endif - typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); // kernel_size x stride @@ -283,10 +269,58 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option } // kernel_size = 7 }; + typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&); + + // kernel_size x stride + conv_int8_func conv_int8_func_table[5][5] = + { + { + conv1x1s1_int8_neon, + conv1x1s2_int8_neon, + 0, + 0, + 0 + }, // kernel_size = 1 + { + 0, + 0, + 0, + 0, + 0 + }, // kernel_size = 2 + { + conv3x3s1_int8_neon, + conv3x3s2_int8_neon, + 0, + 0, + 0 + }, // kernel_size = 3 + { + 0, + 0, + 0, + 0, + 0 + }, // kernel_size = 4 + { + 0, + 0, + 0, + 0, + 0 + } // kernel_size = 5 + }; + conv_func conv = 0; + conv_int8_func conv_int8 = 0; if (use_int8_inference) { + conv_int8 = conv_int8_func_table[kernel_size-1][stride-1]; + if (!conv_int8) + { + return Convolution::forward(bottom_blob, top_blob, opt); + } } else { @@ -339,11 +373,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option if (top_blob.empty()) return -100; -#if !__aarch64__ if (use_int8_inference) { - // kernel_size = 1 - // stride = 1 Mat bottom_blob_bordered_int8; bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); if (bottom_blob_bordered_int8.empty()) @@ -359,7 +390,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt); } - conv1x1s1_neon_s8_inter(bottom_blob_bordered_int8, top_blob, weight_data, opt); + conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt); // dequantize, reverse scale inplace { @@ -382,7 +413,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option return 0; } -#endif if (use_winograd3x3 && w <= 120 && h <= 120) { diff --git a/src/layer/arm/convolutiondepthwise_3x3_int8.h b/src/layer/arm/convolutiondepthwise_3x3_int8.h new file mode 100644 index 000000000..73c11d3ed --- /dev/null +++ b/src/layer/arm/convolutiondepthwise_3x3_int8.h @@ -0,0 +1,142 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + //int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out = top_blob.channel(p); + + out.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * 9; + + int *outptr = out; + + const signed char *img0 = bottom_blob.channel(p); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + int i = 0; + for (; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + + int sum = 0; + + sum += (int)r0[0] * (int)kernel0[0]; + sum += (int)r0[1] * (int)kernel0[1]; + sum += (int)r0[2] * (int)kernel0[2]; + sum += (int)r1[0] * (int)kernel0[3]; + sum += (int)r1[1] * (int)kernel0[4]; + sum += (int)r1[2] * (int)kernel0[5]; + sum += (int)r2[0] * (int)kernel0[6]; + sum += (int)r2[1] * (int)kernel0[7]; + sum += (int)r2[2] * (int)kernel0[8]; + + *outptr += sum; + + r0++; + r1++; + r2++; + outptr++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +} + +static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + //int h = bottom_blob.h; + //int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const signed char *kernel = _kernel; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out = top_blob.channel(p); + out.fill(0); + + const signed char *kernel0 = (const signed char *)kernel + p * 9; + + int *outptr = out; + + const signed char *img0 = bottom_blob.channel(p); + + const signed char *r0 = img0; + const signed char *r1 = img0 + w; + const signed char *r2 = img0 + w * 2; + + int i = 0; + + for (; i < outh; i++) + { + int remain = outw; + + for (; remain > 0; remain--) + { + int sum = 0; + + sum += (int)r0[0] * (int)kernel0[0]; + sum += (int)r0[1] * (int)kernel0[1]; + sum += (int)r0[2] * (int)kernel0[2]; + sum += (int)r1[0] * (int)kernel0[3]; + sum += (int)r1[1] * (int)kernel0[4]; + sum += (int)r1[2] * (int)kernel0[5]; + sum += (int)r2[0] * (int)kernel0[6]; + sum += (int)r2[1] * (int)kernel0[7]; + sum += (int)r2[2] * (int)kernel0[8]; + + *outptr += sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index 1d938912f..e48820b38 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -24,6 +24,8 @@ namespace ncnn { #include "convolutiondepthwise_3x3.h" +#include "convolutiondepthwise_3x3_int8.h" + DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm) ConvolutionDepthWise_arm::ConvolutionDepthWise_arm() @@ -56,7 +58,13 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb) if (channels == group && group == num_output) { // depth-wise specific - return 0; + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) + { + if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) + { + return 0; + } + } } const int channels_g = channels / group; @@ -66,7 +74,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb) for (int g=0; gload_param(pd); // set weights @@ -107,12 +121,6 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con // convolv with NxN kernel // value = value + bias - if (use_int8_inference) - { - // TODO - return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt); - } - int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; @@ -159,75 +167,91 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con if (top_blob.empty()) return -100; - const int maxk = kernel_w * kernel_h; - // depth-wise if (channels == group && group == num_output) { - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) + if (use_int8_inference) { - if (stride_w == 1 && stride_h == 1) + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { - convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); - return 0; + if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) + { + Mat bottom_blob_bordered_int8; + bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); + if (bottom_blob_bordered_int8.empty()) + return -100; + + // quantize, scale and round to nearest + #pragma omp parallel for num_threads(opt.num_threads) + for (int g=0; gforward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g); + } + + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt); + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt); + } + + // dequantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int g=0; gforward_inplace(top_blob_g, opt_g); + } + + return 0; + } } - else if (stride_w == 2 && stride_h == 2) + } + else + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { - convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); - return 0; + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + return 0; + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + return 0; + } } } -#ifdef _OPENMP - int nested_current = omp_get_nested(); - omp_set_nested(0); -#endif - #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_param(pd); - - // set weights - ncnn::Mat weights[2]; - weights[0] = weight_data_g; - weights[1] = bias_data_g; - - op->load_model(ModelBinFromMatArray(weights)); - // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt); + const ncnn::Layer* op = group_ops[g]; - delete op; + ncnn::Option opt_g = opt; + opt_g.num_threads = 1; + opt_g.blob_allocator = top_blob.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); } -#ifdef _OPENMP - omp_set_nested(nested_current); -#endif return 0; } @@ -241,8 +265,11 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con const ncnn::Layer* op = group_ops[g]; + ncnn::Option opt_g = opt; + opt_g.blob_allocator = top_blob.allocator; + // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt); + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); } return 0; diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index b6466934a..ac6e98dac 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -53,6 +53,9 @@ int Convolution::load_param(const ParamDict& pd) use_int8_inference = pd.use_int8_inference; + if (weight_data_int8_scale == 0.f || bottom_blob_int8_scale == 0.f) + use_int8_inference = false; + return 0; } diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp index 5d416a82c..cc375a8ab 100644 --- a/src/layer/convolutiondepthwise.cpp +++ b/src/layer/convolutiondepthwise.cpp @@ -64,6 +64,9 @@ int ConvolutionDepthWise::load_param(const ParamDict& pd) return -100; } + if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty()) + use_int8_inference = false; + // extend group if only one provided if (weight_data_int8_scales.w == 1) { @@ -121,7 +124,30 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) for (int g=0; gload_param(pd); + } + + for (int g=0; gload_param(pd); + + ncnn::Mat weights[1]; + weights[0] = Mat(1, (void*)((const float*)bias_data + g)); + + dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); } } @@ -252,11 +278,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O { // quantize, scale and round to nearest { - ncnn::ParamDict pd; - pd.set(0, bottom_blob_int8_scales[g]);// scale - - quantize_ops[g]->load_param(pd); - ncnn::Option opt_g = opt; opt_g.num_threads = 1; opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; @@ -293,20 +314,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O // dequantize, reverse scale inplace { - float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - ncnn::ParamDict pd; - pd.set(0, top_rescale);// scale - pd.set(1, bias_term);// bias_term - pd.set(2, 1);// bias_data_size - - dequantize_ops[g]->load_param(pd); - - ncnn::Mat weights[1]; - weights[0] = Mat(1, (void*)((const float*)bias_data + g)); - - dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); - ncnn::Option opt_g = opt; opt_g.num_threads = 1; opt_g.blob_allocator = top_blob.allocator; @@ -325,11 +332,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_param(pd); - ncnn::Option opt_g = opt; opt_g.num_threads = 1; opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; @@ -387,20 +389,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_param(pd); - - ncnn::Mat weights[1]; - weights[0] = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); - - dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); - ncnn::Option opt_g = opt; opt_g.num_threads = 1; opt_g.blob_allocator = top_blob.allocator; diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index ce09b013b..ce2eaf207 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -28,6 +28,94 @@ namespace ncnn { DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86) +ConvolutionDepthWise_x86::ConvolutionDepthWise_x86() +{ +} + +ConvolutionDepthWise_x86::~ConvolutionDepthWise_x86() +{ + for (int i=0; i<(int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); +} + +int ConvolutionDepthWise_x86::load_model(const ModelBin& mb) +{ + int ret = ConvolutionDepthWise::load_model(mb); + if (ret != 0) + return ret; + + // create Convolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i=0; i<(int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + if (channels == group && group == num_output) + { + // depth-wise specific + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) + { + if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) + { + return 0; + } + } + } + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g=0; gload_param(pd); + + // set weights + ncnn::Mat weights[2]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + + group_ops[g] = op; + } + + return 0; +} + int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel @@ -79,8 +167,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con if (top_blob.empty()) return -100; - const int maxk = kernel_w * kernel_h; - // depth-wise if (channels == group && group == num_output) { @@ -99,11 +185,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_param(pd); - ncnn::Option opt_g = opt; opt_g.num_threads = 1; opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; @@ -126,20 +207,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gload_param(pd); - - ncnn::Mat weights[1]; - weights[0] = Mat(1, (void*)((const float*)bias_data + g)); - - dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); - ncnn::Option opt_g = opt; opt_g.num_threads = 1; opt_g.blob_allocator = top_blob.allocator; @@ -174,42 +241,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con { Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g)); Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator); - Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize); - Mat bias_data_g; - if (bias_term) - bias_data_g = Mat(1, (void*)((const float*)bias_data + g)); - - // call Convolution - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); - - // set param - ncnn::ParamDict pd; - pd.set(0, 1);// num_output - pd.set(1, kernel_w); - pd.set(11, kernel_h); - pd.set(2, dilation_w); - pd.set(12, dilation_h); - pd.set(3, stride_w); - pd.set(13, stride_h); - pd.set(4, 0);// pad_w - pd.set(14, 0);// pad_h - pd.set(5, bias_term); - pd.set(6, maxk);// weight_data_size - - if (use_int8_inference) - { - pd.set(8, weight_data_int8_scales[g]); - pd.set(9, bottom_blob_int8_scales[g]); - } - - op->load_param(pd); - - // set weights - ncnn::Mat weights[2]; - weights[0] = weight_data_g; - weights[1] = bias_data_g; - op->load_model(ModelBinFromMatArray(weights)); + const ncnn::Layer* op = group_ops[g]; ncnn::Option opt_g = opt; opt_g.num_threads = 1; @@ -217,8 +250,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con // forward op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); - - delete op; } return 0; @@ -231,50 +262,14 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con { Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g)); Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g), top_blob.elemsize, top_blob.allocator); - Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize); - Mat bias_data_g; - if (bias_term) - bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); - - // call Convolution - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); - // set param - ncnn::ParamDict pd; - pd.set(0, num_output_g);// num_output - pd.set(1, kernel_w); - pd.set(11, kernel_h); - pd.set(2, dilation_w); - pd.set(12, dilation_h); - pd.set(3, stride_w); - pd.set(13, stride_h); - pd.set(4, 0);// pad_w - pd.set(14, 0);// pad_h - pd.set(5, bias_term); - pd.set(6, maxk * channels_g * num_output_g);// weight_data_size - - if (use_int8_inference) - { - pd.set(8, weight_data_int8_scales[g]); - pd.set(9, bottom_blob_int8_scales[g]); - } - - op->load_param(pd); - - // set weights - ncnn::Mat weights[2]; - weights[0] = weight_data_g; - weights[1] = bias_data_g; - - op->load_model(ModelBinFromMatArray(weights)); + const ncnn::Layer* op = group_ops[g]; ncnn::Option opt_g = opt; opt_g.blob_allocator = top_blob.allocator; // forward op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); - - delete op; } return 0; diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h index 82352312e..6cec2538b 100644 --- a/src/layer/x86/convolutiondepthwise_x86.h +++ b/src/layer/x86/convolutiondepthwise_x86.h @@ -22,7 +22,15 @@ namespace ncnn { class ConvolutionDepthWise_x86 : public ConvolutionDepthWise { public: + ConvolutionDepthWise_x86(); + virtual ~ConvolutionDepthWise_x86(); + + virtual int load_model(const ModelBin& mb); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + std::vector group_ops; }; } // namespace ncnn