| @@ -16,6 +16,91 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| #if __aarch64__ | |||
| static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const float *kernel = _kernel; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out0 = top_blob.channel(p); | |||
| out0.fill(0); | |||
| int q = 0; | |||
| for (; q+7<inch; q+=8) | |||
| { | |||
| int* outptr0 = out0; | |||
| const signed char *kernel0 = (const signed char *)kernel + p * inch + q; | |||
| const signed char *r0 = bottom_blob.channel(q); | |||
| const signed char *r1 = bottom_blob.channel(q + 1); | |||
| const signed char *r2 = bottom_blob.channel(q + 2); | |||
| const signed char *r3 = bottom_blob.channel(q + 3); | |||
| const signed char *r4 = bottom_blob.channel(q + 4); | |||
| const signed char *r5 = bottom_blob.channel(q + 5); | |||
| const signed char *r6 = bottom_blob.channel(q + 6); | |||
| const signed char *r7 = bottom_blob.channel(q + 7); | |||
| int size = outw * outh; | |||
| int remain = size; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| //ToDo Neon | |||
| int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + | |||
| (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + | |||
| (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + | |||
| (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; | |||
| *outptr0 += sum0; | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| r3++; | |||
| r4++; | |||
| r5++; | |||
| r6++; | |||
| r7++; | |||
| outptr0++; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| int* outptr0 = out0; | |||
| const signed char *r0 = bottom_blob.channel(q); | |||
| const signed char *kernel0 = (const signed char *)kernel + p * inch + q; | |||
| const signed char k0 = kernel0[0]; | |||
| int size = outw * outh; | |||
| int remain = size; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| int sum0 = (int)(*r0) * (int)k0; | |||
| *outptr0 += sum0; | |||
| r0++; | |||
| outptr0++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #else // __aarch64__ | |||
| /* | |||
| * Convolution 1x1 quantized with int8,unroll 8 x 4 | |||
| */ | |||
| @@ -1366,7 +1451,7 @@ static void conv1x1s1_neon_s8_left4(const Mat& bottom_blob, Mat& top_blob, const | |||
| } | |||
| } | |||
| static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) | |||
| static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) | |||
| { | |||
| int size = top_blob.h * top_blob.w; | |||
| int remain = size & 7; | |||
| @@ -1391,3 +1476,105 @@ static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const | |||
| return; | |||
| } | |||
| #endif // __aarch64__ | |||
| static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2*outw + w; | |||
| const signed char *kernel = _kernel; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out0 = top_blob.channel(p); | |||
| out0.fill(0); | |||
| int q = 0; | |||
| for (; q+7<inch; q+=8) | |||
| { | |||
| int* outptr0 = out0; | |||
| const signed char *kernel0 = (const signed char *)kernel + p * inch + q; | |||
| const signed char *r0 = bottom_blob.channel(q); | |||
| const signed char *r1 = bottom_blob.channel(q + 1); | |||
| const signed char *r2 = bottom_blob.channel(q + 2); | |||
| const signed char *r3 = bottom_blob.channel(q + 3); | |||
| const signed char *r4 = bottom_blob.channel(q + 4); | |||
| const signed char *r5 = bottom_blob.channel(q + 5); | |||
| const signed char *r6 = bottom_blob.channel(q + 6); | |||
| const signed char *r7 = bottom_blob.channel(q + 7); | |||
| for(int i = 0; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| //ToDo Neon | |||
| int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + | |||
| (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + | |||
| (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + | |||
| (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; | |||
| *outptr0 += sum0; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| r3 += 2; | |||
| r4 += 2; | |||
| r5 += 2; | |||
| r6 += 2; | |||
| r7 += 2; | |||
| outptr0++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| r3 += tailstep; | |||
| r4 += tailstep; | |||
| r5 += tailstep; | |||
| r6 += tailstep; | |||
| r7 += tailstep; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| int* outptr0 = out0; | |||
| const signed char *r0 = bottom_blob.channel(q); | |||
| const signed char *kernel0 = (const signed char *)kernel + p * inch + q; | |||
| for(int i = 0; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| //ToDo Neon | |||
| int sum0 = (int)*r0 * (int)kernel0[0]; | |||
| *outptr0 += sum0; | |||
| r0 += 2; | |||
| outptr0++; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,149 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| //int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const signed char *kernel = _kernel; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out0 = top_blob.channel(p); | |||
| out0.fill(0); | |||
| const signed char *kernel0 = (const signed char *)kernel + p * inch * 9; | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| int *outptr0 = out0; | |||
| const signed char *img0 = bottom_blob.channel(q); | |||
| const signed char *r0 = img0; | |||
| const signed char *r1 = img0 + w; | |||
| const signed char *r2 = img0 + w * 2; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| int sum0 = 0; | |||
| sum0 += (int)r0[0] * kernel0[0]; | |||
| sum0 += (int)r0[1] * kernel0[1]; | |||
| sum0 += (int)r0[2] * kernel0[2]; | |||
| sum0 += (int)r1[0] * kernel0[3]; | |||
| sum0 += (int)r1[1] * kernel0[4]; | |||
| sum0 += (int)r1[2] * kernel0[5]; | |||
| sum0 += (int)r2[0] * kernel0[6]; | |||
| sum0 += (int)r2[1] * kernel0[7]; | |||
| sum0 += (int)r2[2] * kernel0[8]; | |||
| *outptr0 += sum0; | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| outptr0++; | |||
| } | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| } | |||
| kernel0 += 9; | |||
| } | |||
| } | |||
| } | |||
| static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| //int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2 * outw + w; | |||
| const signed char *kernel = _kernel; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out0 = top_blob.channel(p); | |||
| out0.fill(0); | |||
| const signed char *kernel0 = (const signed char *)kernel + p * inch * 9; | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| int *outptr0 = out0; | |||
| const signed char *img0 = bottom_blob.channel(q); | |||
| const signed char *r0 = img0; | |||
| const signed char *r1 = img0 + w; | |||
| const signed char *r2 = img0 + w * 2; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| int sum0 = 0; | |||
| sum0 += (int)r0[0] * (int)kernel0[0]; | |||
| sum0 += (int)r0[1] * (int)kernel0[1]; | |||
| sum0 += (int)r0[2] * (int)kernel0[2]; | |||
| sum0 += (int)r1[0] * (int)kernel0[3]; | |||
| sum0 += (int)r1[1] * (int)kernel0[4]; | |||
| sum0 += (int)r1[2] * (int)kernel0[5]; | |||
| sum0 += (int)r2[0] * (int)kernel0[6]; | |||
| sum0 += (int)r2[1] * (int)kernel0[7]; | |||
| sum0 += (int)r2[2] * (int)kernel0[8]; | |||
| *outptr0 += sum0; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| outptr0++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| } | |||
| kernel0 += 9; | |||
| } | |||
| } | |||
| } | |||
| @@ -23,9 +23,8 @@ namespace ncnn { | |||
| #include "convolution_5x5.h" | |||
| #include "convolution_7x7.h" | |||
| #if !__aarch64__ | |||
| #include "convolution_1x1_int8.h" | |||
| #endif // !__aarch64__ | |||
| #include "convolution_3x3_int8.h" | |||
| DEFINE_LAYER_CREATOR(Convolution_arm) | |||
| @@ -221,19 +220,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| #if __aarch64__ | |||
| if (use_int8_inference) | |||
| { | |||
| // TODO | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| #else | |||
| if (use_int8_inference && (kernel_size != 1 || stride != 1)) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| #endif | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); | |||
| // kernel_size x stride | |||
| @@ -283,10 +269,58 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| } // kernel_size = 7 | |||
| }; | |||
| typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&); | |||
| // kernel_size x stride | |||
| conv_int8_func conv_int8_func_table[5][5] = | |||
| { | |||
| { | |||
| conv1x1s1_int8_neon, | |||
| conv1x1s2_int8_neon, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 1 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 2 | |||
| { | |||
| conv3x3s1_int8_neon, | |||
| conv3x3s2_int8_neon, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 3 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 4 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| } // kernel_size = 5 | |||
| }; | |||
| conv_func conv = 0; | |||
| conv_int8_func conv_int8 = 0; | |||
| if (use_int8_inference) | |||
| { | |||
| conv_int8 = conv_int8_func_table[kernel_size-1][stride-1]; | |||
| if (!conv_int8) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| @@ -339,11 +373,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| #if !__aarch64__ | |||
| if (use_int8_inference) | |||
| { | |||
| // kernel_size = 1 | |||
| // stride = 1 | |||
| Mat bottom_blob_bordered_int8; | |||
| bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); | |||
| if (bottom_blob_bordered_int8.empty()) | |||
| @@ -359,7 +390,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt); | |||
| } | |||
| conv1x1s1_neon_s8_inter(bottom_blob_bordered_int8, top_blob, weight_data, opt); | |||
| conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt); | |||
| // dequantize, reverse scale inplace | |||
| { | |||
| @@ -382,7 +413,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| return 0; | |||
| } | |||
| #endif | |||
| if (use_winograd3x3 && w <= 120 && h <= 120) | |||
| { | |||
| @@ -0,0 +1,142 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| //int h = bottom_blob.h; | |||
| //int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const signed char *kernel = _kernel; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| out.fill(0); | |||
| const signed char *kernel0 = (const signed char *)kernel + p * 9; | |||
| int *outptr = out; | |||
| const signed char *img0 = bottom_blob.channel(p); | |||
| const signed char *r0 = img0; | |||
| const signed char *r1 = img0 + w; | |||
| const signed char *r2 = img0 + w * 2; | |||
| int i = 0; | |||
| for (; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| int sum = 0; | |||
| sum += (int)r0[0] * (int)kernel0[0]; | |||
| sum += (int)r0[1] * (int)kernel0[1]; | |||
| sum += (int)r0[2] * (int)kernel0[2]; | |||
| sum += (int)r1[0] * (int)kernel0[3]; | |||
| sum += (int)r1[1] * (int)kernel0[4]; | |||
| sum += (int)r1[2] * (int)kernel0[5]; | |||
| sum += (int)r2[0] * (int)kernel0[6]; | |||
| sum += (int)r2[1] * (int)kernel0[7]; | |||
| sum += (int)r2[2] * (int)kernel0[8]; | |||
| *outptr += sum; | |||
| r0++; | |||
| r1++; | |||
| r2++; | |||
| outptr++; | |||
| } | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| } | |||
| } | |||
| } | |||
| static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| //int h = bottom_blob.h; | |||
| //int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2 * outw + w; | |||
| const signed char *kernel = _kernel; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| out.fill(0); | |||
| const signed char *kernel0 = (const signed char *)kernel + p * 9; | |||
| int *outptr = out; | |||
| const signed char *img0 = bottom_blob.channel(p); | |||
| const signed char *r0 = img0; | |||
| const signed char *r1 = img0 + w; | |||
| const signed char *r2 = img0 + w * 2; | |||
| int i = 0; | |||
| for (; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| int sum = 0; | |||
| sum += (int)r0[0] * (int)kernel0[0]; | |||
| sum += (int)r0[1] * (int)kernel0[1]; | |||
| sum += (int)r0[2] * (int)kernel0[2]; | |||
| sum += (int)r1[0] * (int)kernel0[3]; | |||
| sum += (int)r1[1] * (int)kernel0[4]; | |||
| sum += (int)r1[2] * (int)kernel0[5]; | |||
| sum += (int)r2[0] * (int)kernel0[6]; | |||
| sum += (int)r2[1] * (int)kernel0[7]; | |||
| sum += (int)r2[2] * (int)kernel0[8]; | |||
| *outptr += sum; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| outptr++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| } | |||
| } | |||
| } | |||
| @@ -24,6 +24,8 @@ namespace ncnn { | |||
| #include "convolutiondepthwise_3x3.h" | |||
| #include "convolutiondepthwise_3x3_int8.h" | |||
| DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm) | |||
| ConvolutionDepthWise_arm::ConvolutionDepthWise_arm() | |||
| @@ -56,7 +58,13 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb) | |||
| if (channels == group && group == num_output) | |||
| { | |||
| // depth-wise specific | |||
| return 0; | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) | |||
| { | |||
| return 0; | |||
| } | |||
| } | |||
| } | |||
| const int channels_g = channels / group; | |||
| @@ -66,7 +74,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g)); | |||
| Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); | |||
| @@ -87,6 +95,12 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb) | |||
| pd.set(5, bias_term); | |||
| pd.set(6, maxk * channels_g * num_output_g);// weight_data_size | |||
| if (use_int8_inference) | |||
| { | |||
| pd.set(8, weight_data_int8_scales[g]); | |||
| pd.set(9, bottom_blob_int8_scales[g]); | |||
| } | |||
| op->load_param(pd); | |||
| // set weights | |||
| @@ -107,12 +121,6 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| // convolv with NxN kernel | |||
| // value = value + bias | |||
| if (use_int8_inference) | |||
| { | |||
| // TODO | |||
| return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| @@ -159,75 +167,91 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // depth-wise | |||
| if (channels == group && group == num_output) | |||
| { | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) | |||
| if (use_int8_inference) | |||
| { | |||
| if (stride_w == 1 && stride_h == 1) | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) | |||
| { | |||
| Mat bottom_blob_bordered_int8; | |||
| bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); | |||
| if (bottom_blob_bordered_int8.empty()) | |||
| return -100; | |||
| // quantize, scale and round to nearest | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; | |||
| const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g); | |||
| Mat bottom_blob_bordered_int8_g = bottom_blob_bordered_int8.channel(g); | |||
| quantize_ops[g]->forward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g); | |||
| } | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt); | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt); | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel(g); | |||
| dequantize_ops[g]->forward_inplace(top_blob_g, opt_g); | |||
| } | |||
| return 0; | |||
| } | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| } | |||
| else | |||
| { | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| } | |||
| } | |||
| } | |||
| #ifdef _OPENMP | |||
| int nested_current = omp_get_nested(); | |||
| omp_set_nested(0); | |||
| #endif | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g)); | |||
| Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator); | |||
| Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g)); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = Mat(1, (void*)((const float*)bias_data + g)); | |||
| // call Convolution | |||
| ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); | |||
| // set param | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, 1);// num_output | |||
| pd.set(1, kernel_w); | |||
| pd.set(11, kernel_h); | |||
| pd.set(2, dilation_w); | |||
| pd.set(12, dilation_h); | |||
| pd.set(3, stride_w); | |||
| pd.set(13, stride_h); | |||
| pd.set(4, 0);// pad_w | |||
| pd.set(14, 0);// pad_h | |||
| pd.set(5, bias_term); | |||
| pd.set(6, maxk);// weight_data_size | |||
| op->load_param(pd); | |||
| // set weights | |||
| ncnn::Mat weights[2]; | |||
| weights[0] = weight_data_g; | |||
| weights[1] = bias_data_g; | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt); | |||
| const ncnn::Layer* op = group_ops[g]; | |||
| delete op; | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| } | |||
| #ifdef _OPENMP | |||
| omp_set_nested(nested_current); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -241,8 +265,11 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| const ncnn::Layer* op = group_ops[g]; | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt); | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| } | |||
| return 0; | |||
| @@ -53,6 +53,9 @@ int Convolution::load_param(const ParamDict& pd) | |||
| use_int8_inference = pd.use_int8_inference; | |||
| if (weight_data_int8_scale == 0.f || bottom_blob_int8_scale == 0.f) | |||
| use_int8_inference = false; | |||
| return 0; | |||
| } | |||
| @@ -64,6 +64,9 @@ int ConvolutionDepthWise::load_param(const ParamDict& pd) | |||
| return -100; | |||
| } | |||
| if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty()) | |||
| use_int8_inference = false; | |||
| // extend group if only one provided | |||
| if (weight_data_int8_scales.w == 1) | |||
| { | |||
| @@ -121,7 +124,30 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| quantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Quantize); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, bottom_blob_int8_scales[g]);// scale | |||
| quantize_ops[g]->load_param(pd); | |||
| } | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize); | |||
| float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term);// bias_term | |||
| pd.set(2, 1);// bias_data_size | |||
| dequantize_ops[g]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = Mat(1, (void*)((const float*)bias_data + g)); | |||
| dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); | |||
| } | |||
| } | |||
| @@ -252,11 +278,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O | |||
| { | |||
| // quantize, scale and round to nearest | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, bottom_blob_int8_scales[g]);// scale | |||
| quantize_ops[g]->load_param(pd); | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; | |||
| @@ -293,20 +314,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O | |||
| // dequantize, reverse scale inplace | |||
| { | |||
| float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term);// bias_term | |||
| pd.set(2, 1);// bias_data_size | |||
| dequantize_ops[g]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = Mat(1, (void*)((const float*)bias_data + g)); | |||
| dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| @@ -325,11 +332,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, bottom_blob_int8_scales[g]);// scale | |||
| quantize_ops[g]->load_param(pd); | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; | |||
| @@ -387,20 +389,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term);// bias_term | |||
| pd.set(2, num_output_g);// bias_data_size | |||
| dequantize_ops[g]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); | |||
| dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| @@ -28,6 +28,94 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86) | |||
| ConvolutionDepthWise_x86::ConvolutionDepthWise_x86() | |||
| { | |||
| } | |||
| ConvolutionDepthWise_x86::~ConvolutionDepthWise_x86() | |||
| { | |||
| for (int i=0; i<(int)group_ops.size(); i++) | |||
| delete group_ops[i]; | |||
| group_ops.clear(); | |||
| } | |||
| int ConvolutionDepthWise_x86::load_model(const ModelBin& mb) | |||
| { | |||
| int ret = ConvolutionDepthWise::load_model(mb); | |||
| if (ret != 0) | |||
| return ret; | |||
| // create Convolution op for each group | |||
| const int maxk = kernel_w * kernel_h; | |||
| int channels = (weight_data_size / group) / maxk / (num_output / group) * group; | |||
| for (int i=0; i<(int)group_ops.size(); i++) | |||
| delete group_ops[i]; | |||
| group_ops.clear(); | |||
| if (channels == group && group == num_output) | |||
| { | |||
| // depth-wise specific | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) | |||
| { | |||
| if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) | |||
| { | |||
| return 0; | |||
| } | |||
| } | |||
| } | |||
| const int channels_g = channels / group; | |||
| const int num_output_g = num_output / group; | |||
| group_ops.resize(group); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); | |||
| ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); | |||
| // set param | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, num_output_g);// num_output | |||
| pd.set(1, kernel_w); | |||
| pd.set(11, kernel_h); | |||
| pd.set(2, dilation_w); | |||
| pd.set(12, dilation_h); | |||
| pd.set(3, stride_w); | |||
| pd.set(13, stride_h); | |||
| pd.set(4, 0);// pad_w | |||
| pd.set(14, 0);// pad_h | |||
| pd.set(5, bias_term); | |||
| pd.set(6, maxk * channels_g * num_output_g);// weight_data_size | |||
| if (use_int8_inference) | |||
| { | |||
| pd.set(8, weight_data_int8_scales[g]); | |||
| pd.set(9, bottom_blob_int8_scales[g]); | |||
| } | |||
| op->load_param(pd); | |||
| // set weights | |||
| ncnn::Mat weights[2]; | |||
| weights[0] = weight_data_g; | |||
| weights[1] = bias_data_g; | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| group_ops[g] = op; | |||
| } | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| @@ -79,8 +167,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // depth-wise | |||
| if (channels == group && group == num_output) | |||
| { | |||
| @@ -99,11 +185,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, bottom_blob_int8_scales[g]);// scale | |||
| quantize_ops[g]->load_param(pd); | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = bottom_blob_bordered_int8.allocator; | |||
| @@ -126,20 +207,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term);// bias_term | |||
| pd.set(2, 1);// bias_data_size | |||
| dequantize_ops[g]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = Mat(1, (void*)((const float*)bias_data + g)); | |||
| dequantize_ops[g]->load_model(ModelBinFromMatArray(weights)); | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| @@ -174,42 +241,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g)); | |||
| Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator); | |||
| Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = Mat(1, (void*)((const float*)bias_data + g)); | |||
| // call Convolution | |||
| ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); | |||
| // set param | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, 1);// num_output | |||
| pd.set(1, kernel_w); | |||
| pd.set(11, kernel_h); | |||
| pd.set(2, dilation_w); | |||
| pd.set(12, dilation_h); | |||
| pd.set(3, stride_w); | |||
| pd.set(13, stride_h); | |||
| pd.set(4, 0);// pad_w | |||
| pd.set(14, 0);// pad_h | |||
| pd.set(5, bias_term); | |||
| pd.set(6, maxk);// weight_data_size | |||
| if (use_int8_inference) | |||
| { | |||
| pd.set(8, weight_data_int8_scales[g]); | |||
| pd.set(9, bottom_blob_int8_scales[g]); | |||
| } | |||
| op->load_param(pd); | |||
| // set weights | |||
| ncnn::Mat weights[2]; | |||
| weights[0] = weight_data_g; | |||
| weights[1] = bias_data_g; | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| const ncnn::Layer* op = group_ops[g]; | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| @@ -217,8 +250,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| delete op; | |||
| } | |||
| return 0; | |||
| @@ -231,50 +262,14 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g)); | |||
| Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g), top_blob.elemsize, top_blob.allocator); | |||
| Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize); | |||
| Mat bias_data_g; | |||
| if (bias_term) | |||
| bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g)); | |||
| // call Convolution | |||
| ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); | |||
| // set param | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, num_output_g);// num_output | |||
| pd.set(1, kernel_w); | |||
| pd.set(11, kernel_h); | |||
| pd.set(2, dilation_w); | |||
| pd.set(12, dilation_h); | |||
| pd.set(3, stride_w); | |||
| pd.set(13, stride_h); | |||
| pd.set(4, 0);// pad_w | |||
| pd.set(14, 0);// pad_h | |||
| pd.set(5, bias_term); | |||
| pd.set(6, maxk * channels_g * num_output_g);// weight_data_size | |||
| if (use_int8_inference) | |||
| { | |||
| pd.set(8, weight_data_int8_scales[g]); | |||
| pd.set(9, bottom_blob_int8_scales[g]); | |||
| } | |||
| op->load_param(pd); | |||
| // set weights | |||
| ncnn::Mat weights[2]; | |||
| weights[0] = weight_data_g; | |||
| weights[1] = bias_data_g; | |||
| op->load_model(ModelBinFromMatArray(weights)); | |||
| const ncnn::Layer* op = group_ops[g]; | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| // forward | |||
| op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); | |||
| delete op; | |||
| } | |||
| return 0; | |||
| @@ -22,7 +22,15 @@ namespace ncnn { | |||
| class ConvolutionDepthWise_x86 : public ConvolutionDepthWise | |||
| { | |||
| public: | |||
| ConvolutionDepthWise_x86(); | |||
| virtual ~ConvolutionDepthWise_x86(); | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| std::vector<ncnn::Layer*> group_ops; | |||
| }; | |||
| } // namespace ncnn | |||