Browse Source

add arm int8 convolution stub, preload group op for x86

tags/20180830
nihui 7 years ago
parent
commit
2fe7ada4d8
9 changed files with 742 additions and 213 deletions
  1. +188
    -1
      src/layer/arm/convolution_1x1_int8.h
  2. +149
    -0
      src/layer/arm/convolution_3x3_int8.h
  3. +50
    -20
      src/layer/arm/convolution_arm.cpp
  4. +142
    -0
      src/layer/arm/convolutiondepthwise_3x3_int8.h
  5. +86
    -59
      src/layer/arm/convolutiondepthwise_arm.cpp
  6. +3
    -0
      src/layer/convolution.cpp
  7. +26
    -38
      src/layer/convolutiondepthwise.cpp
  8. +90
    -95
      src/layer/x86/convolutiondepthwise_x86.cpp
  9. +8
    -0
      src/layer/x86/convolutiondepthwise_x86.h

+ 188
- 1
src/layer/arm/convolution_1x1_int8.h View File

@@ -16,6 +16,91 @@
#include <arm_neon.h>
#endif // __ARM_NEON

#if __aarch64__
static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const float *kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out0 = top_blob.channel(p);

out0.fill(0);

int q = 0;

for (; q+7<inch; q+=8)
{
int* outptr0 = out0;

const signed char *kernel0 = (const signed char *)kernel + p * inch + q;

const signed char *r0 = bottom_blob.channel(q);
const signed char *r1 = bottom_blob.channel(q + 1);
const signed char *r2 = bottom_blob.channel(q + 2);
const signed char *r3 = bottom_blob.channel(q + 3);
const signed char *r4 = bottom_blob.channel(q + 4);
const signed char *r5 = bottom_blob.channel(q + 5);
const signed char *r6 = bottom_blob.channel(q + 6);
const signed char *r7 = bottom_blob.channel(q + 7);

int size = outw * outh;
int remain = size;

for (; remain > 0; remain--)
{
//ToDo Neon
int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
(int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
(int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
(int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];

*outptr0 += sum0;

r0++;
r1++;
r2++;
r3++;
r4++;
r5++;
r6++;
r7++;
outptr0++;
}
}

for (; q<inch; q++)
{
int* outptr0 = out0;

const signed char *r0 = bottom_blob.channel(q);

const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
const signed char k0 = kernel0[0];

int size = outw * outh;
int remain = size;

for (; remain > 0; remain--)
{
int sum0 = (int)(*r0) * (int)k0;

*outptr0 += sum0;

r0++;
outptr0++;
}
}
}
}
#else // __aarch64__
/*
* Convolution 1x1 quantized with int8,unroll 8 x 4
*/
@@ -1366,7 +1451,7 @@ static void conv1x1s1_neon_s8_left4(const Mat& bottom_blob, Mat& top_blob, const
}
}

static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
int size = top_blob.h * top_blob.w;
int remain = size & 7;
@@ -1391,3 +1476,105 @@ static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const

return;
}
#endif // __aarch64__

static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int tailstep = w - 2*outw + w;
const signed char *kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out0 = top_blob.channel(p);

out0.fill(0);

int q = 0;

for (; q+7<inch; q+=8)
{
int* outptr0 = out0;

const signed char *kernel0 = (const signed char *)kernel + p * inch + q;

const signed char *r0 = bottom_blob.channel(q);
const signed char *r1 = bottom_blob.channel(q + 1);
const signed char *r2 = bottom_blob.channel(q + 2);
const signed char *r3 = bottom_blob.channel(q + 3);
const signed char *r4 = bottom_blob.channel(q + 4);
const signed char *r5 = bottom_blob.channel(q + 5);
const signed char *r6 = bottom_blob.channel(q + 6);
const signed char *r7 = bottom_blob.channel(q + 7);

for(int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
//ToDo Neon
int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
(int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
(int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
(int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];

*outptr0 += sum0;

r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;
r4 += 2;
r5 += 2;
r6 += 2;
r7 += 2;
outptr0++;
}

r0 += tailstep;
r1 += tailstep;
r2 += tailstep;
r3 += tailstep;
r4 += tailstep;
r5 += tailstep;
r6 += tailstep;
r7 += tailstep;
}
}

for (; q<inch; q++)
{
int* outptr0 = out0;

const signed char *r0 = bottom_blob.channel(q);

const signed char *kernel0 = (const signed char *)kernel + p * inch + q;

for(int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
//ToDo Neon
int sum0 = (int)*r0 * (int)kernel0[0];

*outptr0 += sum0;

r0 += 2;
outptr0++;
}

r0 += tailstep;
}
}
}
}

+ 149
- 0
src/layer/arm/convolution_3x3_int8.h View File

@@ -0,0 +1,149 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
//int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const signed char *kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out0 = top_blob.channel(p);

out0.fill(0);

const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;

for (int q = 0; q < inch; q++)
{
int *outptr0 = out0;

const signed char *img0 = bottom_blob.channel(q);

const signed char *r0 = img0;
const signed char *r1 = img0 + w;
const signed char *r2 = img0 + w * 2;

for (int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
int sum0 = 0;

sum0 += (int)r0[0] * kernel0[0];
sum0 += (int)r0[1] * kernel0[1];
sum0 += (int)r0[2] * kernel0[2];
sum0 += (int)r1[0] * kernel0[3];
sum0 += (int)r1[1] * kernel0[4];
sum0 += (int)r1[2] * kernel0[5];
sum0 += (int)r2[0] * kernel0[6];
sum0 += (int)r2[1] * kernel0[7];
sum0 += (int)r2[2] * kernel0[8];

*outptr0 += sum0;

r0++;
r1++;
r2++;
outptr0++;
}

r0 += 2;
r1 += 2;
r2 += 2;
}

kernel0 += 9;
}
}
}

static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
//int h = bottom_blob.h;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int tailstep = w - 2 * outw + w;

const signed char *kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out0 = top_blob.channel(p);

out0.fill(0);

const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;

for (int q = 0; q < inch; q++)
{
int *outptr0 = out0;

const signed char *img0 = bottom_blob.channel(q);

const signed char *r0 = img0;
const signed char *r1 = img0 + w;
const signed char *r2 = img0 + w * 2;

for (int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
int sum0 = 0;

sum0 += (int)r0[0] * (int)kernel0[0];
sum0 += (int)r0[1] * (int)kernel0[1];
sum0 += (int)r0[2] * (int)kernel0[2];
sum0 += (int)r1[0] * (int)kernel0[3];
sum0 += (int)r1[1] * (int)kernel0[4];
sum0 += (int)r1[2] * (int)kernel0[5];
sum0 += (int)r2[0] * (int)kernel0[6];
sum0 += (int)r2[1] * (int)kernel0[7];
sum0 += (int)r2[2] * (int)kernel0[8];

*outptr0 += sum0;

r0 += 2;
r1 += 2;
r2 += 2;
outptr0++;
}

r0 += tailstep;
r1 += tailstep;
r2 += tailstep;
}

kernel0 += 9;
}
}
}

+ 50
- 20
src/layer/arm/convolution_arm.cpp View File

@@ -23,9 +23,8 @@ namespace ncnn {
#include "convolution_5x5.h"
#include "convolution_7x7.h"

#if !__aarch64__
#include "convolution_1x1_int8.h"
#endif // !__aarch64__
#include "convolution_3x3_int8.h"

DEFINE_LAYER_CREATOR(Convolution_arm)

@@ -221,19 +220,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
return Convolution::forward(bottom_blob, top_blob, opt);
}

#if __aarch64__
if (use_int8_inference)
{
// TODO
return Convolution::forward(bottom_blob, top_blob, opt);
}
#else
if (use_int8_inference && (kernel_size != 1 || stride != 1))
{
return Convolution::forward(bottom_blob, top_blob, opt);
}
#endif

typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

// kernel_size x stride
@@ -283,10 +269,58 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
} // kernel_size = 7
};

typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);

// kernel_size x stride
conv_int8_func conv_int8_func_table[5][5] =
{
{
conv1x1s1_int8_neon,
conv1x1s2_int8_neon,
0,
0,
0
}, // kernel_size = 1
{
0,
0,
0,
0,
0
}, // kernel_size = 2
{
conv3x3s1_int8_neon,
conv3x3s2_int8_neon,
0,
0,
0
}, // kernel_size = 3
{
0,
0,
0,
0,
0
}, // kernel_size = 4
{
0,
0,
0,
0,
0
} // kernel_size = 5
};

conv_func conv = 0;
conv_int8_func conv_int8 = 0;

if (use_int8_inference)
{
conv_int8 = conv_int8_func_table[kernel_size-1][stride-1];
if (!conv_int8)
{
return Convolution::forward(bottom_blob, top_blob, opt);
}
}
else
{
@@ -339,11 +373,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
if (top_blob.empty())
return -100;

#if !__aarch64__
if (use_int8_inference)
{
// kernel_size = 1
// stride = 1
Mat bottom_blob_bordered_int8;
bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
if (bottom_blob_bordered_int8.empty())
@@ -359,7 +390,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
}

conv1x1s1_neon_s8_inter(bottom_blob_bordered_int8, top_blob, weight_data, opt);
conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt);

// dequantize, reverse scale inplace
{
@@ -382,7 +413,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option

return 0;
}
#endif

if (use_winograd3x3 && w <= 120 && h <= 120)
{


+ 142
- 0
src/layer/arm/convolutiondepthwise_3x3_int8.h View File

@@ -0,0 +1,142 @@
// SenseNets is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
//int h = bottom_blob.h;
//int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const signed char *kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out = top_blob.channel(p);

out.fill(0);

const signed char *kernel0 = (const signed char *)kernel + p * 9;

int *outptr = out;

const signed char *img0 = bottom_blob.channel(p);

const signed char *r0 = img0;
const signed char *r1 = img0 + w;
const signed char *r2 = img0 + w * 2;

int i = 0;
for (; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{

int sum = 0;

sum += (int)r0[0] * (int)kernel0[0];
sum += (int)r0[1] * (int)kernel0[1];
sum += (int)r0[2] * (int)kernel0[2];
sum += (int)r1[0] * (int)kernel0[3];
sum += (int)r1[1] * (int)kernel0[4];
sum += (int)r1[2] * (int)kernel0[5];
sum += (int)r2[0] * (int)kernel0[6];
sum += (int)r2[1] * (int)kernel0[7];
sum += (int)r2[2] * (int)kernel0[8];

*outptr += sum;

r0++;
r1++;
r2++;
outptr++;
}

r0 += 2;
r1 += 2;
r2 += 2;
}
}
}

static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
{
int w = bottom_blob.w;
//int h = bottom_blob.h;
//int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int tailstep = w - 2 * outw + w;

const signed char *kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out = top_blob.channel(p);
out.fill(0);

const signed char *kernel0 = (const signed char *)kernel + p * 9;

int *outptr = out;

const signed char *img0 = bottom_blob.channel(p);

const signed char *r0 = img0;
const signed char *r1 = img0 + w;
const signed char *r2 = img0 + w * 2;

int i = 0;

for (; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
int sum = 0;

sum += (int)r0[0] * (int)kernel0[0];
sum += (int)r0[1] * (int)kernel0[1];
sum += (int)r0[2] * (int)kernel0[2];
sum += (int)r1[0] * (int)kernel0[3];
sum += (int)r1[1] * (int)kernel0[4];
sum += (int)r1[2] * (int)kernel0[5];
sum += (int)r2[0] * (int)kernel0[6];
sum += (int)r2[1] * (int)kernel0[7];
sum += (int)r2[2] * (int)kernel0[8];

*outptr += sum;

r0 += 2;
r1 += 2;
r2 += 2;
outptr++;
}

r0 += tailstep;
r1 += tailstep;
r2 += tailstep;
}
}
}

+ 86
- 59
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -24,6 +24,8 @@ namespace ncnn {

#include "convolutiondepthwise_3x3.h"

#include "convolutiondepthwise_3x3_int8.h"

DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm)

ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
@@ -56,7 +58,13 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
if (channels == group && group == num_output)
{
// depth-wise specific
return 0;
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
{
if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
{
return 0;
}
}
}

const int channels_g = channels / group;
@@ -66,7 +74,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)

for (int g=0; g<group; g++)
{
Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
@@ -87,6 +95,12 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
pd.set(5, bias_term);
pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

if (use_int8_inference)
{
pd.set(8, weight_data_int8_scales[g]);
pd.set(9, bottom_blob_int8_scales[g]);
}

op->load_param(pd);

// set weights
@@ -107,12 +121,6 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
// convolv with NxN kernel
// value = value + bias

if (use_int8_inference)
{
// TODO
return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
@@ -159,75 +167,91 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
if (top_blob.empty())
return -100;

const int maxk = kernel_w * kernel_h;

// depth-wise
if (channels == group && group == num_output)
{
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
if (use_int8_inference)
{
if (stride_w == 1 && stride_h == 1)
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
{
convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
{
Mat bottom_blob_bordered_int8;
bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
if (bottom_blob_bordered_int8.empty())
return -100;

// quantize, scale and round to nearest
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;

const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g);
Mat bottom_blob_bordered_int8_g = bottom_blob_bordered_int8.channel(g);
quantize_ops[g]->forward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g);
}

if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt);
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt);
}

// dequantize, reverse scale inplace
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

Mat top_blob_g = top_blob.channel(g);
dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
}

return 0;
}
}
else if (stride_w == 2 && stride_h == 2)
}
else
{
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
{
convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
}
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
return 0;
}
}
}

#ifdef _OPENMP
int nested_current = omp_get_nested();
omp_set_nested(0);
#endif

#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator);
Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

// call Convolution
ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

// set param
ncnn::ParamDict pd;
pd.set(0, 1);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk);// weight_data_size

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(ModelBinFromMatArray(weights));

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt);
const ncnn::Layer* op = group_ops[g];

delete op;
ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
}

#ifdef _OPENMP
omp_set_nested(nested_current);
#endif
return 0;
}

@@ -241,8 +265,11 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con

const ncnn::Layer* op = group_ops[g];

ncnn::Option opt_g = opt;
opt_g.blob_allocator = top_blob.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt);
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
}

return 0;


+ 3
- 0
src/layer/convolution.cpp View File

@@ -53,6 +53,9 @@ int Convolution::load_param(const ParamDict& pd)

use_int8_inference = pd.use_int8_inference;

if (weight_data_int8_scale == 0.f || bottom_blob_int8_scale == 0.f)
use_int8_inference = false;

return 0;
}



+ 26
- 38
src/layer/convolutiondepthwise.cpp View File

@@ -64,6 +64,9 @@ int ConvolutionDepthWise::load_param(const ParamDict& pd)
return -100;
}

if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty())
use_int8_inference = false;

// extend group if only one provided
if (weight_data_int8_scales.w == 1)
{
@@ -121,7 +124,30 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
for (int g=0; g<group; g++)
{
quantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Quantize);

ncnn::ParamDict pd;
pd.set(0, bottom_blob_int8_scales[g]);// scale

quantize_ops[g]->load_param(pd);
}

for (int g=0; g<group; g++)
{
dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);

float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, 1);// bias_data_size

dequantize_ops[g]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = Mat(1, (void*)((const float*)bias_data + g));

dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
}
}

@@ -252,11 +278,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
{
// quantize, scale and round to nearest
{
ncnn::ParamDict pd;
pd.set(0, bottom_blob_int8_scales[g]);// scale

quantize_ops[g]->load_param(pd);

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -293,20 +314,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O

// dequantize, reverse scale inplace
{
float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, 1);// bias_data_size

dequantize_ops[g]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = Mat(1, (void*)((const float*)bias_data + g));

dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;
@@ -325,11 +332,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::ParamDict pd;
pd.set(0, bottom_blob_int8_scales[g]);// scale

quantize_ops[g]->load_param(pd);

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -387,20 +389,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, num_output_g);// bias_data_size

dequantize_ops[g]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;


+ 90
- 95
src/layer/x86/convolutiondepthwise_x86.cpp View File

@@ -28,6 +28,94 @@ namespace ncnn {

DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)

ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
{
}

ConvolutionDepthWise_x86::~ConvolutionDepthWise_x86()
{
for (int i=0; i<(int)group_ops.size(); i++)
delete group_ops[i];

group_ops.clear();
}

int ConvolutionDepthWise_x86::load_model(const ModelBin& mb)
{
int ret = ConvolutionDepthWise::load_model(mb);
if (ret != 0)
return ret;

// create Convolution op for each group
const int maxk = kernel_w * kernel_h;
int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

for (int i=0; i<(int)group_ops.size(); i++)
delete group_ops[i];

group_ops.clear();

if (channels == group && group == num_output)
{
// depth-wise specific
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
{
if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
{
return 0;
}
}
}

const int channels_g = channels / group;
const int num_output_g = num_output / group;

group_ops.resize(group);

for (int g=0; g<group; g++)
{
Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

// set param
ncnn::ParamDict pd;
pd.set(0, num_output_g);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

if (use_int8_inference)
{
pd.set(8, weight_data_int8_scales[g]);
pd.set(9, bottom_blob_int8_scales[g]);
}

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(ModelBinFromMatArray(weights));

group_ops[g] = op;
}

return 0;
}

int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
// convolv with NxN kernel
@@ -79,8 +167,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
if (top_blob.empty())
return -100;

const int maxk = kernel_w * kernel_h;

// depth-wise
if (channels == group && group == num_output)
{
@@ -99,11 +185,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
ncnn::ParamDict pd;
pd.set(0, bottom_blob_int8_scales[g]);// scale

quantize_ops[g]->load_param(pd);

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -126,20 +207,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
#pragma omp parallel for num_threads(opt.num_threads)
for (int g=0; g<group; g++)
{
float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

ncnn::ParamDict pd;
pd.set(0, top_rescale);// scale
pd.set(1, bias_term);// bias_term
pd.set(2, 1);// bias_data_size

dequantize_ops[g]->load_param(pd);

ncnn::Mat weights[1];
weights[0] = Mat(1, (void*)((const float*)bias_data + g));

dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
opt_g.blob_allocator = top_blob.allocator;
@@ -174,42 +241,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
{
Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator);
Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize);
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

// call Convolution
ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

// set param
ncnn::ParamDict pd;
pd.set(0, 1);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk);// weight_data_size

if (use_int8_inference)
{
pd.set(8, weight_data_int8_scales[g]);
pd.set(9, bottom_blob_int8_scales[g]);
}

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(ModelBinFromMatArray(weights));
const ncnn::Layer* op = group_ops[g];

ncnn::Option opt_g = opt;
opt_g.num_threads = 1;
@@ -217,8 +250,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);

delete op;
}

return 0;
@@ -231,50 +262,14 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
{
Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g));
Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g), top_blob.elemsize, top_blob.allocator);
Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
Mat bias_data_g;
if (bias_term)
bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

// call Convolution
ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

// set param
ncnn::ParamDict pd;
pd.set(0, num_output_g);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

if (use_int8_inference)
{
pd.set(8, weight_data_int8_scales[g]);
pd.set(9, bottom_blob_int8_scales[g]);
}

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(ModelBinFromMatArray(weights));
const ncnn::Layer* op = group_ops[g];

ncnn::Option opt_g = opt;
opt_g.blob_allocator = top_blob.allocator;

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);

delete op;
}

return 0;


+ 8
- 0
src/layer/x86/convolutiondepthwise_x86.h View File

@@ -22,7 +22,15 @@ namespace ncnn {
class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_x86();
virtual ~ConvolutionDepthWise_x86();

virtual int load_model(const ModelBin& mb);

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
std::vector<ncnn::Layer*> group_ops;
};

} // namespace ncnn


Loading…
Cancel
Save