add arm int8 convolution stub, preload group op for x86

7 years ago · 2fe7ada4d8
--- a/src/layer/arm/convolution_1x1_int8.h
+++ b/src/layer/arm/convolution_1x1_int8.h
@@ -16,6 +16,91 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 #if __aarch64__
 static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float *kernel = _kernel;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        out0.fill(0);

        int q = 0;

        for (; q+7<inch; q+=8)
        {
            int* outptr0 = out0;

            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;

            const signed char *r0 = bottom_blob.channel(q);
            const signed char *r1 = bottom_blob.channel(q + 1);
            const signed char *r2 = bottom_blob.channel(q + 2);
            const signed char *r3 = bottom_blob.channel(q + 3);
            const signed char *r4 = bottom_blob.channel(q + 4);
            const signed char *r5 = bottom_blob.channel(q + 5);
            const signed char *r6 = bottom_blob.channel(q + 6);
            const signed char *r7 = bottom_blob.channel(q + 7);

            int size = outw * outh;
            int remain = size;

            for (; remain > 0; remain--)
            {
                //ToDo Neon
                int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
                           (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
                           (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
                           (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];

                *outptr0 += sum0;

                r0++;
                r1++;
                r2++;
                r3++;
                r4++;
                r5++;
                r6++;
                r7++;
                outptr0++;
            }
        }

        for (; q<inch; q++)
        {
            int* outptr0 = out0;

            const signed char *r0 = bottom_blob.channel(q);

            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
            const signed char k0 = kernel0[0];

            int size = outw * outh;
            int remain = size;

            for (; remain > 0; remain--)
            {
                int sum0 = (int)(*r0) * (int)k0;

                *outptr0 += sum0;

                r0++;
                outptr0++;
            }
        }
    }
 }
 #else // __aarch64__
 /*
 * Convolution 1x1 quantized with int8,unroll 8 x 4
 */
@@ -1366,7 +1451,7 @@ static void conv1x1s1_neon_s8_left4(const Mat& bottom_blob, Mat& top_blob, const
    }
 }

 static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
 static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
 {
    int size = top_blob.h * top_blob.w;
    int remain = size & 7;
@@ -1391,3 +1476,105 @@ static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const

    return;
 }
 #endif // __aarch64__

 static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2*outw + w;
    const signed char *kernel = _kernel;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        out0.fill(0);

        int q = 0;

        for (; q+7<inch; q+=8)
        {
            int* outptr0 = out0;

            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;

            const signed char *r0 = bottom_blob.channel(q);
            const signed char *r1 = bottom_blob.channel(q + 1);
            const signed char *r2 = bottom_blob.channel(q + 2);
            const signed char *r3 = bottom_blob.channel(q + 3);
            const signed char *r4 = bottom_blob.channel(q + 4);
            const signed char *r5 = bottom_blob.channel(q + 5);
            const signed char *r6 = bottom_blob.channel(q + 6);
            const signed char *r7 = bottom_blob.channel(q + 7);

            for(int i = 0; i < outh; i++)
            {
                int remain = outw;

                for (; remain > 0; remain--)
                {
                    //ToDo Neon
                    int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
                            (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
                            (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
                            (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];

                    *outptr0 += sum0;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    r4 += 2;
                    r5 += 2;
                    r6 += 2;
                    r7 += 2;
                    outptr0++;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
                r7 += tailstep;
            }
        }

        for (; q<inch; q++)
        {
            int* outptr0 = out0;

            const signed char *r0 = bottom_blob.channel(q);

            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;

            for(int i = 0; i < outh; i++)
            {
                int remain = outw;

                for (; remain > 0; remain--)
                {
                    //ToDo Neon
                    int sum0 = (int)*r0 * (int)kernel0[0];

                    *outptr0 += sum0;

                    r0 += 2;
                    outptr0++;
                }

                r0 += tailstep;
            }
        }
    }
 }
--- a/src/layer/arm/convolution_3x3_int8.h
+++ b/src/layer/arm/convolution_3x3_int8.h
@@ -0,0 +1,149 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    //int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const signed char *kernel = _kernel;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        out0.fill(0);

        const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            int *outptr0 = out0;

            const signed char *img0 = bottom_blob.channel(q);

            const signed char *r0 = img0;
            const signed char *r1 = img0 + w;
            const signed char *r2 = img0 + w * 2;

            for (int i = 0; i < outh; i++)
            {
                int remain = outw;

                for (; remain > 0; remain--)
                {
                    int sum0 = 0;

                    sum0 += (int)r0[0] * kernel0[0];
                    sum0 += (int)r0[1] * kernel0[1];
                    sum0 += (int)r0[2] * kernel0[2];
                    sum0 += (int)r1[0] * kernel0[3];
                    sum0 += (int)r1[1] * kernel0[4];
                    sum0 += (int)r1[2] * kernel0[5];
                    sum0 += (int)r2[0] * kernel0[6];
                    sum0 += (int)r2[1] * kernel0[7];
                    sum0 += (int)r2[2] * kernel0[8];

                    *outptr0 += sum0;

                    r0++;
                    r1++;
                    r2++;
                    outptr0++;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            kernel0 += 9;
        }
    }
 }

 static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    //int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const signed char *kernel = _kernel;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        out0.fill(0);

        const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            int *outptr0 = out0;

            const signed char *img0 = bottom_blob.channel(q);

            const signed char *r0 = img0;
            const signed char *r1 = img0 + w;
            const signed char *r2 = img0 + w * 2;

            for (int i = 0; i < outh; i++)
            {
                int remain = outw;

                for (; remain > 0; remain--)
                {
                    int sum0 = 0;

                    sum0 += (int)r0[0] * (int)kernel0[0];
                    sum0 += (int)r0[1] * (int)kernel0[1];
                    sum0 += (int)r0[2] * (int)kernel0[2];
                    sum0 += (int)r1[0] * (int)kernel0[3];
                    sum0 += (int)r1[1] * (int)kernel0[4];
                    sum0 += (int)r1[2] * (int)kernel0[5];
                    sum0 += (int)r2[0] * (int)kernel0[6];
                    sum0 += (int)r2[1] * (int)kernel0[7];
                    sum0 += (int)r2[2] * (int)kernel0[8];

                    *outptr0 += sum0;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0++;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            kernel0 += 9;
        }
    }
 }
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -23,9 +23,8 @@ namespace ncnn {
 #include "convolution_5x5.h"
 #include "convolution_7x7.h"

 #if !__aarch64__
 #include "convolution_1x1_int8.h"
 #endif // !__aarch64__
 #include "convolution_3x3_int8.h"

 DEFINE_LAYER_CREATOR(Convolution_arm)

@@ -221,19 +220,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
        return Convolution::forward(bottom_blob, top_blob, opt);
    }

 #if __aarch64__
    if (use_int8_inference)
    {
        // TODO
        return Convolution::forward(bottom_blob, top_blob, opt);
    }
 #else
    if (use_int8_inference && (kernel_size != 1 || stride != 1))
    {
        return Convolution::forward(bottom_blob, top_blob, opt);
    }
 #endif

    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

    // kernel_size x stride
@@ -283,10 +269,58 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
        }  // kernel_size = 7
    };

    typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);

    // kernel_size x stride
    conv_int8_func conv_int8_func_table[5][5] =
    {
        {
            conv1x1s1_int8_neon,
            conv1x1s2_int8_neon,
            0,
            0,
            0
        }, // kernel_size = 1
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 2
        {
            conv3x3s1_int8_neon,
            conv3x3s2_int8_neon,
            0,
            0,
            0
        }, // kernel_size = 3
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 4
        {
            0,
            0,
            0,
            0,
            0
        }  // kernel_size = 5
    };

    conv_func conv = 0;
    conv_int8_func conv_int8 = 0;

    if (use_int8_inference)
    {
        conv_int8 = conv_int8_func_table[kernel_size-1][stride-1];
        if (!conv_int8)
        {
            return Convolution::forward(bottom_blob, top_blob, opt);
        }
    }
    else
    {
@@ -339,11 +373,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    if (top_blob.empty())
        return -100;

 #if !__aarch64__
    if (use_int8_inference)
    {
        // kernel_size = 1
        // stride = 1
        Mat bottom_blob_bordered_int8;
        bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
        if (bottom_blob_bordered_int8.empty())
@@ -359,7 +390,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
            quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
        }

        conv1x1s1_neon_s8_inter(bottom_blob_bordered_int8, top_blob, weight_data, opt);
        conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt);

        // dequantize, reverse scale inplace
        {
@@ -382,7 +413,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option

        return 0;
    }
 #endif

    if (use_winograd3x3 && w <= 120 && h <= 120)
    {
--- a/src/layer/arm/convolutiondepthwise_3x3_int8.h
+++ b/src/layer/arm/convolutiondepthwise_3x3_int8.h
@@ -0,0 +1,142 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    //int h = bottom_blob.h;
    //int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const signed char *kernel = _kernel;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        out.fill(0);

        const signed char *kernel0 = (const signed char *)kernel + p * 9;

        int *outptr = out;

        const signed char *img0 = bottom_blob.channel(p);

        const signed char *r0 = img0;
        const signed char *r1 = img0 + w;
        const signed char *r2 = img0 + w * 2;

        int i = 0;
        for (; i < outh; i++)
        {
            int remain = outw;

            for (; remain > 0; remain--)
            {

                int sum = 0;

                sum += (int)r0[0] * (int)kernel0[0];
                sum += (int)r0[1] * (int)kernel0[1];
                sum += (int)r0[2] * (int)kernel0[2];
                sum += (int)r1[0] * (int)kernel0[3];
                sum += (int)r1[1] * (int)kernel0[4];
                sum += (int)r1[2] * (int)kernel0[5];
                sum += (int)r2[0] * (int)kernel0[6];
                sum += (int)r2[1] * (int)kernel0[7];
                sum += (int)r2[2] * (int)kernel0[8];

                *outptr += sum;

                r0++;
                r1++;
                r2++;
                outptr++;
            }

            r0 += 2;
            r1 += 2;
            r2 += 2;
        }
    }
 }

 static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    //int h = bottom_blob.h;
    //int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const signed char *kernel = _kernel;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);
        out.fill(0);

        const signed char *kernel0 = (const signed char *)kernel + p * 9;

        int *outptr = out;

        const signed char *img0 = bottom_blob.channel(p);

        const signed char *r0 = img0;
        const signed char *r1 = img0 + w;
        const signed char *r2 = img0 + w * 2;

        int i = 0;

        for (; i < outh; i++)
        {
            int remain = outw;

            for (; remain > 0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * (int)kernel0[0];
                sum += (int)r0[1] * (int)kernel0[1];
                sum += (int)r0[2] * (int)kernel0[2];
                sum += (int)r1[0] * (int)kernel0[3];
                sum += (int)r1[1] * (int)kernel0[4];
                sum += (int)r1[2] * (int)kernel0[5];
                sum += (int)r2[0] * (int)kernel0[6];
                sum += (int)r2[1] * (int)kernel0[7];
                sum += (int)r2[2] * (int)kernel0[8];

                *outptr += sum;

                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
 }
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -24,6 +24,8 @@ namespace ncnn {

 #include "convolutiondepthwise_3x3.h"

 #include "convolutiondepthwise_3x3_int8.h"

 DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm)

 ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
@@ -56,7 +58,13 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
    if (channels == group && group == num_output)
    {
        // depth-wise specific
        return 0;
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
        {
            if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
            {
                return 0;
            }
        }
    }

    const int channels_g = channels / group;
@@ -66,7 +74,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)

    for (int g=0; g<group; g++)
    {
        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
@@ -87,6 +95,12 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

        if (use_int8_inference)
        {
            pd.set(8, weight_data_int8_scales[g]);
            pd.set(9, bottom_blob_int8_scales[g]);
        }

        op->load_param(pd);

        // set weights
@@ -107,12 +121,6 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
    // convolv with NxN kernel
    // value = value + bias

    if (use_int8_inference)
    {
        // TODO
        return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
@@ -159,75 +167,91 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels == group && group == num_output)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
        if (use_int8_inference)
        {
            if (stride_w == 1 && stride_h == 1)
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
            {
                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                return 0;
                if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
                {
                    Mat bottom_blob_bordered_int8;
                    bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
                    if (bottom_blob_bordered_int8.empty())
                        return -100;

                    // quantize, scale and round to nearest
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int g=0; g<group; g++)
                    {
                        ncnn::Option opt_g = opt;
                        opt_g.num_threads = 1;
                        opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;

                        const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g);
                        Mat bottom_blob_bordered_int8_g = bottom_blob_bordered_int8.channel(g);
                        quantize_ops[g]->forward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g);
                    }

                    if (stride_w == 1 && stride_h == 1)
                    {
                        convdw3x3s1_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt);
                    }
                    else if (stride_w == 2 && stride_h == 2)
                    {
                        convdw3x3s2_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt);
                    }

                    // dequantize, reverse scale inplace
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int g=0; g<group; g++)
                    {
                        ncnn::Option opt_g = opt;
                        opt_g.num_threads = 1;
                        opt_g.blob_allocator = top_blob.allocator;

                        Mat top_blob_g = top_blob.channel(g);
                        dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
                    }

                    return 0;
                }
            }
            else if (stride_w == 2 && stride_h == 2)
        }
        else
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
            {
                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                return 0;
                if (stride_w == 1 && stride_h == 1)
                {
                    convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                    return 0;
                }
                else if (stride_w == 2 && stride_h == 2)
                {
                    convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
                    return 0;
                }
            }
        }

 #ifdef _OPENMP
        int nested_current = omp_get_nested();
        omp_set_nested(0);
 #endif

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g=0; g<group; g++)
        {
            Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
            Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator);
            Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

            // call Convolution
            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, 1);// num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);// pad_w
            pd.set(14, 0);// pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk);// weight_data_size

            op->load_param(pd);

            // set weights
            ncnn::Mat weights[2];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

            op->load_model(ModelBinFromMatArray(weights));

            // forward
            op->forward(bottom_blob_bordered_g, top_blob_g, opt);
            const ncnn::Layer* op = group_ops[g];

            delete op;
            ncnn::Option opt_g = opt;
            opt_g.num_threads = 1;
            opt_g.blob_allocator = top_blob.allocator;

            // forward
            op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        }

 #ifdef _OPENMP
        omp_set_nested(nested_current);
 #endif
        return 0;
    }

@@ -241,8 +265,11 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con

        const ncnn::Layer* op = group_ops[g];

        ncnn::Option opt_g = opt;
        opt_g.blob_allocator = top_blob.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt);
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    return 0;
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -53,6 +53,9 @@ int Convolution::load_param(const ParamDict& pd)

    use_int8_inference = pd.use_int8_inference;

    if (weight_data_int8_scale == 0.f || bottom_blob_int8_scale == 0.f)
        use_int8_inference = false;

    return 0;
 }

--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -64,6 +64,9 @@ int ConvolutionDepthWise::load_param(const ParamDict& pd)
        return -100;
    }

    if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty())
        use_int8_inference = false;

    // extend group if only one provided
    if (weight_data_int8_scales.w == 1)
    {
@@ -121,7 +124,30 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
        for (int g=0; g<group; g++)
        {
            quantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Quantize);

            ncnn::ParamDict pd;
            pd.set(0, bottom_blob_int8_scales[g]);// scale

            quantize_ops[g]->load_param(pd);
        }

        for (int g=0; g<group; g++)
        {
            dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);

            float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

            ncnn::ParamDict pd;
            pd.set(0, top_rescale);// scale
            pd.set(1, bias_term);// bias_term
            pd.set(2, 1);// bias_data_size

            dequantize_ops[g]->load_param(pd);

            ncnn::Mat weights[1];
            weights[0] = Mat(1, (void*)((const float*)bias_data + g));

            dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
        }
    }

@@ -252,11 +278,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
            {
                // quantize, scale and round to nearest
                {
                    ncnn::ParamDict pd;
                    pd.set(0, bottom_blob_int8_scales[g]);// scale

                    quantize_ops[g]->load_param(pd);

                    ncnn::Option opt_g = opt;
                    opt_g.num_threads = 1;
                    opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -293,20 +314,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O

                // dequantize, reverse scale inplace
                {
                    float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                    ncnn::ParamDict pd;
                    pd.set(0, top_rescale);// scale
                    pd.set(1, bias_term);// bias_term
                    pd.set(2, 1);// bias_data_size

                    dequantize_ops[g]->load_param(pd);

                    ncnn::Mat weights[1];
                    weights[0] = Mat(1, (void*)((const float*)bias_data + g));

                    dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));

                    ncnn::Option opt_g = opt;
                    opt_g.num_threads = 1;
                    opt_g.blob_allocator = top_blob.allocator;
@@ -325,11 +332,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g=0; g<group; g++)
            {
                ncnn::ParamDict pd;
                pd.set(0, bottom_blob_int8_scales[g]);// scale

                quantize_ops[g]->load_param(pd);

                ncnn::Option opt_g = opt;
                opt_g.num_threads = 1;
                opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -387,20 +389,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g=0; g<group; g++)
            {
                float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                ncnn::ParamDict pd;
                pd.set(0, top_rescale);// scale
                pd.set(1, bias_term);// bias_term
                pd.set(2, num_output_g);// bias_data_size

                dequantize_ops[g]->load_param(pd);

                ncnn::Mat weights[1];
                weights[0] = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

                dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));

                ncnn::Option opt_g = opt;
                opt_g.num_threads = 1;
                opt_g.blob_allocator = top_blob.allocator;
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -28,6 +28,94 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)

 ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
 {
 }

 ConvolutionDepthWise_x86::~ConvolutionDepthWise_x86()
 {
    for (int i=0; i<(int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();
 }

 int ConvolutionDepthWise_x86::load_model(const ModelBin& mb)
 {
    int ret = ConvolutionDepthWise::load_model(mb);
    if (ret != 0)
        return ret;

    // create Convolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i=0; i<(int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    if (channels == group && group == num_output)
    {
        // depth-wise specific
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
        {
            if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
            {
                return 0;
            }
        }
    }

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g=0; g<group; g++)
    {
        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g);// num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);// pad_w
        pd.set(14, 0);// pad_h
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

        if (use_int8_inference)
        {
            pd.set(8, weight_data_int8_scales[g]);
            pd.set(9, bottom_blob_int8_scales[g]);
        }

        op->load_param(pd);

        // set weights
        ncnn::Mat weights[2];
        weights[0] = weight_data_g;
        weights[1] = bias_data_g;

        op->load_model(ModelBinFromMatArray(weights));

        group_ops[g] = op;
    }

    return 0;
 }

 int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
@@ -79,8 +167,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels == group && group == num_output)
    {
@@ -99,11 +185,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int g=0; g<group; g++)
                    {
                        ncnn::ParamDict pd;
                        pd.set(0, bottom_blob_int8_scales[g]);// scale

                        quantize_ops[g]->load_param(pd);

                        ncnn::Option opt_g = opt;
                        opt_g.num_threads = 1;
                        opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -126,20 +207,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int g=0; g<group; g++)
                    {
                        float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        ncnn::ParamDict pd;
                        pd.set(0, top_rescale);// scale
                        pd.set(1, bias_term);// bias_term
                        pd.set(2, 1);// bias_data_size

                        dequantize_ops[g]->load_param(pd);

                        ncnn::Mat weights[1];
                        weights[0] = Mat(1, (void*)((const float*)bias_data + g));

                        dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));

                        ncnn::Option opt_g = opt;
                        opt_g.num_threads = 1;
                        opt_g.blob_allocator = top_blob.allocator;
@@ -174,42 +241,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
        {
            Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
            Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator);
            Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize);
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

            // call Convolution
            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, 1);// num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);// pad_w
            pd.set(14, 0);// pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk);// weight_data_size

            if (use_int8_inference)
            {
                pd.set(8, weight_data_int8_scales[g]);
                pd.set(9, bottom_blob_int8_scales[g]);
            }

            op->load_param(pd);

            // set weights
            ncnn::Mat weights[2];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

            op->load_model(ModelBinFromMatArray(weights));
            const ncnn::Layer* op = group_ops[g];

            ncnn::Option opt_g = opt;
            opt_g.num_threads = 1;
@@ -217,8 +250,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con

            // forward
            op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);

            delete op;
        }

        return 0;
@@ -231,50 +262,14 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
    {
        Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g));
        Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g), top_blob.elemsize, top_blob.allocator);
        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

        // call Convolution
        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g);// num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);// pad_w
        pd.set(14, 0);// pad_h
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

        if (use_int8_inference)
        {
            pd.set(8, weight_data_int8_scales[g]);
            pd.set(9, bottom_blob_int8_scales[g]);
        }

        op->load_param(pd);

        // set weights
        ncnn::Mat weights[2];
        weights[0] = weight_data_g;
        weights[1] = bias_data_g;

        op->load_model(ModelBinFromMatArray(weights));
        const ncnn::Layer* op = group_ops[g];

        ncnn::Option opt_g = opt;
        opt_g.blob_allocator = top_blob.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);

        delete op;
    }

    return 0;
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -22,7 +22,15 @@ namespace ncnn {
 class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
 {
 public:
    ConvolutionDepthWise_x86();
    virtual ~ConvolutionDepthWise_x86();

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    std::vector<ncnn::Layer*> group_ops;
 };

 } // namespace ncnn