arm optimization for convolution int8 gemm unified elempack (#5016)

2 years ago · c8662cce5e
--- a/src/layer/arm/convolution_1x1_int8.h
+++ b/src/layer/arm/convolution_1x1_int8.h
@@ -1,83 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv1x1s2_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = w - 2 * outw + w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* r0 = bottom_blob.channel(p);
        signed char* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];
                outptr[2] = r0[4];
                outptr[3] = r0[6];

                r0 += 8;
                outptr += 4;
            }
            for (; j + 1 < outw; j += 2)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];

                r0 += 4;
                outptr += 2;
            }
            for (; j < outw; j++)
            {
                outptr[0] = r0[0];

                r0 += 2;
                outptr += 1;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_1x1_pack1to4_int8.h
+++ b/src/layer/arm/convolution_1x1_pack1to4_int8.h
@@ -1,83 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv1x1s2_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = w - 2 * outw + w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* r0 = bottom_blob.channel(p);
        signed char* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];
                outptr[2] = r0[4];
                outptr[3] = r0[6];

                r0 += 8;
                outptr += 4;
            }
            for (; j + 1 < outw; j += 2)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];

                r0 += 4;
                outptr += 2;
            }
            for (; j < outw; j++)
            {
                outptr[0] = r0[0];

                r0 += 2;
                outptr += 1;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_1x1_pack8to1_int8.h
+++ b/src/layer/arm/convolution_1x1_pack8to1_int8.h
@@ -1,90 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv1x1s2_sgemm_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 8;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* r0 = bottom_blob.channel(p);
        signed char* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                int8x8_t _v0 = vld1_s8(r0);
                int8x8_t _v1 = vld1_s8(r0 + 16);
                int8x8_t _v2 = vld1_s8(r0 + 32);
                int8x8_t _v3 = vld1_s8(r0 + 48);
                vst1_s8(outptr, _v0);
                vst1_s8(outptr + 8, _v1);
                vst1_s8(outptr + 16, _v2);
                vst1_s8(outptr + 24, _v3);

                r0 += 64;
                outptr += 32;
            }
            for (; j + 1 < outw; j += 2)
            {
                int8x8_t _v0 = vld1_s8(r0);
                int8x8_t _v1 = vld1_s8(r0 + 16);
                vst1_s8(outptr, _v0);
                vst1_s8(outptr + 8, _v1);

                r0 += 32;
                outptr += 16;
            }
            for (; j < outw; j++)
            {
                int8x8_t _v = vld1_s8(r0);
                vst1_s8(outptr, _v);

                r0 += 16;
                outptr += 8;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_1x1_pack8to4_int8.h
+++ b/src/layer/arm/convolution_1x1_pack8to4_int8.h
@@ -1,90 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv1x1s2_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 8;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* r0 = bottom_blob.channel(p);
        signed char* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                int8x8_t _v0 = vld1_s8(r0);
                int8x8_t _v1 = vld1_s8(r0 + 16);
                int8x8_t _v2 = vld1_s8(r0 + 32);
                int8x8_t _v3 = vld1_s8(r0 + 48);
                vst1_s8(outptr, _v0);
                vst1_s8(outptr + 8, _v1);
                vst1_s8(outptr + 16, _v2);
                vst1_s8(outptr + 24, _v3);

                r0 += 64;
                outptr += 32;
            }
            for (; j + 1 < outw; j += 2)
            {
                int8x8_t _v0 = vld1_s8(r0);
                int8x8_t _v1 = vld1_s8(r0 + 16);
                vst1_s8(outptr, _v0);
                vst1_s8(outptr + 8, _v1);

                r0 += 32;
                outptr += 16;
            }
            for (; j < outw; j++)
            {
                int8x8_t _v = vld1_s8(r0);
                vst1_s8(outptr, _v);

                r0 += 16;
                outptr += 8;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_3x3_pack1to4_int8.h
+++ b/src/layer/arm/convolution_3x3_pack1to4_int8.h
@@ -1,147 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv3x3s1_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = 9;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
    {
        const int gap = w - outw;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            signed char* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < 3; u++)
            {
                for (int v = 0; v < 3; v++)
                {
                    const signed char* sptr = img.row<const signed char>(u) + v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j + 3 < outw; j += 4)
                        {
                            ptr[0] = sptr[0];
                            ptr[1] = sptr[1];
                            ptr[2] = sptr[2];
                            ptr[3] = sptr[3];

                            sptr += 4;
                            ptr += 4;
                        }
                        for (; j + 1 < outw; j += 2)
                        {
                            ptr[0] = sptr[0];
                            ptr[1] = sptr[1];

                            sptr += 2;
                            ptr += 2;
                        }
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += 1;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv3x3s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = 9;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
    {
        const int gap = w * 2 - outw * 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            signed char* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < 3; u++)
            {
                for (int v = 0; v < 3; v++)
                {
                    const signed char* sptr = img.row<const signed char>(u) + v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j + 3 < outw; j += 4)
                        {
                            ptr[0] = sptr[0];
                            ptr[1] = sptr[2];
                            ptr[2] = sptr[4];
                            ptr[3] = sptr[6];

                            sptr += 8;
                            ptr += 4;
                        }
                        for (; j + 1 < outw; j += 2)
                        {
                            ptr[0] = sptr[0];
                            ptr[1] = sptr[2];

                            sptr += 4;
                            ptr += 2;
                        }
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += 2;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_7x7_pack1to4_int8.h
+++ b/src/layer/arm/convolution_7x7_pack1to4_int8.h
@@ -1,80 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv7x7s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = 49;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
    {
        const int gap = w * 2 - outw * 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            signed char* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < 7; u++)
            {
                for (int v = 0; v < 7; v++)
                {
                    const signed char* sptr = img.row<const signed char>(u) + v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j + 3 < outw; j += 4)
                        {
                            ptr[0] = sptr[0];
                            ptr[1] = sptr[2];
                            ptr[2] = sptr[4];
                            ptr[3] = sptr[6];

                            sptr += 8;
                            ptr += 4;
                        }
                        for (; j + 1 < outw; j += 2)
                        {
                            ptr[0] = sptr[0];
                            ptr[1] = sptr[2];

                            sptr += 4;
                            ptr += 2;
                        }
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += 2;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -48,10 +48,10 @@ namespace ncnn {
 #endif // NCNN_BF16

 #if NCNN_INT8
 #include "convolution_sgemm_int8.h"
 #include "convolution_im2col_gemm_int8.h"

 #include "convolution_winograd_transform_int8.h"
 #include "convolution_winograd_dot_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
 #include "convolution_int8.h"
 #endif // NCNN_INT8
@@ -74,19 +74,11 @@ namespace ncnn {
 #include "convolution_pack8to4_int8.h"
 #include "convolution_pack1to4_int8.h"
 #include "convolution_pack8to1_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
 #include "convolution_winograd_transform_pack4_int8.h"
 #include "convolution_winograd_transform_pack8_int8.h"
 #include "convolution_winograd_dot_pack8to4_int8.h"
 #include "convolution_winograd_dot_pack8to1_int8.h"
 #include "convolution_1x1_pack8to4_int8.h"
 #include "convolution_1x1_pack1to4_int8.h"
 #include "convolution_1x1_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"
 #include "convolution_3x3_pack1to4_int8.h"
 #include "convolution_7x7_pack1to4_int8.h"
 #include "convolution_3x3_pack8to1_int8.h"
 #endif // NCNN_INT8
 #endif // __ARM_NEON
@@ -1303,121 +1295,41 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
    }
 #endif // __ARM_NEON

 #if __ARM_NEON
    if (elempack == 8 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
 #if NCNN_ARM82DOT
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
    if (elempack == 8 && out_elempack == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
 #else
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    if (elempack == 8 && out_elempack == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
 #endif
        {
            conv3x3s1_winograd43_transform_kernel_pack8to4_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    {
 #if __ARM_NEON
        conv3x3s1_winograd43_transform_kernel_pack8to4_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
 #endif // __ARM_NEON
    }

    if (elempack == 1 && out_elempack == 4)
    else if (elempack == 8 && out_elempack == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
 #if __ARM_NEON
        conv3x3s1_winograd43_transform_kernel_pack8to1_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
 #endif // __ARM_NEON
    }

    if (elempack == 8 && out_elempack == 1)
    else if (elempack == 1 && out_elempack == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_pack8to1_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
        conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
    }
 #endif // __ARM_NEON

    if (elempack == 1 && out_elempack == 1)
    else if (opt.use_sgemm_convolution)
    {
        if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }

        /*    if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
        conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output);
        }
        else */

        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            weight_data_tm = weight_data;
        }
        convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
    }
    else if (elempack == 1 && out_elempack == 1)
    {
        // if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        // {
        //     conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output);
        // }
        weight_data_tm = weight_data;
    }
    else
    {
        convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }

    scale_in_data.create(num_output);
@@ -1515,114 +1427,55 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
    if (top_blob_int32.empty())
        return -100;

 #if __ARM_NEON
    if (elempack == 8 && out_elempack_int32 == 4)
    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
    }

 #if NCNN_ARM82DOT
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
    if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
 #else
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
 #endif
        {
            conv3x3s1_winograd43_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    {
 #if __ARM_NEON
        conv3x3s1_winograd43_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
 #endif // __ARM_NEON
    }

    if (elempack == 1 && out_elempack_int32 == 4)
    else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
 #if __ARM_NEON
        conv3x3s1_winograd43_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
 #endif // __ARM_NEON
    }

    if (elempack == 8 && out_elempack_int32 == 1)
    else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
    }
 #endif // __ARM_NEON

    if (elempack == 1 && out_elempack_int32 == 1)
    else if (opt.use_sgemm_convolution)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
    }
 #if __ARM_NEON
    else if (elempack == 8 && out_elempack_int32 == 4)
    {
        convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
    }
    else if (elempack == 1 && out_elempack_int32 == 4)
    {
        convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
    }
    else if (elempack == 8 && out_elempack_int32 == 1)
    {
        convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
    }
 #endif   // __ARM_NEON
    else // if (elempack == 1 && out_elempack_int32 == 1)
    {
        convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
    }

    if (use_int8_requantize)
--- a/src/layer/arm/convolution_arm_asimddp.cpp
+++ b/src/layer/arm/convolution_arm_asimddp.cpp
@@ -17,53 +17,17 @@

 namespace ncnn {

 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_im2col_gemm_int8.h"

 // pack1
 void im2col_sgemm_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 // gemm
 void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
 {
    im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt);
    convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
 {
    convolution_im2col_sgemm_transform_kernel_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
 }

 // pack1to4
 void im2col_sgemm_pack1to4_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
 }

 // pack8to1
 void im2col_sgemm_pack8to1_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
 }

 // pack8to4
 void im2col_sgemm_pack8to4_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
    convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
 }

 } // namespace ncnn
--- a/src/layer/arm/convolution_arm_i8mm.cpp
+++ b/src/layer/arm/convolution_arm_i8mm.cpp
@@ -17,53 +17,17 @@

 namespace ncnn {

 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_im2col_gemm_int8.h"

 // pack1
 void im2col_sgemm_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 // gemm
 void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
 {
    im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt);
    convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
 {
    convolution_im2col_sgemm_transform_kernel_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
 }

 // pack1to4
 void im2col_sgemm_pack1to4_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
 }

 // pack8to1
 void im2col_sgemm_pack8to1_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
 }

 // pack8to4
 void im2col_sgemm_pack8to4_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
    convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
 }

 } // namespace ncnn
--- a/src/layer/arm/convolution_im2col_gemm_int8.h
+++ b/src/layer/arm/convolution_im2col_gemm_int8.h
--- a/src/layer/arm/convolution_sgemm_int8.h
+++ b/src/layer/arm/convolution_sgemm_int8.h
--- a/src/layer/arm/convolution_sgemm_pack1to4_int8.h
+++ b/src/layer/arm/convolution_sgemm_pack1to4_int8.h
--- a/src/layer/arm/convolution_sgemm_pack8to1_int8.h
+++ b/src/layer/arm/convolution_sgemm_pack8to1_int8.h
--- a/src/layer/arm/convolution_sgemm_pack8to4_int8.h
+++ b/src/layer/arm/convolution_sgemm_pack8to4_int8.h
--- a/tests/test_convolution_3.cpp
+++ b/tests/test_convolution_3.cpp
@@ -310,7 +310,13 @@ static int test_convolution_1()
           || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0)
           || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1)
           || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0)
           || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0);
           || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0)
           || test_convolution_int8(5, 6, 31, 9, 5, 1, 1, 0, 1)
           || test_convolution_int8(5, 7, 32, 8, 5, 1, 2, 0, 1)
           || test_convolution_int8(16, 10, 31, 32, 2, 1, 3, 0, 0)
           || test_convolution_int8(5, 10, 5, 32, 3, 2, 1, 0, 1)
           || test_convolution_int8(3, 9, 16, 13, 2, 2, 1, 0, 0)
           || test_convolution_int8(33, 5, 15, 5, 2, 1, 3, 0, 1);
 }

 static int test_convolution_1_2()
@@ -326,65 +332,65 @@ static int test_convolution_1_2()
           || test_convolution_int8(19, 17, 32, 1, 3, 2, 2, 0, 0)

           || test_convolution_int8(19, 17, 1, 2, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 2, 2, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 2, 2, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 7, 2, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 8, 2, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 8, 2, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 15, 2, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 16, 2, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 16, 2, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 31, 2, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 32, 2, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 32, 2, 5, 2, 3, 0, 0)

           || test_convolution_int8(19, 17, 1, 7, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 1, 7, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 2, 7, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 7, 7, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 7, 7, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 8, 7, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 15, 7, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 15, 7, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 16, 7, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 31, 7, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 31, 7, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 32, 7, 5, 2, 2, 0, 0)

           || test_convolution_int8(19, 17, 1, 8, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 2, 8, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 2, 8, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 7, 8, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 8, 8, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 8, 8, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 15, 8, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 16, 8, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 16, 8, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 31, 8, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 32, 8, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 32, 8, 5, 2, 3, 0, 0)

           || test_convolution_int8(19, 17, 1, 15, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 1, 15, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 2, 15, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 7, 15, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 7, 15, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 8, 15, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 15, 15, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 15, 15, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 16, 15, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 31, 15, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 31, 15, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 32, 15, 5, 2, 2, 0, 0)

           || test_convolution_int8(19, 17, 1, 16, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 2, 16, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 1, 16, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 2, 16, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 7, 16, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 8, 16, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 15, 16, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 16, 16, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 31, 16, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 32, 16, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 31, 16, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 32, 16, 5, 2, 3, 0, 0)

           || test_convolution_int8(19, 17, 1, 31, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 2, 31, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 7, 31, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 2, 31, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 7, 31, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 8, 31, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 15, 31, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 16, 31, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 31, 31, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 16, 31, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 31, 31, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 32, 31, 5, 2, 2, 0, 0)

           || test_convolution_int8(19, 17, 1, 32, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 2, 32, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 7, 32, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 8, 32, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 15, 32, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 16, 32, 5, 2, 2, 0, 0)
           || test_convolution_int8(19, 17, 7, 32, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 8, 32, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 15, 32, 5, 2, 3, 0, 1)
           || test_convolution_int8(19, 17, 16, 32, 5, 2, 3, 0, 0)
           || test_convolution_int8(19, 17, 31, 32, 5, 2, 2, 0, 1)
           || test_convolution_int8(19, 17, 32, 32, 5, 2, 2, 0, 0);
 }