mips msa optimization for convolution int8 (#3675)

* basic mips msa optimization for convolution int8 * mips msa optimization for convolution int8 gemm * mips msa optimization for convolution int8 winograd pack8to4/pack8to1 * mention msa maddv/msubv intrinsics bug
4 years ago · c09d7b3591
--- a/docs/how-to-build/how-to-build.md
+++ b/docs/how-to-build/how-to-build.md
@@ -578,12 +578,38 @@ You can upload binary inside `build-c906/examples` folder and run on D1 board fo

 ### Build for Loongson 2K1000

 For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd bug.
 For gcc version < 8.5, you need to patch the msa.h header to work around the msa fmadd/fmsub/maddv/msubv bug.

 Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd_w``` and apply changes as the following
 Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub```, and apply the following changes:
 ```c
 // #define __msa_fmadd_w __builtin_msa_fmadd_w
 // #define __msa_fmadd_d __builtin_msa_fmadd_d
 // #define __msa_fmsub_w __builtin_msa_fmsub_w
 // #define __msa_fmsub_d __builtin_msa_fmsub_d
 #define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a)
 #define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a)
 #define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a)
 #define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a)
 ```

 Then find ```__msa_maddv``` and ```__msa_msubv``` and apply the following changes:
 ```c
 // #define __msa_maddv_b __builtin_msa_maddv_b
 // #define __msa_maddv_h __builtin_msa_maddv_h
 // #define __msa_maddv_w __builtin_msa_maddv_w
 // #define __msa_maddv_d __builtin_msa_maddv_d
 // #define __msa_msubv_b __builtin_msa_msubv_b
 // #define __msa_msubv_h __builtin_msa_msubv_h
 // #define __msa_msubv_w __builtin_msa_msubv_w
 // #define __msa_msubv_d __builtin_msa_msubv_d
 #define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a)
 #define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a)
 #define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a)
 #define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a)
 #define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a)
 #define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a)
 #define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a)
 #define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a)
 ```

 Build ncnn with mips msa and simpleocv enabled:
--- a/src/layer/mips/convolution1d_mips.cpp
+++ b/src/layer/mips/convolution1d_mips.cpp
@@ -253,7 +253,7 @@ int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
                        }
                    }

                    sum += __msa_fhadd_w(_sum);
                    sum += __msa_reduce_fadd_w(_sum);

                    sum = activation_ss(sum, activation_type, activation_params);

--- a/src/layer/mips/convolution_1x1_int8.h
+++ b/src/layer/mips/convolution_1x1_int8.h
@@ -0,0 +1,83 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }

 // 1x1 stride-2 int8 convolution.
 // Every second pixel of every second row of bottom_blob is gathered into a
 // temporary outw x outh blob (allocated from opt.workspace_allocator), which is
 // then processed by the stride-1 gemm path. Output size is taken from top_blob.
 static void conv1x1s2_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    const int in_w = bottom_blob.w;
    const int channels = bottom_blob.c;
    const size_t elem_bytes = bottom_blob.elemsize;
    const int pack = bottom_blob.elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    // After 2*outw input elements have been consumed, (in_w - 2*outw) moves to
    // the end of the current row and the extra in_w skips the next row.
    const int row_skip = in_w - 2 * outw + in_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elem_bytes, pack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* src = bottom_blob.channel(p);
        signed char* dst = bottom_blob_shrinked.channel(p);

        for (int y = 0; y < outh; y++)
        {
            int x = 0;

            // groups of four outputs
            while (x + 3 < outw)
            {
                for (int k = 0; k < 4; k++)
                {
                    dst[k] = src[2 * k];
                }

                src += 8;
                dst += 4;
                x += 4;
            }

            // a remaining pair of outputs
            while (x + 1 < outw)
            {
                for (int k = 0; k < 2; k++)
                {
                    dst[k] = src[2 * k];
                }

                src += 4;
                dst += 2;
                x += 2;
            }

            // a final single output
            while (x < outw)
            {
                *dst = *src;

                src += 2;
                dst += 1;
                x++;
            }

            src += row_skip;
        }
    }

    conv1x1s1_sgemm_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_1x1_pack1to4_int8.h
+++ b/src/layer/mips/convolution_1x1_pack1to4_int8.h
@@ -0,0 +1,83 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }

 // 1x1 stride-2 int8 convolution (pack1to4 variant).
 // Every second pixel of every second row of bottom_blob is gathered into a
 // temporary outw x outh blob (allocated from opt.workspace_allocator), which is
 // then processed by the stride-1 gemm path. Output size is taken from top_blob.
 static void conv1x1s2_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    const int in_w = bottom_blob.w;
    const int channels = bottom_blob.c;
    const size_t elem_bytes = bottom_blob.elemsize;
    const int pack = bottom_blob.elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    // After 2*outw input elements have been consumed, (in_w - 2*outw) moves to
    // the end of the current row and the extra in_w skips the next row.
    const int row_skip = in_w - 2 * outw + in_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elem_bytes, pack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* src = bottom_blob.channel(p);
        signed char* dst = bottom_blob_shrinked.channel(p);

        for (int y = 0; y < outh; y++)
        {
            int x = 0;

            // groups of four outputs
            while (x + 3 < outw)
            {
                for (int k = 0; k < 4; k++)
                {
                    dst[k] = src[2 * k];
                }

                src += 8;
                dst += 4;
                x += 4;
            }

            // a remaining pair of outputs
            while (x + 1 < outw)
            {
                for (int k = 0; k < 2; k++)
                {
                    dst[k] = src[2 * k];
                }

                src += 4;
                dst += 2;
                x += 2;
            }

            // a final single output
            while (x < outw)
            {
                *dst = *src;

                src += 2;
                dst += 1;
                x++;
            }

            src += row_skip;
        }
    }

    conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_1x1_pack8to1_int8.h
+++ b/src/layer/mips/convolution_1x1_pack8to1_int8.h
@@ -0,0 +1,65 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }

 // 1x1 stride-2 int8 convolution (pack8to1 variant).
 // Each pixel is moved as one 64-bit word (assumed to be eight packed int8
 // lanes, per the pack8 naming -- confirm against caller). Every second pixel
 // of every second row is gathered into a temporary outw x outh blob, which is
 // then processed by the stride-1 gemm path.
 static void conv1x1s2_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    const int in_w = bottom_blob.w;
    const int channels = bottom_blob.c;
    const size_t elem_bytes = bottom_blob.elemsize;
    const int pack = bottom_blob.elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    // Measured in 64-bit pixels: finish the current input row, then skip the next one.
    const int row_skip = in_w - 2 * outw + in_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elem_bytes, pack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const int64_t* src = bottom_blob.channel(p);
        int64_t* dst = bottom_blob_shrinked.channel(p);

        for (int y = 0; y < outh; y++)
        {
            for (int x = 0; x < outw; x++)
            {
                // keep one pixel, drop its right-hand neighbour
                *dst++ = *src;
                src += 2;
            }

            src += row_skip;
        }
    }

    conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_1x1_pack8to4_int8.h
+++ b/src/layer/mips/convolution_1x1_pack8to4_int8.h
@@ -0,0 +1,65 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }

 // 1x1 stride-2 int8 convolution (pack8to4 variant).
 // Each pixel is moved as one 64-bit word (assumed to be eight packed int8
 // lanes, per the pack8 naming -- confirm against caller). Every second pixel
 // of every second row is gathered into a temporary outw x outh blob, which is
 // then processed by the stride-1 gemm path.
 static void conv1x1s2_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    const int in_w = bottom_blob.w;
    const int channels = bottom_blob.c;
    const size_t elem_bytes = bottom_blob.elemsize;
    const int pack = bottom_blob.elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    // Measured in 64-bit pixels: finish the current input row, then skip the next one.
    const int row_skip = in_w - 2 * outw + in_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elem_bytes, pack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const int64_t* src = bottom_blob.channel(p);
        int64_t* dst = bottom_blob_shrinked.channel(p);

        for (int y = 0; y < outh; y++)
        {
            for (int x = 0; x < outw; x++)
            {
                // keep one pixel, drop its right-hand neighbour
                *dst++ = *src;
                src += 2;
            }

            src += row_skip;
        }
    }

    conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_3x3_pack8to1_int8.h
+++ b/src/layer/mips/convolution_3x3_pack8to1_int8.h
@@ -0,0 +1,731 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt)
 {
    // winograd42 transform kernel
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 4b-8a-inch/8a-36-outch/4b
    kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);

    int p = 0;
    for (; p + 3 < outch; p += 4)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);
        const Mat k2 = kernel_tm.channel(p + 2);
        const Mat k3 = kernel_tm.channel(p + 3);

        Mat g0 = kernel_tm_pack8to1.channel(p / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q + 7 < inch; q += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0.row<const short>(q + i)[k];
                    g00[1] = k1.row<const short>(q + i)[k];
                    g00[2] = k2.row<const short>(q + i)[k];
                    g00[3] = k3.row<const short>(q + i)[k];

                    g00 += 4;
                }
            }
        }
    }
    for (; p < outch; p++)
    {
        const Mat k0 = kernel_tm.channel(p);

        Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q + 7 < inch; q += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0.row<const short>(q + i)[k];

                    g00 += 1;
                }
            }
        }
    }
 }

 static void conv3x3s1_winograd42_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    //     size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tm = outw / 4 * 6;
        int h_tm = outh / 4 * 6;

        const int tiles = w_tm / 6 * h_tm / 6;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);

        // const float itm[6][6] = {
        //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
        //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
        //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
        //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
        //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
        //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
        // };

        // 0 =  4 * r00 - 5 * r02 + r04
        // 1 = -4 * (r01 + r02) + r04 + r03
        // 2 =  4 * (r01 - r02) + r04 - r03
        // 3 = -2 * (r01 - r03) + r04 - r02
        // 4 =  2 * (r01 - r03) + r04 - r02
        // 5 =  4 * r01 - 5 * r03 + r05

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < inch; q++)
        {
            const Mat img0 = bottom_blob_bordered.channel(q);
            Mat img0_tm = bottom_blob_tm.channel(q);

            short tmp[6][6][8];

            // tile
            for (int i = 0; i < h_tm / 6; i++)
            {
                for (int j = 0; j < w_tm / 6; j++)
                {
                    const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;

                    for (int m = 0; m < 6; m++)
                    {
                        v16i8 _r00_01 = __msa_ld_b(r0, 0);
                        v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0);
                        v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0);
                        v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0);
                        v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0);
                        v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0);
                        v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01);
                        v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01);
                        v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03);
                        v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03);
                        v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05);
                        v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05);

                        v8i16 _v5 = __msa_fill_h(5);

                        v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5));
                        v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2));
                        v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2));
                        v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
                        v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
                        v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5));

                        __msa_st_h(_tmp0m, tmp[0][m], 0);
                        __msa_st_h(_tmp1m, tmp[1][m], 0);
                        __msa_st_h(_tmp2m, tmp[2][m], 0);
                        __msa_st_h(_tmp3m, tmp[3][m], 0);
                        __msa_st_h(_tmp4m, tmp[4][m], 0);
                        __msa_st_h(_tmp5m, tmp[5][m], 0);

                        r0 += w * 8;
                    }

                    short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8;
                    short* r0_tm_1 = r0_tm_0 + tiles * 8;
                    short* r0_tm_2 = r0_tm_0 + tiles * 16;
                    short* r0_tm_3 = r0_tm_0 + tiles * 24;
                    short* r0_tm_4 = r0_tm_0 + tiles * 32;
                    short* r0_tm_5 = r0_tm_0 + tiles * 40;

                    for (int m = 0; m < 6; m++)
                    {
                        v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0);
                        v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0);
                        v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0);
                        v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0);
                        v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0);
                        v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0);

                        v8i16 _v5 = __msa_fill_h(5);

                        v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5));
                        v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2));
                        v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2));
                        v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
                        v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
                        v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5));

                        __msa_st_h(_r0tm0, r0_tm_0, 0);
                        __msa_st_h(_r0tm1, r0_tm_1, 0);
                        __msa_st_h(_r0tm2, r0_tm_2, 0);
                        __msa_st_h(_r0tm3, r0_tm_3, 0);
                        __msa_st_h(_r0tm4, r0_tm_4, 0);
                        __msa_st_h(_r0tm5, r0_tm_5, 0);

                        r0_tm_0 += tiles * 48;
                        r0_tm_1 += tiles * 48;
                        r0_tm_2 += tiles * 48;
                        r0_tm_3 += tiles * 48;
                        r0_tm_4 += tiles * 48;
                        r0_tm_5 += tiles * 48;
                    }
                }
            }
        }
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    {
        int w_tm = outw / 4 * 6;
        int h_tm = outh / 4 * 6;

        const int tiles = h_tm / 6 * w_tm / 6;

        // permute
        //         bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
        Mat bottom_blob_tm2;
        if (tiles >= 2)
            bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator);
        else // if (tiles >= 1)
            bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int r = 0; r < 36; r++)
        {
            Mat tm2 = bottom_blob_tm2.channel(r);

            // tile
            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                short* tmpptr = tm2.row<short>(i / 2);

                const short* r0 = bottom_blob_tm;

                r0 += (r * tiles + i) * 8;

                for (int q = 0; q < inch; q++)
                {
                    v8i16 _r0 = __msa_ld_h(r0, 0);
                    v8i16 _r1 = __msa_ld_h(r0 + 8, 0);
                    __msa_st_h(_r0, tmpptr, 0);
                    __msa_st_h(_r1, tmpptr + 8, 0);
                    r0 += bottom_blob_tm.cstep * 8;
                    tmpptr += 16;
                }
            }
            for (; i < tiles; i++)
            {
                short* tmpptr = tm2.row<short>(i / 2 + i % 2);

                const short* r0 = bottom_blob_tm;

                r0 += (r * tiles + i) * 8;

                for (int q = 0; q < inch; q++)
                {
                    v8i16 _r0 = __msa_ld_h(r0, 0);
                    __msa_st_h(_r0, tmpptr, 0);
                    r0 += bottom_blob_tm.cstep * 8;
                    tmpptr += 8;
                }
            }
        }

        bottom_blob_tm = Mat();
        // permute end

        top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator);

        int nn_outch = 0;
        int remain_outch_start = 0;

        nn_outch = outch >> 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_outch; pp++)
        {
            int p = pp * 4;

            int* output0_tm = top_blob_tm.channel(p);
            int* output1_tm = top_blob_tm.channel(p + 1);
            int* output2_tm = top_blob_tm.channel(p + 2);
            int* output3_tm = top_blob_tm.channel(p + 3);

            const Mat kernel0_tm = kernel_tm.channel(p / 4);

            for (int r = 0; r < 36; r++)
            {
                const Mat bb2 = bottom_blob_tm2.channel(r);

                int i = 0;
                for (; i + 1 < tiles; i += 2)
                {
                    const short* r0 = bb2.row<const short>(i / 2);
                    const short* k0 = kernel0_tm.row<const short>(r);

                    int nn = inch; // inch always > 0

                    v4i32 _sum0 = __msa_fill_w(0);
                    v4i32 _sum1 = __msa_fill_w(0);
                    v4i32 _sum2 = __msa_fill_w(0);
                    v4i32 _sum3 = __msa_fill_w(0);

                    for (int j = 0; j < nn; j++)
                    {
                        v8i16 _w0 = __msa_ld_h(k0, 0);
                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

                        v4i32 _val0_0 = __msa_fill_w(r0[0]);
                        v4i32 _val0_1 = __msa_fill_w(r0[1]);
                        v4i32 _val0_2 = __msa_fill_w(r0[2]);
                        v4i32 _val0_3 = __msa_fill_w(r0[3]);
                        v4i32 _val0_4 = __msa_fill_w(r0[4]);
                        v4i32 _val0_5 = __msa_fill_w(r0[5]);
                        v4i32 _val0_6 = __msa_fill_w(r0[6]);
                        v4i32 _val0_7 = __msa_fill_w(r0[7]);
                        v4i32 _val1_0 = __msa_fill_w(r0[8]);
                        v4i32 _val1_1 = __msa_fill_w(r0[9]);
                        v4i32 _val1_2 = __msa_fill_w(r0[10]);
                        v4i32 _val1_3 = __msa_fill_w(r0[11]);
                        v4i32 _val1_4 = __msa_fill_w(r0[12]);
                        v4i32 _val1_5 = __msa_fill_w(r0[13]);
                        v4i32 _val1_6 = __msa_fill_w(r0[14]);
                        v4i32 _val1_7 = __msa_fill_w(r0[15]);

                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0);
                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1);
                        _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0);
                        _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1);
                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2);
                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3);
                        _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2);
                        _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3);
                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4);
                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5);
                        _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4);
                        _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5);
                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6);
                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7);
                        _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6);
                        _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7);

                        r0 += 16;
                        k0 += 32;
                    }

                    _sum0 = __msa_addv_w(_sum0, _sum1);
                    _sum2 = __msa_addv_w(_sum2, _sum3);

                    int sum[8];
                    __msa_st_w(_sum0, sum, 0);
                    __msa_st_w(_sum2, sum + 4, 0);

                    output0_tm[0] = sum[0];
                    output1_tm[0] = sum[1];
                    output2_tm[0] = sum[2];
                    output3_tm[0] = sum[3];
                    output0_tm[1] = sum[4];
                    output1_tm[1] = sum[5];
                    output2_tm[1] = sum[6];
                    output3_tm[1] = sum[7];
                    output0_tm += 2;
                    output1_tm += 2;
                    output2_tm += 2;
                    output3_tm += 2;
                }
                for (; i < tiles; i++)
                {
                    const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                    const short* k0 = kernel0_tm.row<const short>(r);

                    int nn = inch; // inch always > 0

                    v4i32 _sum0 = __msa_fill_w(0);
                    v4i32 _sum1 = __msa_fill_w(0);

                    for (int j = 0; j < nn; j++)
                    {
                        v8i16 _w0 = __msa_ld_h(k0, 0);
                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

                        v4i32 _val0 = __msa_fill_w(r0[0]);
                        v4i32 _val1 = __msa_fill_w(r0[1]);
                        v4i32 _val2 = __msa_fill_w(r0[2]);
                        v4i32 _val3 = __msa_fill_w(r0[3]);
                        v4i32 _val4 = __msa_fill_w(r0[4]);
                        v4i32 _val5 = __msa_fill_w(r0[5]);
                        v4i32 _val6 = __msa_fill_w(r0[6]);
                        v4i32 _val7 = __msa_fill_w(r0[7]);

                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0);
                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val1);
                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val2);
                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val3);
                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val4);
                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val5);
                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val6);
                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val7);

                        r0 += 8;
                        k0 += 32;
                    }

                    _sum0 = __msa_addv_w(_sum0, _sum1);

                    int sum[4];
                    __msa_st_w(_sum0, sum, 0);

                    output0_tm[0] = sum[0];
                    output1_tm[0] = sum[1];
                    output2_tm[0] = sum[2];
                    output3_tm[0] = sum[3];
                    output0_tm += 1;
                    output1_tm += 1;
                    output2_tm += 1;
                    output3_tm += 1;
                }
            }
        }

        remain_outch_start += nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_outch_start; p < outch; p++)
        {
            int* output0_tm = top_blob_tm.channel(p);

            const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);

            for (int r = 0; r < 36; r++)
            {
                const Mat bb2 = bottom_blob_tm2.channel(r);

                int i = 0;
                for (; i + 1 < tiles; i += 2)
                {
                    const short* r0 = bb2.row<const short>(i / 2);
                    const short* k0 = kernel0_tm.row<const short>(r);

                    v4i32 _sum0 = __msa_fill_w(0);
                    v4i32 _sum1 = __msa_fill_w(0);
                    v4i32 _sum2 = __msa_fill_w(0);
                    v4i32 _sum3 = __msa_fill_w(0);

                    for (int q = 0; q < inch; q++)
                    {
                        v8i16 _val0 = __msa_ld_h(r0, 0);
                        v8i16 _val1 = __msa_ld_h(r0 + 8, 0);

                        v8i16 _extval0 = __msa_clti_s_h(_val0, 0);
                        v8i16 _extval1 = __msa_clti_s_h(_val1, 0);
                        v4i32 _val0l = (v4i32)__msa_ilvr_h(_extval0, _val0);
                        v4i32 _val0h = (v4i32)__msa_ilvl_h(_extval0, _val0);
                        v4i32 _val1l = (v4i32)__msa_ilvr_h(_extval1, _val1);
                        v4i32 _val1h = (v4i32)__msa_ilvl_h(_extval1, _val1);

                        v8i16 _w0 = __msa_ld_h(k0, 0);

                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);

                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0l);
                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val0h);
                        _sum2 = __msa_maddv_w(_sum2, _w0l, _val1l);
                        _sum3 = __msa_maddv_w(_sum3, _w0h, _val1h);

                        k0 += 8;
                        r0 += 16;
                    }

                    _sum0 = __msa_addv_w(_sum0, _sum1);
                    _sum2 = __msa_addv_w(_sum2, _sum3);

                    output0_tm[0] = __msa_reduce_add_w(_sum0);
                    output0_tm[1] = __msa_reduce_add_w(_sum2);
                    output0_tm += 2;
                }
                for (; i < tiles; i++)
                {
                    const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                    const short* k0 = kernel0_tm.row<const short>(r);

                    v4i32 _sum0 = __msa_fill_w(0);
                    v4i32 _sum1 = __msa_fill_w(0);

                    for (int q = 0; q < inch; q++)
                    {
                        v8i16 _val = __msa_ld_h(r0, 0);

                        v8i16 _extval = __msa_clti_s_h(_val, 0);
                        v4i32 _vall = (v4i32)__msa_ilvr_h(_extval, _val);
                        v4i32 _valh = (v4i32)__msa_ilvl_h(_extval, _val);

                        v8i16 _w0 = __msa_ld_h(k0, 0);

                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);

                        _sum0 = __msa_maddv_w(_sum0, _w0l, _vall);
                        _sum1 = __msa_maddv_w(_sum1, _w0h, _valh);

                        k0 += 8;
                        r0 += 8;
                    }

                    _sum0 = __msa_addv_w(_sum0, _sum1);

                    output0_tm[0] = __msa_reduce_add_w(_sum0);
                    output0_tm++;
                }
            }
        }
    }
    bottom_blob_tm = Mat();
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
    }
    {
        // const float otm[4][6] = {
        //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
        //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
        //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
        //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
        // };

        // 0 = r00 + (r01 + r02) + (r03 + r04)
        // 1 =       (r01 - r02) + (r03 - r04) * 2
        // 2 =       (r01 + r02) + (r03 + r04) * 4
        // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

        int w_tm = outw / 4 * 6;
        int h_tm = outh / 4 * 6;
        const int tiles = w_tm / 6 * h_tm / 6;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outch; p++)
        {
            const Mat out0_tm = top_blob_tm.channel(p);
            Mat out0 = top_blob_bordered.channel(p);

            int tmp[4][6];

            // tile
            for (int i = 0; i < outh / 4; i++)
            {
                for (int j = 0; j < outw / 4; j++)
                {
                    // top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator);

                    const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 1;
                    const int* output0_tm_1 = output0_tm_0 + tiles * 1;
                    const int* output0_tm_2 = output0_tm_0 + tiles * 2;
                    const int* output0_tm_3 = output0_tm_0 + tiles * 3;
                    const int* output0_tm_4 = output0_tm_0 + tiles * 4;
                    const int* output0_tm_5 = output0_tm_0 + tiles * 5;

                    int* output0 = out0.row<int>(i * 4) + j * 4;

                    // 0 = r00 + (r01 + r02) + (r03 + r04)
                    // 1 =       (r01 - r02) + (r03 - r04) * 2
                    // 2 =       (r01 + r02) + (r03 + r04) * 4
                    // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

                    // TODO msa optimize
                    for (int m = 0; m < 5; m++)
                    {
                        int tmp02a = output0_tm_1[0] + output0_tm_2[0];
                        int tmp13a = output0_tm_1[0] - output0_tm_2[0];

                        int tmp02b = output0_tm_3[0] + output0_tm_4[0];
                        int tmp13b = output0_tm_3[0] - output0_tm_4[0];

                        tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b;
                        tmp[1][m] = tmp13a + tmp13b * 2;
                        tmp[2][m] = tmp02a + tmp02b * 4;
                        tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8;

                        output0_tm_0 += tiles * 6;
                        output0_tm_1 += tiles * 6;
                        output0_tm_2 += tiles * 6;
                        output0_tm_3 += tiles * 6;
                        output0_tm_4 += tiles * 6;
                        output0_tm_5 += tiles * 6;
                    }
                    for (int m = 5; m < 6; m++)
                    {
                        int tmp02a = output0_tm_1[0] + output0_tm_2[0];
                        int tmp13a = output0_tm_1[0] - output0_tm_2[0];

                        int tmp02b = output0_tm_3[0] + output0_tm_4[0];
                        int tmp13b = output0_tm_3[0] - output0_tm_4[0];

                        tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4;
                        tmp[1][m] = (tmp13a + tmp13b * 2) * 4;
                        tmp[2][m] = (tmp02a + tmp02b * 4) * 4;
                        tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4;

                        output0_tm_0 += tiles * 6;
                        output0_tm_1 += tiles * 6;
                        output0_tm_2 += tiles * 6;
                        output0_tm_3 += tiles * 6;
                        output0_tm_4 += tiles * 6;
                        output0_tm_5 += tiles * 6;
                    }

                    for (int m = 0; m < 4; m++)
                    {
                        const int* tmp0 = tmp[m];

                        int tmp02a = tmp0[1] + tmp0[2];
                        int tmp13a = tmp0[1] - tmp0[2];

                        int tmp02b = tmp0[3] + tmp0[4];
                        int tmp13b = tmp0[3] - tmp0[4];

                        output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576;
                        output0[1] = (tmp13a + tmp13b * 2) / 576;
                        output0[2] = (tmp02a + tmp02b * 4) / 576;
                        output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576;

                        output0 += outw;
                    }
                }
            }
        }
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
 }
--- a/src/layer/mips/convolution_3x3_pack8to4_int8.h
+++ b/src/layer/mips/convolution_3x3_pack8to4_int8.h
@@ -0,0 +1,629 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
 {
    // winograd42 transform kernel
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 4b-8a-inch/8a-36-outch/4b
    kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32);

    int q = 0;
    for (; q + 3 < outch; q += 4)
    {
        const Mat k0 = kernel_tm.channel(q);
        const Mat k1 = kernel_tm.channel(q + 1);
        const Mat k2 = kernel_tm.channel(q + 2);
        const Mat k3 = kernel_tm.channel(q + 3);

        Mat kernel_tm = kernel_tm_pack8.channel(q / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = kernel_tm.row<short>(k);

            for (int p = 0; p + 7 < inch; p += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    const short* k00 = k0.row<const short>(p + i);
                    const short* k10 = k1.row<const short>(p + i);
                    const short* k20 = k2.row<const short>(p + i);
                    const short* k30 = k3.row<const short>(p + i);

                    g00[0] = k00[k];
                    g00[1] = k10[k];
                    g00[2] = k20[k];
                    g00[3] = k30[k];

                    g00 += 4;
                }
            }
        }
    }
 }

 static void conv3x3s1_winograd42_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    //     size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tm = outw / 4 * 6;
        int h_tm = outh / 4 * 6;

        const int tiles = w_tm / 6 * h_tm / 6;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);

        // const float itm[6][6] = {
        //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
        //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
        //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
        //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
        //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
        //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
        // };

        // 0 =  4 * r00 - 5 * r02 + r04
        // 1 = -4 * (r01 + r02) + r04 + r03
        // 2 =  4 * (r01 - r02) + r04 - r03
        // 3 = -2 * (r01 - r03) + r04 - r02
        // 4 =  2 * (r01 - r03) + r04 - r02
        // 5 =  4 * r01 - 5 * r03 + r05

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < inch; q++)
        {
            const Mat img0 = bottom_blob_bordered.channel(q);
            Mat img0_tm = bottom_blob_tm.channel(q);

            short tmp[6][6][8];

            // tile
            for (int i = 0; i < h_tm / 6; i++)
            {
                for (int j = 0; j < w_tm / 6; j++)
                {
                    const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;

                    for (int m = 0; m < 6; m++)
                    {
                        v16i8 _r00_01 = __msa_ld_b(r0, 0);
                        v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0);
                        v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0);
                        v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0);
                        v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0);
                        v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0);
                        v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01);
                        v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01);
                        v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03);
                        v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03);
                        v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05);
                        v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05);

                        v8i16 _v5 = __msa_fill_h(5);

                        v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5));
                        v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2));
                        v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2));
                        v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
                        v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
                        v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5));

                        __msa_st_h(_tmp0m, tmp[0][m], 0);
                        __msa_st_h(_tmp1m, tmp[1][m], 0);
                        __msa_st_h(_tmp2m, tmp[2][m], 0);
                        __msa_st_h(_tmp3m, tmp[3][m], 0);
                        __msa_st_h(_tmp4m, tmp[4][m], 0);
                        __msa_st_h(_tmp5m, tmp[5][m], 0);

                        r0 += w * 8;
                    }

                    short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8;
                    short* r0_tm_1 = r0_tm_0 + tiles * 8;
                    short* r0_tm_2 = r0_tm_0 + tiles * 16;
                    short* r0_tm_3 = r0_tm_0 + tiles * 24;
                    short* r0_tm_4 = r0_tm_0 + tiles * 32;
                    short* r0_tm_5 = r0_tm_0 + tiles * 40;

                    for (int m = 0; m < 6; m++)
                    {
                        v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0);
                        v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0);
                        v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0);
                        v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0);
                        v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0);
                        v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0);

                        v8i16 _v5 = __msa_fill_h(5);

                        v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5));
                        v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2));
                        v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2));
                        v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
                        v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
                        v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5));

                        __msa_st_h(_r0tm0, r0_tm_0, 0);
                        __msa_st_h(_r0tm1, r0_tm_1, 0);
                        __msa_st_h(_r0tm2, r0_tm_2, 0);
                        __msa_st_h(_r0tm3, r0_tm_3, 0);
                        __msa_st_h(_r0tm4, r0_tm_4, 0);
                        __msa_st_h(_r0tm5, r0_tm_5, 0);

                        r0_tm_0 += tiles * 48;
                        r0_tm_1 += tiles * 48;
                        r0_tm_2 += tiles * 48;
                        r0_tm_3 += tiles * 48;
                        r0_tm_4 += tiles * 48;
                        r0_tm_5 += tiles * 48;
                    }
                }
            }
        }
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    {
        int w_tm = outw / 4 * 6;
        int h_tm = outh / 4 * 6;

        const int tiles = h_tm / 6 * w_tm / 6;

        // permute
        //         bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
        Mat bottom_blob_tm2;
        if (tiles >= 2)
            bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator);
        else // if (tiles >= 1)
            bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int r = 0; r < 36; r++)
        {
            Mat tm2 = bottom_blob_tm2.channel(r);

            // tile
            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                short* tmpptr = tm2.row<short>(i / 2);

                const short* r0 = bottom_blob_tm;

                r0 += (r * tiles + i) * 8;

                for (int q = 0; q < inch; q++)
                {
                    v8i16 _r0 = __msa_ld_h(r0, 0);
                    v8i16 _r1 = __msa_ld_h(r0 + 8, 0);
                    __msa_st_h(_r0, tmpptr, 0);
                    __msa_st_h(_r1, tmpptr + 8, 0);
                    r0 += bottom_blob_tm.cstep * 8;
                    tmpptr += 16;
                }
            }
            for (; i < tiles; i++)
            {
                short* tmpptr = tm2.row<short>(i / 2 + i % 2);

                const short* r0 = bottom_blob_tm;

                r0 += (r * tiles + i) * 8;

                for (int q = 0; q < inch; q++)
                {
                    v8i16 _r0 = __msa_ld_h(r0, 0);
                    __msa_st_h(_r0, tmpptr, 0);
                    r0 += bottom_blob_tm.cstep * 8;
                    tmpptr += 8;
                }
            }
        }

        bottom_blob_tm = Mat();
        // permute end

        top_blob_tm.create(tiles, 36, outch, 4u * 4, 4, opt.workspace_allocator);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outch; p++)
        {
            int* output0_tm = top_blob_tm.channel(p);

            const Mat kernel0_tm = kernel_tm.channel(p);

            for (int r = 0; r < 36; r++)
            {
                const Mat bb2 = bottom_blob_tm2.channel(r);

                int i = 0;
                for (; i + 1 < tiles; i += 2)
                {
                    const short* r0 = bb2.row<const short>(i / 2);
                    const short* k0 = kernel0_tm.row<const short>(r);

                    int nn = inch; // inch always > 0

                    v4i32 _sum0 = __msa_fill_w(0);
                    v4i32 _sum1 = __msa_fill_w(0);
                    v4i32 _sum2 = __msa_fill_w(0);
                    v4i32 _sum3 = __msa_fill_w(0);

                    for (int j = 0; j < nn; j++)
                    {
                        v8i16 _w0 = __msa_ld_h(k0, 0);
                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

                        v4i32 _val0_0 = __msa_fill_w(r0[0]);
                        v4i32 _val0_1 = __msa_fill_w(r0[1]);
                        v4i32 _val0_2 = __msa_fill_w(r0[2]);
                        v4i32 _val0_3 = __msa_fill_w(r0[3]);
                        v4i32 _val0_4 = __msa_fill_w(r0[4]);
                        v4i32 _val0_5 = __msa_fill_w(r0[5]);
                        v4i32 _val0_6 = __msa_fill_w(r0[6]);
                        v4i32 _val0_7 = __msa_fill_w(r0[7]);
                        v4i32 _val1_0 = __msa_fill_w(r0[8]);
                        v4i32 _val1_1 = __msa_fill_w(r0[9]);
                        v4i32 _val1_2 = __msa_fill_w(r0[10]);
                        v4i32 _val1_3 = __msa_fill_w(r0[11]);
                        v4i32 _val1_4 = __msa_fill_w(r0[12]);
                        v4i32 _val1_5 = __msa_fill_w(r0[13]);
                        v4i32 _val1_6 = __msa_fill_w(r0[14]);
                        v4i32 _val1_7 = __msa_fill_w(r0[15]);

                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0);
                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1);
                        _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0);
                        _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1);
                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2);
                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3);
                        _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2);
                        _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3);
                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4);
                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5);
                        _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4);
                        _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5);
                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6);
                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7);
                        _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6);
                        _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7);

                        r0 += 16;
                        k0 += 32;
                    }

                    _sum0 = __msa_addv_w(_sum0, _sum1);
                    _sum2 = __msa_addv_w(_sum2, _sum3);

                    __msa_st_w(_sum0, output0_tm, 0);
                    __msa_st_w(_sum2, output0_tm + 4, 0);

                    output0_tm += 8;
                }
                for (; i < tiles; i++)
                {
                    const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                    const short* k0 = kernel0_tm.row<const short>(r);

                    int nn = inch; // inch always > 0

                    v4i32 _sum0 = __msa_fill_w(0);
                    v4i32 _sum1 = __msa_fill_w(0);

                    for (int j = 0; j < nn; j++)
                    {
                        v8i16 _w0 = __msa_ld_h(k0, 0);
                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

                        v4i32 _val0 = __msa_fill_w(r0[0]);
                        v4i32 _val1 = __msa_fill_w(r0[1]);
                        v4i32 _val2 = __msa_fill_w(r0[2]);
                        v4i32 _val3 = __msa_fill_w(r0[3]);
                        v4i32 _val4 = __msa_fill_w(r0[4]);
                        v4i32 _val5 = __msa_fill_w(r0[5]);
                        v4i32 _val6 = __msa_fill_w(r0[6]);
                        v4i32 _val7 = __msa_fill_w(r0[7]);

                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0);
                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val1);
                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val2);
                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val3);
                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val4);
                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val5);
                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val6);
                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val7);

                        r0 += 8;
                        k0 += 32;
                    }

                    _sum0 = __msa_addv_w(_sum0, _sum1);

                    __msa_st_w(_sum0, output0_tm, 0);
                    output0_tm += 4;
                }
            }
        }
    }
    bottom_blob_tm = Mat();
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator);
    }
    {
        // const float otm[4][6] = {
        //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
        //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
        //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
        //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
        // };

        // 0 = r00 + (r01 + r02) + (r03 + r04)
        // 1 =       (r01 - r02) + (r03 - r04) * 2
        // 2 =       (r01 + r02) + (r03 + r04) * 4
        // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

        int w_tm = outw / 4 * 6;
        int h_tm = outh / 4 * 6;
        const int tiles = w_tm / 6 * h_tm / 6;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outch; p++)
        {
            const Mat out0_tm = top_blob_tm.channel(p);
            Mat out0 = top_blob_bordered.channel(p);

            int tmp[4][6][4];

            // tile
            for (int i = 0; i < outh / 4; i++)
            {
                for (int j = 0; j < outw / 4; j++)
                {
                    // top_blob_tm.create(tiles, 36, outch, elemsize, elempack);

                    const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 4;
                    const int* output0_tm_1 = output0_tm_0 + tiles * 4;
                    const int* output0_tm_2 = output0_tm_0 + tiles * 8;
                    const int* output0_tm_3 = output0_tm_0 + tiles * 12;
                    const int* output0_tm_4 = output0_tm_0 + tiles * 16;
                    const int* output0_tm_5 = output0_tm_0 + tiles * 20;

                    int* output0 = out0.row<int>(i * 4) + (j * 4) * 4;

                    for (int m = 0; m < 5; m++)
                    {
                        v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0);
                        v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0);
                        v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0);
                        v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0);
                        v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0);
                        v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0);

                        v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2);
                        v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2);

                        v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4);
                        v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4);

                        v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b);
                        v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
                        v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
                        v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3));

                        __msa_st_w(_tmp0m, tmp[0][m], 0);
                        __msa_st_w(_tmp1m, tmp[1][m], 0);
                        __msa_st_w(_tmp2m, tmp[2][m], 0);
                        __msa_st_w(_tmp3m, tmp[3][m], 0);

                        output0_tm_0 += tiles * 24;
                        output0_tm_1 += tiles * 24;
                        output0_tm_2 += tiles * 24;
                        output0_tm_3 += tiles * 24;
                        output0_tm_4 += tiles * 24;
                        output0_tm_5 += tiles * 24;
                    }
                    for (int m = 5; m < 6; m++)
                    {
                        v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0);
                        v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0);
                        v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0);
                        v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0);
                        v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0);
                        v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0);

                        v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2);
                        v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2);

                        v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4);
                        v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4);

                        v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b);
                        v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
                        v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
                        v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3));

                        _tmp0m = __msa_slli_w(_tmp0m, 2);
                        _tmp1m = __msa_slli_w(_tmp1m, 2);
                        _tmp2m = __msa_slli_w(_tmp2m, 2);
                        _tmp3m = __msa_slli_w(_tmp3m, 2);

                        __msa_st_w(_tmp0m, tmp[0][m], 0);
                        __msa_st_w(_tmp1m, tmp[1][m], 0);
                        __msa_st_w(_tmp2m, tmp[2][m], 0);
                        __msa_st_w(_tmp3m, tmp[3][m], 0);

                        output0_tm_0 += tiles * 24;
                        output0_tm_1 += tiles * 24;
                        output0_tm_2 += tiles * 24;
                        output0_tm_3 += tiles * 24;
                        output0_tm_4 += tiles * 24;
                        output0_tm_5 += tiles * 24;
                    }

                    for (int m = 0; m < 4; m++)
                    {
                        v4i32 _tmp00 = __msa_ld_w(tmp[m][0], 0);
                        v4i32 _tmp01 = __msa_ld_w(tmp[m][1], 0);
                        v4i32 _tmp02 = __msa_ld_w(tmp[m][2], 0);
                        v4i32 _tmp03 = __msa_ld_w(tmp[m][3], 0);
                        v4i32 _tmp04 = __msa_ld_w(tmp[m][4], 0);
                        v4i32 _tmp05 = __msa_ld_w(tmp[m][5], 0);

                        v4i32 _tmp02a = __msa_addv_w(_tmp01, _tmp02);
                        v4i32 _tmp13a = __msa_subv_w(_tmp01, _tmp02);

                        v4i32 _tmp02b = __msa_addv_w(_tmp03, _tmp04);
                        v4i32 _tmp13b = __msa_subv_w(_tmp03, _tmp04);

                        v4i32 _out00 = __msa_addv_w(__msa_addv_w(_tmp00, _tmp02a), _tmp02b);
                        v4i32 _out01 = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
                        v4i32 _out02 = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
                        v4i32 _out03 = __msa_addv_w(__msa_addv_w(_tmp05, _tmp13a), __msa_slli_w(_tmp13b, 3));

                        // TODO use integer trick for division by 576
                        v4f32 _v576 = __msa_fill_w_f32(1.0 / 576);
                        _out00 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out00), _v576));
                        _out01 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out01), _v576));
                        _out02 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out02), _v576));
                        _out03 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out03), _v576));

                        __msa_st_w(_out00, output0, 0);
                        __msa_st_w(_out01, output0 + 4, 0);
                        __msa_st_w(_out02, output0 + 8, 0);
                        __msa_st_w(_out03, output0 + 12, 0);

                        output0 += outw * 4;
                    }
                }
            }
        }
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
 }
--- a/src/layer/mips/convolution_int8.h
+++ b/src/layer/mips/convolution_int8.h
@@ -0,0 +1,82 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                //                 const signed char* kptr = weight_data_int8.channel(p);
                const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        signed char val = sptr[space_ofs[k]];
                        signed char w = kptr[k];
                        sum += val * w;
                    }

                    kptr += maxk;
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
 }
--- a/src/layer/mips/convolution_mips.cpp
+++ b/src/layer/mips/convolution_mips.cpp
@@ -32,6 +32,12 @@ namespace ncnn {
 #include "convolution_sgemm.h"
 #include "convolution_1x1.h"

 #if NCNN_INT8
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_int8.h"
 #endif // NCNN_INT8

 #if __mips_msa
 #include "convolution_pack4.h"
 #include "convolution_pack1to4.h"
@@ -44,6 +50,20 @@ namespace ncnn {
 #include "convolution_3x3_pack4.h"
 #include "convolution_3x3_pack1to4.h"
 #include "convolution_7x7_pack1to4.h"

 #if NCNN_INT8
 #include "convolution_pack8to4_int8.h"
 #include "convolution_pack1to4_int8.h"
 #include "convolution_pack8to1_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
 #include "convolution_1x1_pack8to4_int8.h"
 #include "convolution_1x1_pack1to4_int8.h"
 #include "convolution_1x1_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"
 #include "convolution_3x3_pack8to1_int8.h"
 #endif // NCNN_INT8
 #endif // __mips_msa

 Convolution_mips::Convolution_mips()
@@ -98,6 +118,13 @@ int Convolution_mips::create_pipeline(const Option& opt)

    activation = create_activation_layer(activation_type, activation_params, opt);

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_mips(opt);
    }
 #endif

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

@@ -117,8 +144,8 @@ int Convolution_mips::create_pipeline(const Option& opt)
    {
        if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
        {
            conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_data_packed, num_input, num_output, opt);
            conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data_packed, num_input, num_output, opt);
            conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd64_data, num_input, num_output, opt);
            conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
        }
        else
        {
@@ -187,27 +214,7 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        Mat bottom_blob_unpacked = bottom_blob;
        if (bottom_blob.elempack != 1)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_allocator = opt.workspace_allocator;

            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        }

        Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
        if (bottom_blob_unpacked.elembits() == 16)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_allocator = opt.workspace_allocator;

            cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
        }

        Option opt_unpacked = opt;
        opt_unpacked.use_packing_layout = false;
        return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
        return forward_int8_mips(bottom_blob, top_blob, opt);
    }
 #endif

@@ -278,11 +285,11 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
            // we need more proper conditions
            if ((w <= 10 || (w >= 15 && w <= 18) || w == 21 || w == 22) && (h <= 10 || (h >= 15 && h <= 18) || h == 21 || h == 22))
            {
                conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data_packed, bias_data, opt);
                conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data, bias_data, opt);
            }
            else
            {
                conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
            }

            if (activation)
@@ -542,4 +549,408 @@ int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<
    return 0;
 }

 #if NCNN_INT8
 // Repack int8 convolution weights into the interleaved layout used by the packed kernels.
 //
 // src = kw-kh-inch-outch
 // dst = pa-pb-kw-kh-inch/pa-outch/pb   (pa = elempack, pb = out_elempack)
 //
 // Only complete groups of elempack input channels and out_elempack output channels are
 // copied; a trailing partial group is skipped by the loop bounds.
 static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_int8, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
 {
    const int maxk = kernel_w * kernel_h;

    // address the source weights as channel = output channel, row = input channel, column = kernel tap
    Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

    weight_data_int8.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

    for (int q = 0; q + out_elempack <= num_output; q += out_elempack)
    {
        // destination is written strictly sequentially within one packed output channel
        signed char* dst = weight_data_int8.channel(q / out_elempack);

        for (int p = 0; p + elempack <= num_input; p += elempack)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < out_elempack; i++)
                {
                    for (int j = 0; j < elempack; j++)
                    {
                        const signed char* src_row = weight_data_r2.channel(q + i).row<const signed char>(p + j);

                        *dst++ = src_row[k];
                    }
                }
            }
        }
    }
 }

 // Transform the int8 weights into the layout of the kernel family that will be used.
 //
 // num_input is derived from weight_data_size. With MSA and opt.use_packing_layout the
 // input pack is 8 when num_input % 8 == 0 and the output pack is 4 when
 // num_output % 4 == 0; otherwise both packs are 1. Depending on kernel size, stride,
 // dilation and the opt flags one of these members is filled:
 //   weight_sgemm_data          - 1x1 (stride 1 or 2) and im2col-sgemm kernels
 //   weight_3x3_winograd42_data - 3x3 stride 1 winograd kernels (pack8to4 / pack8to1)
 //   weight_data_int8           - packed direct convolution kernels
 // For pack 1 -> pack 1 without a 1x1/sgemm match nothing is transformed here.
 // Always returns 0.
 int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
 {
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
 #if __mips_msa
    if (opt.use_packing_layout)
    {
        if (num_input % 8 == 0)
            elempack = 8;
        if (num_output % 4 == 0)
            out_elempack = 4;
    }
 #endif // __mips_msa

    // 1x1 kernels without dilation and with stride 1 or 2 always take the sgemm weight layout
    const bool is_1x1_nodilation = kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1;
    const bool is_1x1s1 = is_1x1_nodilation && stride_w == 1 && stride_h == 1;
    const bool is_1x1s2 = is_1x1_nodilation && stride_w == 2 && stride_h == 2;
    const bool is_1x1_sgemm = is_1x1s1 || is_1x1s2;

 #if __mips_msa
    const bool is_3x3s1_winograd = opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1;

    if (elempack == 8 && out_elempack == 4)
    {
        if (is_1x1_sgemm)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (is_3x3s1_winograd)
        {
            conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        // TODO better condition for sgemm && num_input >= 8 && num_output >= 8)
        if (is_1x1_sgemm || opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 8 && out_elempack == 1)
    {
        if (is_1x1_sgemm)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (is_3x3s1_winograd)
        {
            conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }
 #endif // __mips_msa

    if (elempack == 1 && out_elempack == 1)
    {
        // no else branch: outside the 1x1/sgemm cases no weight transform is performed
        if (is_1x1_sgemm || opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
    }

    return 0;
 }

 int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    int w = bottom_blob_bordered.w;
    int h = bottom_blob_bordered.h;
    int channels = bottom_blob_bordered.c;
    int elempack = bottom_blob_bordered.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
 #if __mips_msa
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
 #endif // __mips_msa
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

    int out_elempack_int32 = 1;
 #if __mips_msa
    if (opt.use_packing_layout)
    {
        out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
    }
 #endif // __mips_msa

    Mat top_blob_int32;
    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

 #if __mips_msa
    if (elempack == 8 && out_elempack_int32 == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd42_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }

    if (elempack == 1 && out_elempack_int32 == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }

    if (elempack == 8 && out_elempack_int32 == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd42_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }
 #endif // __mips_msa

    if (elempack == 1 && out_elempack_int32 == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            //         convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }

    return 0;
 }
 #endif // NCNN_INT8

 } // namespace ncnn
--- a/src/layer/mips/convolution_mips.h
+++ b/src/layer/mips/convolution_mips.h
@@ -31,12 +31,27 @@ public:

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

 protected:
 #if NCNN_INT8
    int create_pipeline_int8_mips(const Option& opt);
    int forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif

 public:
    Layer* activation;

    // packn
    Mat weight_sgemm_data;

    Mat weight_3x3_winograd42_data;
    Mat weight_3x3_winograd64_data;

    // pack4
    Mat weight_data_packed;
    Mat weight_3x3_winograd42_data_packed;

 #if NCNN_INT8
    // int8
    Mat weight_data_int8;
 #endif
 };

 } // namespace ncnn
--- a/src/layer/mips/convolution_pack1to4_int8.h
+++ b/src/layer/mips/convolution_pack1to4_int8.h
@@ -0,0 +1,87 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convolution_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                v4i32 _sum = __msa_fill_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        v8i16 _val = __msa_fill_h((short)sptr[space_ofs[k]]);

                        v16i8 _w = __msa_ld_b(kptr, 0);
                        v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                        v8i16 _s0 = __msa_mulv_h(_val, _w16);
                        v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

                        _sum = __msa_addv_w(_sum, _s032);

                        kptr += 4;
                    }
                }

                __msa_st_w(_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
 }
--- a/src/layer/mips/convolution_pack4to1.h
+++ b/src/layer/mips/convolution_pack4to1.h
@@ -81,7 +81,7 @@ static void convolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, cons
                    }
                }

                sum += __msa_fhadd_w(_sum);
                sum += __msa_reduce_fadd_w(_sum);

                sum = activation_ss(sum, activation_type, activation_params);

--- a/src/layer/mips/convolution_pack8to1_int8.h
+++ b/src/layer/mips/convolution_pack8to1_int8.h
@@ -0,0 +1,87 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convolution_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                v4i32 _sum = __msa_fill_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
                        v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                        v16i8 _w = __msa_ld_b(kptr, 0);
                        v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                        v8i16 _s0 = __msa_mulv_h(_val16, _w16);

                        _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0));

                        kptr += 8;
                    }
                }

                outptr[j] = __msa_reduce_add_w(_sum);
            }

            outptr += outw;
        }
    }
 }
--- a/src/layer/mips/convolution_pack8to4_int8.h
+++ b/src/layer/mips/convolution_pack8to4_int8.h
@@ -0,0 +1,120 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convolution_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                v4i32 _sum0 = __msa_fill_w(0);
                v4i32 _sum1 = __msa_fill_w(0);
                v4i32 _sum2 = __msa_fill_w(0);
                v4i32 _sum3 = __msa_fill_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
                        v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                        v16i8 _w01 = __msa_ld_b(kptr, 0);
                        v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
                        v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                        v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
                        v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                        v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
                        v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
                        v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

                        v8i16 _s0 = __msa_mulv_h(_val16, _w0);
                        v8i16 _s1 = __msa_mulv_h(_val16, _w1);
                        v8i16 _s2 = __msa_mulv_h(_val16, _w2);
                        v8i16 _s3 = __msa_mulv_h(_val16, _w3);

                        _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
                        _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
                        _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
                        _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

                        kptr += 32;
                    }
                }

                // transpose 4x4
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }

                _sum0 = __msa_addv_w(_sum0, _sum1);
                _sum2 = __msa_addv_w(_sum2, _sum3);

                _sum0 = __msa_addv_w(_sum0, _sum2);

                __msa_st_w(_sum0, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
 }
--- a/src/layer/mips/convolution_sgemm_int8.h
+++ b/src/layer/mips/convolution_sgemm_int8.h
@@ -0,0 +1,731 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // Int8 GEMM over an im2col-expanded input, accumulating into int32.
 //
 // bottom_im2col  int8 input, read as w = size (output positions), h = maxk (kernel taps),
 //                c = inch (input channels)
 // top_blob       int32 destination; it is not allocated here, one channel is written per
 //                output channel with `size` values each
 // kernel         int8 weights; read as channel(p / 4) for blocks of four output channels and
 //                channel(p / 4 + p % 4) for the leftover ones, the same channel indexing that
 //                convolution_im2col_sgemm_transform_kernel_int8_msa uses when writing
 // opt            supplies the workspace allocator for the scratch buffer and the thread count
 static void im2col_sgemm_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    // e.g. convolution_im2col_sgemm_int8_msa passes
    // Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // Repack the input into tmp so the GEMM loops below can read it sequentially:
    // one tmp channel per pair of output positions (plus one for a trailing odd position);
    // inside a channel, groups of 4 input channels are interleaved tap by tap, followed by
    // the remaining inch % 4 input channels one at a time.
    Mat tmp;
 #if __mips_msa
    // With inch >= 4 the element size is 4 bytes, so one element covers 4 interleaved input
    // channels; otherwise plain 1-byte elements are used.
    if (inch >= 4)
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
        else
            tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
    }
    else
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
        else
            tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
    }
    {
        int remain_size_start = 0;
        int nn_size = (size - remain_size_start) >> 1;

        // pairs of output positions (i, i + 1)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            signed char* tmpptr = tmp.channel(i / 2);

            int q = 0;
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // 8 bytes per tap: channels q..q+3 at position i, then at position i + 1
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr[4] = img0[1];
                    tmpptr[5] = img1[1];
                    tmpptr[6] = img2[1];
                    tmpptr[7] = img3[1];
                    tmpptr += 8;

                    // next kernel tap: rows of bottom_im2col are `size` bytes apart
                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // 2 bytes per tap: positions i and i + 1 of a single leftover channel
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img0[1];

                    tmpptr += 2;

                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        // trailing position when size is odd; i is then even, so i / 2 + i % 2 selects the
        // tmp channel right after the last pair
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            signed char* tmpptr = tmp.channel(i / 2 + i % 2);

            int q = 0;
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // 4 bytes per tap: channels q..q+3 at position i
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr += 4;

                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];

                    tmpptr += 1;

                    img0 += size;
                }
            }
        }
    }
 #else // __mips_msa
    // scalar layout: one tmp channel per output position, input channels stored one after
    // another, maxk bytes each
    tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < size; i++)
        {
            signed char* tmpptr = tmp.channel(i);

            int q = 0;
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];

                    tmpptr += 1;

                    img0 += size;
                }
            }
        }
    }
 #endif // __mips_msa

    int nn_outch = 0;
    int remain_outch_start = 0;

 #if __mips_msa
    // blocks of four output channels
    nn_outch = outch >> 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        int* outptr0 = top_blob.channel(p);
        int* outptr1 = top_blob.channel(p + 1);
        int* outptr2 = top_blob.channel(p + 2);
        int* outptr3 = top_blob.channel(p + 3);

        int i = 0;
        // two output positions x four output channels per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4);

            // nn4: iterations over (group of 4 input channels, tap); nn1: over (leftover channel, tap)
            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            // after the nn4 section, lane n of _sum00 / _sum10 holds the running sum of output
            // channel p + n at position i / i + 1
            v4i32 _sum00 = __msa_fill_w(0);
            v4i32 _sum10 = __msa_fill_w(0);

            if (nn4 > 0)
            {
                // inside the loop _sumXY is: position X, output channel p + Y, one lane per
                // input channel of the group
                v4i32 _sum01 = __msa_fill_w(0);
                v4i32 _sum02 = __msa_fill_w(0);
                v4i32 _sum03 = __msa_fill_w(0);
                v4i32 _sum11 = __msa_fill_w(0);
                v4i32 _sum12 = __msa_fill_w(0);
                v4i32 _sum13 = __msa_fill_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    // low 8 bytes: 4 input channels at position i, then 4 at position i + 1;
                    // widened to int16 by interleaving with the sign mask
                    v16i8 _val = __msa_ld_b(tmpptr, 0);
                    v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                    // duplicate each position's 4 values into both halves of a vector
                    v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01);
                    v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01);

                    // 16 weights widened to int16: _w0 takes the low 8 bytes, _w1 the high 8
                    v16i8 _w01 = __msa_ld_b(kptr, 0);
                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

                    v8i16 _s00 = __msa_mulv_h(_val0, _w0);
                    v8i16 _s01 = __msa_mulv_h(_val0, _w1);
                    v8i16 _s10 = __msa_mulv_h(_val1, _w0);
                    v8i16 _s11 = __msa_mulv_h(_val1, _w1);

                    // widen the int16 products to int32 (low and high halves separately)
                    v8i16 _exts00 = __msa_clti_s_h(_s00, 0);
                    v8i16 _exts01 = __msa_clti_s_h(_s01, 0);
                    v8i16 _exts10 = __msa_clti_s_h(_s10, 0);
                    v8i16 _exts11 = __msa_clti_s_h(_s11, 0);
                    v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00);
                    v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00);
                    v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01);
                    v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01);
                    v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10);
                    v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10);
                    v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11);
                    v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11);

                    _sum00 = __msa_addv_w(_sum00, _s00l);
                    _sum01 = __msa_addv_w(_sum01, _s00h);
                    _sum02 = __msa_addv_w(_sum02, _s01l);
                    _sum03 = __msa_addv_w(_sum03, _s01h);
                    _sum10 = __msa_addv_w(_sum10, _s10l);
                    _sum11 = __msa_addv_w(_sum11, _s10h);
                    _sum12 = __msa_addv_w(_sum12, _s11l);
                    _sum13 = __msa_addv_w(_sum13, _s11h);

                    tmpptr += 8;
                    kptr += 16;
                }

                // transpose 4x4
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum01, _sum00);
                    _tmp1 = __msa_ilvr_w(_sum03, _sum02);
                    _tmp2 = __msa_ilvl_w(_sum01, _sum00);
                    _tmp3 = __msa_ilvl_w(_sum03, _sum02);
                    _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum11, _sum10);
                    _tmp1 = __msa_ilvr_w(_sum13, _sum12);
                    _tmp2 = __msa_ilvl_w(_sum11, _sum10);
                    _tmp3 = __msa_ilvl_w(_sum13, _sum12);
                    _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }

                // after the transpose every row carries one input-channel lane for all four
                // output channels; adding the rows finishes the reduction over input channels
                _sum00 = __msa_addv_w(_sum00, _sum01);
                _sum02 = __msa_addv_w(_sum02, _sum03);
                _sum10 = __msa_addv_w(_sum10, _sum11);
                _sum12 = __msa_addv_w(_sum12, _sum13);

                _sum00 = __msa_addv_w(_sum00, _sum02);
                _sum10 = __msa_addv_w(_sum10, _sum12);
            }

            // leftover input channels: one value per position, 4 weights (one per output channel)
            int j = 0;
            for (; j < nn1; j++)
            {
                // low half = position i broadcast, high half = position i + 1 broadcast
                v8i16 _val0 = __msa_fill_h(tmpptr[0]);
                v8i16 _val1 = __msa_fill_h(tmpptr[1]);
                v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0);

                v16i8 _w = __msa_ld_b(kptr, 0);
                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                // repeat the 4 widened weights in both halves to match _val
                _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);

                v8i16 _s0 = __msa_mulv_h(_val, _w16);
                v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
                v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
                v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);

                _sum00 = __msa_addv_w(_sum00, _s0l);
                _sum10 = __msa_addv_w(_sum10, _s0h);

                tmpptr += 2;
                kptr += 4;
            }

            // scatter: sum[0..3] = position i, sum[4..7] = position i + 1, one lane per output channel
            int sum[8];
            __msa_st_w(_sum00, sum, 0);
            __msa_st_w(_sum10, sum + 4, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0[1] = sum[4];
            outptr1[1] = sum[5];
            outptr2[1] = sum[6];
            outptr3[1] = sum[7];
            outptr0 += 2;
            outptr1 += 2;
            outptr2 += 2;
            outptr3 += 2;
        }
        // trailing single output position x four output channels
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            v4i32 _sum0 = __msa_fill_w(0);

            if (nn4 > 0)
            {
                v4i32 _sum1 = __msa_fill_w(0);
                v4i32 _sum2 = __msa_fill_w(0);
                v4i32 _sum3 = __msa_fill_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    v16i8 _val = __msa_ld_b(tmpptr, 0);
                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                    // only the first 4 values belong to this step; repeat them in both halves
                    _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16);

                    v16i8 _w01 = __msa_ld_b(kptr, 0);
                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

                    v8i16 _s0 = __msa_mulv_h(_val16, _w0);
                    v8i16 _s1 = __msa_mulv_h(_val16, _w1);

                    v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
                    v8i16 _exts1 = __msa_clti_s_h(_s1, 0);
                    v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
                    v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
                    v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1);
                    v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1);

                    _sum0 = __msa_addv_w(_sum0, _s0l);
                    _sum1 = __msa_addv_w(_sum1, _s0h);
                    _sum2 = __msa_addv_w(_sum2, _s1l);
                    _sum3 = __msa_addv_w(_sum3, _s1h);

                    tmpptr += 4;
                    kptr += 16;
                }

                // transpose 4x4
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }

                // add the transposed rows: lane n becomes the sum for output channel p + n
                _sum0 = __msa_addv_w(_sum0, _sum1);
                _sum2 = __msa_addv_w(_sum2, _sum3);
                _sum0 = __msa_addv_w(_sum0, _sum2);
            }

            int j = 0;
            for (; j < nn1; j++)
            {
                v8i16 _val = __msa_fill_h(tmpptr[0]);

                v16i8 _w = __msa_ld_b(kptr, 0);
                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                v8i16 _s0 = __msa_mulv_h(_val, _w16);
                v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

                _sum0 = __msa_addv_w(_sum0, _s032);

                tmpptr += 1;
                kptr += 4;
            }

            int sum[4];
            __msa_st_w(_sum0, sum, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0 += 1;
            outptr1 += 1;
            outptr2 += 1;
            outptr3 += 1;
        }
    }

    remain_outch_start += nn_outch << 2;
 #endif // __mips_msa

    // leftover output channels (all of them when MSA is unavailable), one at a time
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
 #if __mips_msa
        // two output positions per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4 + p % 4);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            int sum0 = 0;
            int sum1 = 0;

            if (nn4 > 0)
            {
                v4i32 _sum0 = __msa_fill_w(0);
                v4i32 _sum1 = __msa_fill_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    // 8 inputs: 4 input channels at position i, then 4 at position i + 1
                    v16i8 _val = __msa_ld_b(tmpptr, 0);
                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                    v16i8 _w = __msa_ld_b(kptr, 0);
                    v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                    // the same 4 weights apply to both positions
                    _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);

                    v8i16 _s0 = __msa_mulv_h(_val16, _w16);
                    v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
                    v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
                    v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);

                    _sum0 = __msa_addv_w(_sum0, _s0l);
                    _sum1 = __msa_addv_w(_sum1, _s0h);

                    tmpptr += 8;
                    kptr += 4;
                }

                // horizontal sum of the four input-channel lanes
                sum0 = _sum0[0] + _sum0[1] + _sum0[2] + _sum0[3];
                sum1 = _sum1[0] + _sum1[1] + _sum1[2] + _sum1[3];
            }

            // leftover input channels, scalar
            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val0 = tmpptr[0];
                signed char val1 = tmpptr[1];
                signed char w = kptr[0];

                sum0 += val0 * w;
                sum1 += val1 * w;

                tmpptr += 2;
                kptr += 1;
            }

            outptr0[0] = sum0;
            outptr0[1] = sum1;
            outptr0 += 2;
        }
        // trailing single output position
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4 + p % 4);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            int sum = 0;

            if (nn4 > 0)
            {
                v4i32 _sum = __msa_fill_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    v16i8 _val = __msa_ld_b(tmpptr, 0);
                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                    v16i8 _w = __msa_ld_b(kptr, 0);
                    v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                    // only the low 4 products are widened and accumulated
                    v8i16 _s0 = __msa_mulv_h(_val16, _w16);
                    v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

                    _sum = __msa_addv_w(_sum, _s032);

                    tmpptr += 4;
                    kptr += 4;
                }

                sum = _sum[0] + _sum[1] + _sum[2] + _sum[3];
            }

            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val = tmpptr[0];
                signed char w = kptr[0];

                sum += val * w;

                tmpptr += 1;
                kptr += 1;
            }

            outptr0[0] = sum;
            outptr0 += 1;
        }
 #else  // __mips_msa
        // scalar fallback: plain dot product over inch * maxk values
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i);
            const signed char* kptr = kernel.channel(p);

            int nn1 = inch * maxk;

            int sum = 0;
            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val = tmpptr[0];
                signed char w = kptr[0];

                sum += val * w;

                tmpptr += 1;
                kptr += 1;
            }

            outptr0[0] = sum;
            outptr0 += 1;
        }
 #endif // __mips_msa
    }
 }

 // Rearranges int8 convolution weights into the interleaved layout read by the sgemm kernel.
 //
 // _kernel    source weights, reshaped here to (maxk, inch, outch)
 // kernel_tm  destination; (re)created by this function when MSA is enabled, otherwise it
 //            simply becomes the reshaped source
 // inch / outch          input / output channel counts
 // kernel_w / kernel_h   kernel size; maxk = kernel_w * kernel_h
 static void convolution_im2col_sgemm_transform_kernel_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    const int maxk = kernel_w * kernel_h;

 #if __mips_msa
    // interleave
    // src = maxk-inch-outch
    // dst = 4a-4b-maxk-inch/4a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);

    // a dimension is grouped by 4 only when it has at least 4 entries
    const int in_unroll = inch >= 4 ? 4 : 1;
    const int out_unroll = outch >= 4 ? 4 : 1;
    const int tm_w = out_unroll * in_unroll * maxk;
    const int tm_h = inch >= 4 ? inch / 4 + inch % 4 : inch;
    const int tm_c = outch >= 4 ? outch / 4 + outch % 4 : outch;
    kernel_tm.create(tm_w, tm_h, tm_c, (size_t)1u);

    int oc = 0;

    // blocks of four output channels -> one destination channel each
    for (; oc + 3 < outch; oc += 4)
    {
        signed char* dst = kernel_tm.channel(oc / 4);

        int ic = 0;
        // groups of four input channels: 16 bytes per tap, output channel outer, input channel inner
        for (; ic + 3 < inch; ic += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int oo = 0; oo < 4; oo++)
                {
                    for (int ii = 0; ii < 4; ii++)
                    {
                        const signed char* src = kernel.channel(oc + oo).row<const signed char>(ic + ii);

                        *dst++ = src[k];
                    }
                }
            }
        }
        // leftover input channels: 4 bytes per tap, one per output channel
        for (; ic < inch; ic++)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int oo = 0; oo < 4; oo++)
                {
                    const signed char* src = kernel.channel(oc + oo).row<const signed char>(ic);

                    *dst++ = src[k];
                }
            }
        }
    }

    // TODO unroll 2
    // leftover output channels -> one destination channel each, placed after the blocks of four
    for (; oc < outch; oc++)
    {
        signed char* dst = kernel_tm.channel(oc / 4 + oc % 4);

        int ic = 0;
        // groups of four input channels: 4 bytes per tap
        for (; ic + 3 < inch; ic += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int ii = 0; ii < 4; ii++)
                {
                    const signed char* src = kernel.channel(oc).row<const signed char>(ic + ii);

                    *dst++ = src[k];
                }
            }
        }
        // leftover input channels: taps copied in order
        for (; ic < inch; ic++)
        {
            const signed char* src = kernel.channel(oc).row<const signed char>(ic);

            for (int k = 0; k < maxk; k++)
            {
                *dst++ = src[k];
            }
        }
    }
 #else  // __mips_msa
    kernel_tm = _kernel.reshape(maxk, inch, outch);
 #endif // __mips_msa
 }

 // Int8 convolution as im2col + GEMM: expands bottom_blob into a (size, maxk, inch) int8
 // buffer and hands it to im2col_sgemm_int8_msa together with top_blob and kernel.
 //
 // bottom_blob  int8 input; every channel is read through row<const signed char>()
 // top_blob     output, forwarded to im2col_sgemm_int8_msa; only its w and h are read here
 // kernel       weights, forwarded unchanged
 // kernel_* / dilation_* / stride_*  usual convolution geometry
 // opt          supplies the workspace allocator and the thread count
 static void convolution_im2col_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    const int in_w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);

    // distance from the end of one sampled input row to the start of the next one
    const int row_gap = in_w * stride_h - outw * stride_w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < inch; p++)
    {
        const Mat img = bottom_blob.channel(p);
        signed char* dst = bottom_im2col.channel(p);

        // one destination row of `size` bytes per kernel tap
        for (int ky = 0; ky < kernel_h; ky++)
        {
            for (int kx = 0; kx < kernel_w; kx++)
            {
                const signed char* src = img.row<const signed char>(dilation_h * ky) + dilation_w * kx;

                for (int oy = 0; oy < outh; oy++)
                {
                    int remain = outw;

                    // four output columns at a time
                    while (remain >= 4)
                    {
                        dst[0] = src[0];
                        dst[1] = src[stride_w];
                        dst[2] = src[stride_w * 2];
                        dst[3] = src[stride_w * 3];

                        src += stride_w * 4;
                        dst += 4;
                        remain -= 4;
                    }
                    // then two
                    while (remain >= 2)
                    {
                        dst[0] = src[0];
                        dst[1] = src[stride_w];

                        src += stride_w * 2;
                        dst += 2;
                        remain -= 2;
                    }
                    // then the last one, if any
                    while (remain >= 1)
                    {
                        dst[0] = src[0];

                        src += stride_w;
                        dst += 1;
                        remain -= 1;
                    }

                    src += row_gap;
                }
            }
        }
    }

    im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_sgemm_pack1to4_int8.h
+++ b/src/layer/mips/convolution_sgemm_pack1to4_int8.h
@@ -0,0 +1,477 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // int8 GEMM over an im2col buffer, producing 4 int32 values per output position.
 //
 // bottom_im2col  w = size (output positions), h = maxk (kernel taps), c = inch
 // top_blob       one channel per p in [0, outch); 4 int32 are stored per output position
 // kernel         read sequentially from kernel.channel(p): 16 bytes per tap for each group of
 //                4 input channels, then 4 bytes per tap for each leftover input channel
 // opt            supplies the workspace allocator and the OpenMP thread count
 //
 // The function first permutes the input into `tmp` so that the inner loops read it
 // sequentially, then accumulates the widened int8 products into int32 lanes.
 static void im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    // Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // tmp holds one channel per pair of output positions, plus one more channel for an odd
    // tail position. With inch >= 4 the element is 4 bytes (4 interleaved input channels).
    Mat tmp;
    if (inch >= 4)
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
        else
            tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
    }
    else
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
        else
            tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
    }
    {
        int remain_size_start = 0;
        int nn_size = (size - remain_size_start) >> 1;

        // paired positions: positions i and i + 1 go into tmp.channel(i / 2)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            signed char* tmpptr = tmp.channel(i / 2);

            int q = 0;
            // groups of 4 input channels: 8 bytes per tap,
            // 4 channels of position i followed by 4 channels of position i + 1
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr[4] = img0[1];
                    tmpptr[5] = img1[1];
                    tmpptr[6] = img2[1];
                    tmpptr[7] = img3[1];
                    tmpptr += 8;

                    // advance to the next kernel tap (one row of `size` bytes)
                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
            // leftover input channels: 2 bytes per tap (positions i and i + 1)
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img0[1];

                    tmpptr += 2;

                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        // odd tail position (at most one): stored in the channel after the paired ones
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            signed char* tmpptr = tmp.channel(i / 2 + i % 2);

            int q = 0;
            // groups of 4 input channels: 4 bytes per tap
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr += 4;

                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
            // leftover input channels: 1 byte per tap
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];

                    tmpptr += 1;

                    img0 += size;
                }
            }
        }
    }

    // gemm: each output channel p is computed independently
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        // two output positions per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p);

            int nn4 = (inch / 4) * maxk; // iterations over groups of 4 input channels
            int nn1 = (inch % 4) * maxk; // iterations over leftover input channels

            // _sum00 accumulates position i, _sum10 accumulates position i + 1
            v4i32 _sum00 = __msa_fill_w(0);
            v4i32 _sum10 = __msa_fill_w(0);

            if (nn4 > 0)
            {
                v4i32 _sum01 = __msa_fill_w(0);
                v4i32 _sum02 = __msa_fill_w(0);
                v4i32 _sum03 = __msa_fill_w(0);
                v4i32 _sum11 = __msa_fill_w(0);
                v4i32 _sum12 = __msa_fill_w(0);
                v4i32 _sum13 = __msa_fill_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    // NOTE(review): the load fetches 16 bytes while tmpptr advances by 8, so the
                    // last iteration reads past the bytes written above - confirm the buffer is padded
                    v16i8 _val = __msa_ld_b(tmpptr, 0);
                    // widen the low 8 bytes to int16: the (x < 0) mask supplies the sign byte
                    v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                    // _val0 = 4 channels of position i duplicated, _val1 = same for position i + 1
                    v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01);
                    v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01);

                    // 16 weights for this tap, widened to int16 in two halves of 8
                    v16i8 _w01 = __msa_ld_b(kptr, 0);
                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

                    v8i16 _s00 = __msa_mulv_h(_val0, _w0);
                    v8i16 _s01 = __msa_mulv_h(_val0, _w1);
                    v8i16 _s10 = __msa_mulv_h(_val1, _w0);
                    v8i16 _s11 = __msa_mulv_h(_val1, _w1);

                    // widen the int16 products to int32 before accumulating
                    v8i16 _exts00 = __msa_clti_s_h(_s00, 0);
                    v8i16 _exts01 = __msa_clti_s_h(_s01, 0);
                    v8i16 _exts10 = __msa_clti_s_h(_s10, 0);
                    v8i16 _exts11 = __msa_clti_s_h(_s11, 0);
                    v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00);
                    v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00);
                    v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01);
                    v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01);
                    v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10);
                    v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10);
                    v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11);
                    v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11);

                    _sum00 = __msa_addv_w(_sum00, _s00l);
                    _sum01 = __msa_addv_w(_sum01, _s00h);
                    _sum02 = __msa_addv_w(_sum02, _s01l);
                    _sum03 = __msa_addv_w(_sum03, _s01h);
                    _sum10 = __msa_addv_w(_sum10, _s10l);
                    _sum11 = __msa_addv_w(_sum11, _s10h);
                    _sum12 = __msa_addv_w(_sum12, _s11l);
                    _sum13 = __msa_addv_w(_sum13, _s11h);

                    tmpptr += 8;
                    kptr += 16;
                }

                // transpose 4x4
                // after the transpose, adding the four vectors sums the 4 partial lanes of
                // each accumulator into a single lane of the result
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum01, _sum00);
                    _tmp1 = __msa_ilvr_w(_sum03, _sum02);
                    _tmp2 = __msa_ilvl_w(_sum01, _sum00);
                    _tmp3 = __msa_ilvl_w(_sum03, _sum02);
                    _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum11, _sum10);
                    _tmp1 = __msa_ilvr_w(_sum13, _sum12);
                    _tmp2 = __msa_ilvl_w(_sum11, _sum10);
                    _tmp3 = __msa_ilvl_w(_sum13, _sum12);
                    _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }

                _sum00 = __msa_addv_w(_sum00, _sum01);
                _sum02 = __msa_addv_w(_sum02, _sum03);
                _sum10 = __msa_addv_w(_sum10, _sum11);
                _sum12 = __msa_addv_w(_sum12, _sum13);

                _sum00 = __msa_addv_w(_sum00, _sum02);
                _sum10 = __msa_addv_w(_sum10, _sum12);
            }

            // leftover input channels: one input byte per position, 4 weights per tap
            int j = 0;
            for (; j < nn1; j++)
            {
                // low 4 halfwords = value at position i, high 4 = value at position i + 1
                v8i16 _val0 = __msa_fill_h(tmpptr[0]);
                v8i16 _val1 = __msa_fill_h(tmpptr[1]);
                v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0);

                // only the first 4 weight bytes are consumed (kptr advances by 4)
                v16i8 _w = __msa_ld_b(kptr, 0);
                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                // duplicate the 4 widened weights so both positions use the same weights
                _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);

                v8i16 _s0 = __msa_mulv_h(_val, _w16);
                v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
                v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
                v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);

                _sum00 = __msa_addv_w(_sum00, _s0l);
                _sum10 = __msa_addv_w(_sum10, _s0h);

                tmpptr += 2;
                kptr += 4;
            }

            // 4 int32 for position i followed by 4 int32 for position i + 1
            __msa_st_w(_sum00, outptr0, 0);
            __msa_st_w(_sum10, outptr0 + 4, 0);
            outptr0 += 8;
        }
        // odd tail position
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            v4i32 _sum0 = __msa_fill_w(0);

            if (nn4 > 0)
            {
                v4i32 _sum1 = __msa_fill_w(0);
                v4i32 _sum2 = __msa_fill_w(0);
                v4i32 _sum3 = __msa_fill_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    // only the first 4 input bytes are consumed (tmpptr advances by 4)
                    v16i8 _val = __msa_ld_b(tmpptr, 0);
                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                    // duplicate the 4 widened input channels into both halves
                    _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16);

                    v16i8 _w01 = __msa_ld_b(kptr, 0);
                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

                    v8i16 _s0 = __msa_mulv_h(_val16, _w0);
                    v8i16 _s1 = __msa_mulv_h(_val16, _w1);

                    // widen the int16 products to int32 before accumulating
                    v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
                    v8i16 _exts1 = __msa_clti_s_h(_s1, 0);
                    v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
                    v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
                    v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1);
                    v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1);

                    _sum0 = __msa_addv_w(_sum0, _s0l);
                    _sum1 = __msa_addv_w(_sum1, _s0h);
                    _sum2 = __msa_addv_w(_sum2, _s1l);
                    _sum3 = __msa_addv_w(_sum3, _s1h);

                    tmpptr += 4;
                    kptr += 16;
                }

                // transpose 4x4
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }

                _sum0 = __msa_addv_w(_sum0, _sum1);
                _sum2 = __msa_addv_w(_sum2, _sum3);
                _sum0 = __msa_addv_w(_sum0, _sum2);
            }

            // leftover input channels: one input byte and 4 weights per tap
            int j = 0;
            for (; j < nn1; j++)
            {
                v8i16 _val = __msa_fill_h(tmpptr[0]);

                v16i8 _w = __msa_ld_b(kptr, 0);
                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                v8i16 _s0 = __msa_mulv_h(_val, _w16);
                // only the low 4 products are widened and accumulated
                v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

                _sum0 = __msa_addv_w(_sum0, _s032);

                tmpptr += 1;
                kptr += 4;
            }

            __msa_st_w(_sum0, outptr0, 0);
            outptr0 += 4;
        }
    }
 }

 // Repack int8 convolution weights into the interleaved order that
 // im2col_sgemm_pack1to4_int8_msa reads sequentially.
 //
 // _kernel    source weights, viewed as maxk-inch-outch
 // kernel_tm  destination, one channel per group of 4 output channels
 // Output channels beyond the last full group of 4 are not packed.
 static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    const int maxk = kernel_w * kernel_h;

    // source layout      = maxk-inch-outch
    // destination layout = 4a-4b-maxk-inch/4a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);

    const bool has_inch_quad = inch >= 4;
    const int packed_w = has_inch_quad ? 16 * maxk : 4 * maxk;
    const int packed_h = has_inch_quad ? inch / 4 + inch % 4 : inch;
    kernel_tm.create(packed_w, packed_h, outch / 4, (size_t)1u);

    const int outch_quads = outch / 4;
    for (int qq = 0; qq < outch_quads; qq++)
    {
        const int q = qq * 4;
        signed char* dst = kernel_tm.channel(qq);

        int p = 0;

        // full groups of 4 input channels: 16 bytes per tap,
        // output channel is the slow index, input channel the fast one
        while (p + 3 < inch)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int ij = 0; ij < 16; ij++)
                {
                    const signed char* src = kernel.channel(q + ij / 4).row<const signed char>(p + ij % 4);

                    *dst++ = src[k];
                }
            }

            p += 4;
        }

        // leftover input channels: 4 bytes per tap, one per output channel
        while (p < inch)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int oc = 0; oc < 4; oc++)
                {
                    const signed char* src = kernel.channel(q + oc).row<const signed char>(p);

                    *dst++ = src[k];
                }
            }

            p++;
        }
    }
 }

 // int8 convolution (elempack 1 input) via im2col + GEMM.
 // Expands bottom_blob into an im2col buffer of shape (outw * outh, kernel_w * kernel_h, inch)
 // with 1-byte elements, then hands it to im2col_sgemm_pack1to4_int8_msa.
 static void convolution_im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int size = outw * outh;
    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
    {
        // distance from the end of one sampled output row to the start of the next
        const int gap = w * stride_h - outw * stride_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            signed char* dst = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    // first input sample touched by kernel tap (u, v)
                    const signed char* src = img.row<const signed char>(dilation_h * u) + dilation_w * v;

                    for (int i = 0; i < outh; i++)
                    {
                        int remaining = outw;

                        // four output columns at a time
                        for (; remaining >= 4; remaining -= 4)
                        {
                            dst[0] = src[0];
                            dst[1] = src[stride_w];
                            dst[2] = src[stride_w * 2];
                            dst[3] = src[stride_w * 3];

                            dst += 4;
                            src += stride_w * 4;
                        }

                        // then two
                        for (; remaining >= 2; remaining -= 2)
                        {
                            dst[0] = src[0];
                            dst[1] = src[stride_w];

                            dst += 2;
                            src += stride_w * 2;
                        }

                        // then the last one, if any
                        for (; remaining > 0; remaining--)
                        {
                            *dst++ = src[0];

                            src += stride_w;
                        }

                        src += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_sgemm_pack4to1.h
+++ b/src/layer/mips/convolution_sgemm_pack4to1.h
@@ -550,7 +550,7 @@ static void im2col_sgemm_pack4to1_msa(const Mat& bottom_im2col, Mat& top_blob, c
                kptr0 += 4;
            }

            sum0 += __msa_fhadd_w(_sum0);
            sum0 += __msa_reduce_fadd_w(_sum0);

            outptr0[0] = sum0;

--- a/src/layer/mips/convolution_sgemm_pack8to1_int8.h
+++ b/src/layer/mips/convolution_sgemm_pack8to1_int8.h
@@ -0,0 +1,450 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // int8 GEMM over an im2col buffer whose elements pack 8 input channels (8 bytes each),
 // producing one int32 per output channel and output position.
 //
 // bottom_im2col  w = size (output positions), h = maxk (kernel taps), c = inch (groups of 8)
 // top_blob       one int32 channel per output channel
 // kernel         groups of 4 output channels are read from kernel.channel(p / 4), 32 bytes per
 //                step; the remaining output channels from kernel.channel(p / 4 + p % 4), 8 bytes
 //                per step
 // opt            supplies the workspace allocator and the OpenMP thread count
 static void im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // tmp holds one channel per pair of output positions, plus one more for an odd tail position
    Mat tmp;
    if (size >= 2)
        tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
    {
        int remain_size_start = 0;
        int nn_size = (size - remain_size_start) >> 1;

        // paired positions: positions i and i + 1 go into tmp.channel(i / 2)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            int64_t* tmpptr = tmp.channel(i / 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // one 16-byte vector copies the two adjacent 8-byte elements (positions i, i + 1)
                    v16i8 _v = __msa_ld_b(img0, 0);
                    __msa_st_b(_v, tmpptr, 0);
                    tmpptr += 2;
                    // advance to the next kernel tap (one row of `size` elements)
                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        // odd tail position (at most one): stored in the channel after the paired ones
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            int64_t* tmpptr = tmp.channel(i / 2 + i % 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr += 1;
                    img0 += size;
                }
            }
        }
    }

    int nn_outch = 0;
    int remain_outch_start = 0;

    // number of full groups of 4 output channels
    nn_outch = outch >> 2;

    // gemm, 4 output channels at a time
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        int* outptr0 = top_blob.channel(p);
        int* outptr1 = top_blob.channel(p + 1);
        int* outptr2 = top_blob.channel(p + 2);
        int* outptr3 = top_blob.channel(p + 3);

        int i = 0;
        // two output positions per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn = inch * maxk; // inch always > 0

            // _sum0x accumulate position i, _sum1x position i + 1; x selects the weight vector _wx
            v4i32 _sum00 = __msa_fill_w(0);
            v4i32 _sum01 = __msa_fill_w(0);
            v4i32 _sum02 = __msa_fill_w(0);
            v4i32 _sum03 = __msa_fill_w(0);
            v4i32 _sum10 = __msa_fill_w(0);
            v4i32 _sum11 = __msa_fill_w(0);
            v4i32 _sum12 = __msa_fill_w(0);
            v4i32 _sum13 = __msa_fill_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                // 16 input bytes = 8 channels for position i, then 8 channels for position i + 1;
                // widened to int16 using the (x < 0) mask as the sign byte
                v16i8 _val01 = __msa_ld_b(tmpptr, 0);
                v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
                v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
                v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);

                // 32 weight bytes widened into four vectors of 8 int16
                v16i8 _w01 = __msa_ld_b(kptr, 0);
                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

                v8i16 _s00 = __msa_mulv_h(_val0, _w0);
                v8i16 _s01 = __msa_mulv_h(_val0, _w1);
                v8i16 _s02 = __msa_mulv_h(_val0, _w2);
                v8i16 _s03 = __msa_mulv_h(_val0, _w3);
                v8i16 _s10 = __msa_mulv_h(_val1, _w0);
                v8i16 _s11 = __msa_mulv_h(_val1, _w1);
                v8i16 _s12 = __msa_mulv_h(_val1, _w2);
                v8i16 _s13 = __msa_mulv_h(_val1, _w3);

                // hadd_s_w(x, x) adds adjacent int16 pairs into 4 int32 lanes
                _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00));
                _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01));
                _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02));
                _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03));
                _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10));
                _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11));
                _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12));
                _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13));

                tmpptr += 16;
                kptr += 32;
            }

            // transpose 4x4
            // after the transpose, adding the four vectors sums the 4 partial lanes of
            // each accumulator into a single lane of the result
            {
                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __msa_ilvr_w(_sum01, _sum00);
                _tmp1 = __msa_ilvr_w(_sum03, _sum02);
                _tmp2 = __msa_ilvl_w(_sum01, _sum00);
                _tmp3 = __msa_ilvl_w(_sum03, _sum02);
                _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
            }
            {
                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __msa_ilvr_w(_sum11, _sum10);
                _tmp1 = __msa_ilvr_w(_sum13, _sum12);
                _tmp2 = __msa_ilvl_w(_sum11, _sum10);
                _tmp3 = __msa_ilvl_w(_sum13, _sum12);
                _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
            }

            _sum00 = __msa_addv_w(_sum00, _sum01);
            _sum02 = __msa_addv_w(_sum02, _sum03);
            _sum10 = __msa_addv_w(_sum10, _sum11);
            _sum12 = __msa_addv_w(_sum12, _sum13);

            _sum00 = __msa_addv_w(_sum00, _sum02);
            _sum10 = __msa_addv_w(_sum10, _sum12);

            // spill to a scalar array so the lanes can be scattered to 4 separate output channels
            int sum[8];
            __msa_st_w(_sum00, sum, 0);
            __msa_st_w(_sum10, sum + 4, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0[1] = sum[4];
            outptr1[1] = sum[5];
            outptr2[1] = sum[6];
            outptr3[1] = sum[7];
            outptr0 += 2;
            outptr1 += 2;
            outptr2 += 2;
            outptr3 += 2;
        }
        // odd tail position
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn = inch * maxk; // inch always > 0

            v4i32 _sum0 = __msa_fill_w(0);
            v4i32 _sum1 = __msa_fill_w(0);
            v4i32 _sum2 = __msa_fill_w(0);
            v4i32 _sum3 = __msa_fill_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                // NOTE(review): the load fetches 16 bytes while tmpptr advances by 8, so the
                // last iteration reads past the bytes written above - confirm the buffer is padded
                v16i8 _val = __msa_ld_b(tmpptr, 0);
                v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                v16i8 _w01 = __msa_ld_b(kptr, 0);
                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

                v8i16 _s0 = __msa_mulv_h(_val16, _w0);
                v8i16 _s1 = __msa_mulv_h(_val16, _w1);
                v8i16 _s2 = __msa_mulv_h(_val16, _w2);
                v8i16 _s3 = __msa_mulv_h(_val16, _w3);

                _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
                _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
                _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
                _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

                tmpptr += 8;
                kptr += 32;
            }

            // transpose 4x4
            {
                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __msa_ilvr_w(_sum1, _sum0);
                _tmp1 = __msa_ilvr_w(_sum3, _sum2);
                _tmp2 = __msa_ilvl_w(_sum1, _sum0);
                _tmp3 = __msa_ilvl_w(_sum3, _sum2);
                _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
            }

            _sum0 = __msa_addv_w(_sum0, _sum1);
            _sum2 = __msa_addv_w(_sum2, _sum3);

            _sum0 = __msa_addv_w(_sum0, _sum2);

            int sum[4];
            __msa_st_w(_sum0, sum, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0 += 1;
            outptr1 += 1;
            outptr2 += 1;
            outptr3 += 1;
        }
    }

    remain_outch_start += nn_outch << 2;

    // gemm for the output channels left over after the groups of 4, one at a time
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        // two output positions per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk; // inch always > 0

            v4i32 _sum0 = __msa_fill_w(0);
            v4i32 _sum1 = __msa_fill_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                v16i8 _val01 = __msa_ld_b(tmpptr, 0);
                v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
                v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
                v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);

                // only the first 8 weight bytes are consumed (kptr advances by 8)
                v16i8 _w = __msa_ld_b(kptr, 0);
                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                v8i16 _s0 = __msa_mulv_h(_val0, _w16);
                v8i16 _s1 = __msa_mulv_h(_val1, _w16);

                _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
                _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));

                tmpptr += 16;
                kptr += 8;
            }

            // NOTE(review): __msa_reduce_add_w is defined outside this chunk; presumably it
            // returns the sum of the 4 lanes - confirm against its definition
            outptr0[0] = __msa_reduce_add_w(_sum0);
            outptr0[1] = __msa_reduce_add_w(_sum1);
            outptr0 += 2;
        }
        // odd tail position
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk; // inch always > 0

            v4i32 _sum = __msa_fill_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                // both loads fetch 16 bytes; only the first 8 of each are consumed
                v16i8 _val = __msa_ld_b(tmpptr, 0);
                v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                v16i8 _w = __msa_ld_b(kptr, 0);
                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

                v8i16 _s0 = __msa_mulv_h(_val16, _w16);

                _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0));

                tmpptr += 8;
                kptr += 8;
            }

            outptr0[0] = __msa_reduce_add_w(_sum);
            outptr0 += 1;
        }
    }
 }

 // Repack int8 convolution weights into the interleaved order that
 // im2col_sgemm_pack8to1_int8_msa reads sequentially.
 //
 // _kernel    source weights, viewed as maxk-inch-outch
 // kernel_tm  destination: one channel per full group of 4 output channels,
 //            followed by one channel per leftover output channel
 // Input channels beyond the last full group of 8 are not packed.
 static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    const int maxk = kernel_w * kernel_h;

    // source layout      = maxk-inch-outch
    // destination layout = 8a-4b-maxk-inch/8a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);

    const bool has_outch_quad = outch >= 4;
    const int packed_w = has_outch_quad ? 32 * maxk : 8 * maxk;
    const int packed_c = has_outch_quad ? outch / 4 + outch % 4 : outch;
    kernel_tm.create(packed_w, inch / 8, packed_c, (size_t)1u);

    const int inch_octets = inch / 8;
    const int outch_quads = outch / 4;

    // full groups of 4 output channels: 32 bytes per tap,
    // output channel is the slow index, input channel the fast one
    for (int qq = 0; qq < outch_quads; qq++)
    {
        const int q = qq * 4;
        signed char* dst = kernel_tm.channel(qq);

        for (int pp = 0; pp < inch_octets; pp++)
        {
            const int p = pp * 8;

            for (int k = 0; k < maxk; k++)
            {
                for (int ij = 0; ij < 32; ij++)
                {
                    const signed char* src = kernel.channel(q + ij / 8).row<const signed char>(p + ij % 8);

                    *dst++ = src[k];
                }
            }
        }
    }

    // TODO unroll 2
    // leftover output channels: 8 bytes per tap, each in its own destination channel
    for (int q = outch_quads * 4; q < outch; q++)
    {
        signed char* dst = kernel_tm.channel(q / 4 + q % 4);

        for (int pp = 0; pp < inch_octets; pp++)
        {
            const int p = pp * 8;

            for (int k = 0; k < maxk; k++)
            {
                for (int ic = 0; ic < 8; ic++)
                {
                    const signed char* src = kernel.channel(q).row<const signed char>(p + ic);

                    *dst++ = src[k];
                }
            }
        }
    }
 }

 // int8 convolution (elempack 8 input) via im2col + GEMM.
 // Expands bottom_blob into an im2col buffer of shape (outw * outh, kernel_w * kernel_h, inch)
 // with 8-byte elements, then hands it to im2col_sgemm_pack8to1_int8_msa.
 static void convolution_im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int size = outw * outh;
    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
    {
        // distance (in 8-byte elements) from the end of one sampled output row to the next
        const int gap = w * stride_h - outw * stride_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            int64_t* dst = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    // first input element touched by kernel tap (u, v)
                    const int64_t* src = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

                    for (int i = 0; i < outh; i++)
                    {
                        // each 8-byte element (8 packed int8 channels) is copied as one int64
                        for (int remaining = outw; remaining > 0; remaining--)
                        {
                            *dst++ = src[0];

                            src += stride_w;
                        }

                        src += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/mips/convolution_sgemm_pack8to4_int8.h
+++ b/src/layer/mips/convolution_sgemm_pack8to4_int8.h
@@ -0,0 +1,320 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // int8 GEMM stage of the im2col convolution: multiplies 8-byte input groups with
 // 32-byte weight groups and writes 4 int32 sums per output position.
 //
 // bottom_im2col  im2col'd input; w = number of output positions (size), h = maxk,
 //                c = inch, read as 8-byte (int64_t) elements
 // top_blob       destination; for every channel p, 4 int32 values are stored per
 //                output position, in position order
 // kernel         weights; 32 bytes of kernel.channel(p) are consumed per
 //                (input channel, kernel tap) step
 //                NOTE(review): presumably the layout produced by
 //                convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa - confirm at the call site
 // opt            supplies the workspace allocator and the OpenMP thread count
 static void im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // Regroup the input so that the data for two neighbouring output positions is
    // contiguous: tmp channel i/2 holds positions i and i+1 side by side for every
    // (input channel, kernel tap). A leftover odd position gets a channel of its own.
    Mat tmp;
    if (size >= 2)
        tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
    {
        int remain_size_start = 0;
        int nn_size = size >> 1;

        // pairs of output positions
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            int64_t* tmpptr = tmp.channel(i / 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // one 16-byte vector copy moves the two adjacent 8-byte elements (positions i and i+1)
                    v16i8 _v = __msa_ld_b(img0, 0);
                    __msa_st_b(_v, tmpptr, 0);
                    tmpptr += 2;
                    img0 += size; // next kernel tap row
                }
            }
        }

        remain_size_start += nn_size << 1;

        // leftover single output position (at most one, when size is odd)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            int64_t* tmpptr = tmp.channel(i / 2 + i % 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr += 1;
                    img0 += size;
                }
            }
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        // two output positions per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p);

            int nn = inch * maxk; // inch always > 0

            // _sumXY accumulates partial sums for position X (0/1) and output lane Y (0..3)
            v4i32 _sum00 = __msa_fill_w(0);
            v4i32 _sum01 = __msa_fill_w(0);
            v4i32 _sum02 = __msa_fill_w(0);
            v4i32 _sum03 = __msa_fill_w(0);
            v4i32 _sum10 = __msa_fill_w(0);
            v4i32 _sum11 = __msa_fill_w(0);
            v4i32 _sum12 = __msa_fill_w(0);
            v4i32 _sum13 = __msa_fill_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                // 16 input bytes: 8 for position i (low half) and 8 for position i+1 (high half).
                // clti_s_b(x, 0) gives an all-ones byte for each negative lane; interleaving it
                // above the original byte sign-extends int8 to int16.
                v16i8 _val01 = __msa_ld_b(tmpptr, 0);
                v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
                v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
                v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);

                // 32 weight bytes, widened the same way into four vectors of 8 int16 (_w0.._w3)
                v16i8 _w01 = __msa_ld_b(kptr, 0);
                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

                // 16-bit products; an int8 x int8 product has magnitude <= 16384 and fits in int16
                v8i16 _s00 = __msa_mulv_h(_val0, _w0);
                v8i16 _s01 = __msa_mulv_h(_val0, _w1);
                v8i16 _s02 = __msa_mulv_h(_val0, _w2);
                v8i16 _s03 = __msa_mulv_h(_val0, _w3);
                v8i16 _s10 = __msa_mulv_h(_val1, _w0);
                v8i16 _s11 = __msa_mulv_h(_val1, _w1);
                v8i16 _s12 = __msa_mulv_h(_val1, _w2);
                v8i16 _s13 = __msa_mulv_h(_val1, _w3);

                // hadd_s_w(x, x) adds each adjacent pair of int16 products into one int32 lane
                _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00));
                _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01));
                _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02));
                _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03));
                _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10));
                _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11));
                _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12));
                _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13));

                tmpptr += 16;
                kptr += 32;
            }

            // transpose 4x4
            // After the transpose, lane Y of every row comes from the original _sum0Y, so the
            // row additions below leave lane Y = sum of all four lanes of the original _sum0Y.
            {
                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __msa_ilvr_w(_sum01, _sum00);
                _tmp1 = __msa_ilvr_w(_sum03, _sum02);
                _tmp2 = __msa_ilvl_w(_sum01, _sum00);
                _tmp3 = __msa_ilvl_w(_sum03, _sum02);
                _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
            }
            // same transpose for the second position
            {
                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __msa_ilvr_w(_sum11, _sum10);
                _tmp1 = __msa_ilvr_w(_sum13, _sum12);
                _tmp2 = __msa_ilvl_w(_sum11, _sum10);
                _tmp3 = __msa_ilvl_w(_sum13, _sum12);
                _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
            }

            // reduce the four transposed rows of each position into a single vector
            _sum00 = __msa_addv_w(_sum00, _sum01);
            _sum02 = __msa_addv_w(_sum02, _sum03);
            _sum10 = __msa_addv_w(_sum10, _sum11);
            _sum12 = __msa_addv_w(_sum12, _sum13);

            _sum00 = __msa_addv_w(_sum00, _sum02);
            _sum10 = __msa_addv_w(_sum10, _sum12);

            // 4 int32 results for position i, followed by 4 for position i+1
            __msa_st_w(_sum00, outptr0, 0);
            __msa_st_w(_sum10, outptr0 + 4, 0);
            outptr0 += 8;
        }
        // leftover single output position
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p);

            int nn = inch * maxk; // inch always > 0

            // _sumY accumulates partial sums for output lane Y (0..3)
            v4i32 _sum0 = __msa_fill_w(0);
            v4i32 _sum1 = __msa_fill_w(0);
            v4i32 _sum2 = __msa_fill_w(0);
            v4i32 _sum3 = __msa_fill_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                // A full 16-byte vector is loaded, but only its low 8 bytes are used (ilvr_b)
                // and tmpptr advances by 8 per step.
                // NOTE(review): the last iteration therefore reads 8 bytes beyond the final
                // element that was written - presumably covered by the Mat allocation; confirm.
                v16i8 _val = __msa_ld_b(tmpptr, 0);
                v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                // 32 weight bytes sign-extended into four vectors of 8 int16
                v16i8 _w01 = __msa_ld_b(kptr, 0);
                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

                v8i16 _s0 = __msa_mulv_h(_val16, _w0);
                v8i16 _s1 = __msa_mulv_h(_val16, _w1);
                v8i16 _s2 = __msa_mulv_h(_val16, _w2);
                v8i16 _s3 = __msa_mulv_h(_val16, _w3);

                // pairwise widen-and-add of the int16 products, accumulated as int32
                _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
                _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
                _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
                _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

                tmpptr += 8;
                kptr += 32;
            }

            // transpose 4x4
            // afterwards lane Y of every row originates from _sumY, so adding the rows
            // gives lane Y = sum of all four lanes of the original _sumY
            {
                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __msa_ilvr_w(_sum1, _sum0);
                _tmp1 = __msa_ilvr_w(_sum3, _sum2);
                _tmp2 = __msa_ilvl_w(_sum1, _sum0);
                _tmp3 = __msa_ilvl_w(_sum3, _sum2);
                _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
            }

            _sum0 = __msa_addv_w(_sum0, _sum1);
            _sum2 = __msa_addv_w(_sum2, _sum3);

            _sum0 = __msa_addv_w(_sum0, _sum2);

            __msa_st_w(_sum0, outptr0, 0);
            outptr0 += 4;
        }
    }
 }

 static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    const int maxk = kernel_w * kernel_h;

    // interleave
    // src = maxk-inch-outch
    // dst = 8a-4b-maxk-inch/8a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);
    kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        signed char* g00 = kernel_tm.channel(q / 4);

        for (int p = 0; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 8; j++)
                    {
                        const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

                        g00[0] = k00[k];

                        g00++;
                    }
                }
            }
        }
    }
 }

 static void convolution_im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
    {
        const int gap = w * stride_h - outw * stride_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            int64_t* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += stride_w;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/mips/deconvolution_pack4to1.h
+++ b/src/layer/mips/deconvolution_pack4to1.h
@@ -88,7 +88,7 @@ static void deconvolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, co
                    kptr += maxk * 4;
                }

                sum += __msa_fhadd_w(_sum);
                sum += __msa_reduce_fadd_w(_sum);

                sum = activation_ss(sum, activation_type, activation_params);

--- a/src/layer/mips/innerproduct_mips.cpp
+++ b/src/layer/mips/innerproduct_mips.cpp
@@ -300,10 +300,10 @@ int InnerProduct_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
        }

 #if __mips_msa
        sum0 += __msa_fhadd_w(_sum0);
        sum1 += __msa_fhadd_w(_sum1);
        sum2 += __msa_fhadd_w(_sum2);
        sum3 += __msa_fhadd_w(_sum3);
        sum0 += __msa_reduce_fadd_w(_sum0);
        sum1 += __msa_reduce_fadd_w(_sum1);
        sum2 += __msa_reduce_fadd_w(_sum2);
        sum3 += __msa_reduce_fadd_w(_sum3);
 #endif // __mips_msa

        sum0 = activation_ss(sum0, activation_type, activation_params);
--- a/src/layer/mips/mips_usability.h
+++ b/src/layer/mips/mips_usability.h
@@ -38,19 +38,25 @@ typedef union
    static const ncnn::FloatInt Name = {.f = Val}

 /* float type data load instructions */
 static inline v4f32 __msa_fill_w_f32(float val)
 // Returns a v4f32 with every lane set to val: the float is written to the .f member of
 // ncnn::FloatInt, read back through .i, and replicated with the integer __msa_fill_w.
 static NCNN_FORCEINLINE v4f32 __msa_fill_w_f32(float val)
 {
    ncnn::FloatInt fi_tmpval = {.f = val};
    return (v4f32)__msa_fill_w(fi_tmpval.i);
 }

 static inline float __msa_fhadd_w(v4f32 _v)
 // Returns the sum of the four float lanes of _v, added in lane order 0, 1, 2, 3.
 static NCNN_FORCEINLINE float __msa_reduce_fadd_w(v4f32 _v)
 {
    // TODO find a more efficient way
    return _v[0] + _v[1] + _v[2] + _v[3];
 }

 static inline int __msa_cfcmsa_msacsr()
 // Returns the sum of the four int32 lanes of _v as a plain int.
 static NCNN_FORCEINLINE int __msa_reduce_add_w(v4i32 _v)
 {
    // TODO find a more efficient way
    return _v[0] + _v[1] + _v[2] + _v[3];
 }

 static NCNN_FORCEINLINE int __msa_cfcmsa_msacsr()
 {
    int v;
    asm volatile("cfcmsa %0, $1 \n"
@@ -60,7 +66,7 @@ static inline int __msa_cfcmsa_msacsr()
    return v;
 }

 static inline void __msa_ctcmsa_msacsr(int v)
 static NCNN_FORCEINLINE void __msa_ctcmsa_msacsr(int v)
 {
    asm volatile("ctcmsa $1, %0 \n"
                 :
@@ -69,7 +75,7 @@ static inline void __msa_ctcmsa_msacsr(int v)
 }
 #endif // __mips_msa

 static inline signed char float2int8(float v)
 static NCNN_FORCEINLINE signed char float2int8(float v)
 {
    int int32 = round(v);
    if (int32 > 127) return 127;
@@ -78,7 +84,7 @@ static inline signed char float2int8(float v)
 }

 #if __mips_msa
 static inline v16i8 float2int8(v4f32 _v)
 static NCNN_FORCEINLINE v16i8 float2int8(v4f32 _v)
 {
    // simulate round to nearest via +/-0.5
    v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -98,7 +104,7 @@ static inline v16i8 float2int8(v4f32 _v)
    return _v8;
 }

 static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
 static NCNN_FORCEINLINE int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
 {
    // simulate round to nearest via +/-0.5
    v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -123,7 +129,7 @@ static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
    return _v8[0];
 }

 static inline v16i8 float2int8relu(v4f32 _v)
 static NCNN_FORCEINLINE v16i8 float2int8relu(v4f32 _v)
 {
    // simulate round to nearest via +/-0.5
    v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -143,7 +149,7 @@ static inline v16i8 float2int8relu(v4f32 _v)
    return _v8;
 }

 static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
 static NCNN_FORCEINLINE int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
 {
    // simulate round to nearest via +/-0.5
    v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -168,7 +174,7 @@ static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
    return _v8[0];
 }

 static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
 static NCNN_FORCEINLINE v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
 {
    v4f32 _v_leaky = __msa_fmul_w(_v, _slope);

@@ -199,7 +205,7 @@ static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
    return _v8;
 }

 static inline int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope)
 static NCNN_FORCEINLINE int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope)
 {
    v4f32 _vlow_leaky = __msa_fmul_w(_vlow, _slope);
    v4f32 _vhigh_leaky = __msa_fmul_w(_vhigh, _slope);