From c09d7b359116c02891ef0bb678e596d67cb0d363 Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Tue, 5 Apr 2022 12:25:25 +0800
Subject: [PATCH] mips msa optimization for convolution int8 (#3675)

* basic mips msa optimization for convolution int8

* mips msa optimization for convolution int8 gemm

* mips msa optimization for convolution int8 winograd pack8to4/pack8to1

* mention msa maddv/msubv intrinsics bug
---
 docs/how-to-build/how-to-build.md             |  30 +-
 src/layer/mips/convolution1d_mips.cpp         |   2 +-
 src/layer/mips/convolution_1x1_int8.h         |  83 ++
 .../mips/convolution_1x1_pack1to4_int8.h      |  83 ++
 .../mips/convolution_1x1_pack8to1_int8.h      |  65 ++
 .../mips/convolution_1x1_pack8to4_int8.h      |  65 ++
 .../mips/convolution_3x3_pack8to1_int8.h      | 731 ++++++++++++++++++
 .../mips/convolution_3x3_pack8to4_int8.h      | 629 +++++++++++++++
 src/layer/mips/convolution_int8.h             |  82 ++
 src/layer/mips/convolution_mips.cpp           | 461 ++++++++++-
 src/layer/mips/convolution_mips.h             |  19 +-
 src/layer/mips/convolution_pack1to4_int8.h    |  87 +++
 src/layer/mips/convolution_pack4to1.h         |   2 +-
 src/layer/mips/convolution_pack8to1_int8.h    |  87 +++
 src/layer/mips/convolution_pack8to4_int8.h    | 120 +++
 src/layer/mips/convolution_sgemm_int8.h       | 731 ++++++++++++++++++
 .../mips/convolution_sgemm_pack1to4_int8.h    | 477 ++++++++++++
 src/layer/mips/convolution_sgemm_pack4to1.h   |   2 +-
 .../mips/convolution_sgemm_pack8to1_int8.h    | 450 +++++++++++
 .../mips/convolution_sgemm_pack8to4_int8.h    | 320 ++++++++
 src/layer/mips/deconvolution_pack4to1.h       |   2 +-
 src/layer/mips/innerproduct_mips.cpp          |   8 +-
 src/layer/mips/mips_usability.h               |  28 +-
 23 files changed, 4516 insertions(+), 48 deletions(-)
 create mode 100644 src/layer/mips/convolution_1x1_int8.h
 create mode 100644 src/layer/mips/convolution_1x1_pack1to4_int8.h
 create mode 100644 src/layer/mips/convolution_1x1_pack8to1_int8.h
 create mode 100644 src/layer/mips/convolution_1x1_pack8to4_int8.h
 create mode 100644 src/layer/mips/convolution_3x3_pack8to1_int8.h
 create mode 100644 src/layer/mips/convolution_3x3_pack8to4_int8.h
 create mode 100644 src/layer/mips/convolution_int8.h
 create mode 100644 src/layer/mips/convolution_pack1to4_int8.h
 create mode 100644 src/layer/mips/convolution_pack8to1_int8.h
 create mode 100644 src/layer/mips/convolution_pack8to4_int8.h
 create mode 100644 src/layer/mips/convolution_sgemm_int8.h
 create mode 100644 src/layer/mips/convolution_sgemm_pack1to4_int8.h
 create mode 100644 src/layer/mips/convolution_sgemm_pack8to1_int8.h
 create mode 100644 src/layer/mips/convolution_sgemm_pack8to4_int8.h

diff --git a/docs/how-to-build/how-to-build.md b/docs/how-to-build/how-to-build.md
index 3df87e870..4d999aab0 100644
--- a/docs/how-to-build/how-to-build.md
+++ b/docs/how-to-build/how-to-build.md
@@ -578,12 +578,38 @@ You can upload binary inside `build-c906/examples` folder and run on D1 board fo
 
 ### Build for Loongson 2K1000
 
-For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd bug.
+For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd/fmsub/maddv/msubv bug.
 
-Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd_w``` and apply changes as the following
+Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub``` and apply changes as the following
 ```c
 // #define __msa_fmadd_w __builtin_msa_fmadd_w
+// #define __msa_fmadd_d __builtin_msa_fmadd_d
+// #define __msa_fmsub_w __builtin_msa_fmsub_w
+// #define __msa_fmsub_d __builtin_msa_fmsub_d
 #define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a)
+#define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a)
+#define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a)
+#define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a)
+```
+
+find ```__msa_maddv``` and ```__msa_msubv``` and apply changes as the following
+```c
+// #define __msa_maddv_b __builtin_msa_maddv_b
+// #define __msa_maddv_h __builtin_msa_maddv_h
+// #define __msa_maddv_w __builtin_msa_maddv_w
+// #define __msa_maddv_d __builtin_msa_maddv_d
+// #define __msa_msubv_b __builtin_msa_msubv_b
+// #define __msa_msubv_h __builtin_msa_msubv_h
+// #define __msa_msubv_w __builtin_msa_msubv_w
+// #define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a)
+#define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a)
+#define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a)
+#define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a)
+#define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a)
+#define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a)
+#define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a)
+#define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a)
 ```
 
 Build ncnn with mips msa and simpleocv enabled:
diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp
index 92889a8b7..fc61c9406 100644
--- a/src/layer/mips/convolution1d_mips.cpp
+++ b/src/layer/mips/convolution1d_mips.cpp
@@ -253,7 +253,7 @@ int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
                         }
                     }
 
-                    sum += __msa_fhadd_w(_sum);
+                    sum += __msa_reduce_fadd_w(_sum);
 
                     sum = activation_ss(sum, activation_type, activation_params);
 
diff --git a/src/layer/mips/convolution_1x1_int8.h b/src/layer/mips/convolution_1x1_int8.h
new file mode 100644
index 000000000..8730041f6
--- /dev/null
+++ b/src/layer/mips/convolution_1x1_int8.h
@@ -0,0 +1,83 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    const int size = w * h;
+
+    Mat bottom_im2col = bottom_blob;
+    bottom_im2col.w = size;
+    bottom_im2col.h = 1;
+
+    im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
+
+static void conv1x1s2_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int tailstep = w - 2 * outw + w;
+
+    Mat bottom_blob_shrinked;
+    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < channels; p++)
+    {
+        const signed char* r0 = bottom_blob.channel(p);
+        signed char* outptr = bottom_blob_shrinked.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            int j = 0;
+            for (; j + 3 < outw; j += 4)
+            {
+                outptr[0] = r0[0];
+                outptr[1] = r0[2];
+                outptr[2] = r0[4];
+                outptr[3] = r0[6];
+
+                r0 += 8;
+                outptr += 4;
+            }
+            for (; j + 1 < outw; j += 2)
+            {
+                outptr[0] = r0[0];
+                outptr[1] = r0[2];
+
+                r0 += 4;
+                outptr += 2;
+            }
+            for (; j < outw; j++)
+            {
+                outptr[0] = r0[0];
+
+                r0 += 2;
+                outptr += 1;
+            }
+
+            r0 += tailstep;
+        }
+    }
+
+    conv1x1s1_sgemm_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_1x1_pack1to4_int8.h b/src/layer/mips/convolution_1x1_pack1to4_int8.h
new file mode 100644
index 000000000..928e16336
--- /dev/null
+++ b/src/layer/mips/convolution_1x1_pack1to4_int8.h
@@ -0,0 +1,83 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    const int size = w * h;
+
+    Mat bottom_im2col = bottom_blob;
+    bottom_im2col.w = size;
+    bottom_im2col.h = 1;
+
+    im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
+
+static void conv1x1s2_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int tailstep = w - 2 * outw + w;
+
+    Mat bottom_blob_shrinked;
+    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < channels; p++)
+    {
+        const signed char* r0 = bottom_blob.channel(p);
+        signed char* outptr = bottom_blob_shrinked.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            int j = 0;
+            for (; j + 3 < outw; j += 4)
+            {
+                outptr[0] = r0[0];
+                outptr[1] = r0[2];
+                outptr[2] = r0[4];
+                outptr[3] = r0[6];
+
+                r0 += 8;
+                outptr += 4;
+            }
+            for (; j + 1 < outw; j += 2)
+            {
+                outptr[0] = r0[0];
+                outptr[1] = r0[2];
+
+                r0 += 4;
+                outptr += 2;
+            }
+            for (; j < outw; j++)
+            {
+                outptr[0] = r0[0];
+
+                r0 += 2;
+                outptr += 1;
+            }
+
+            r0 += tailstep;
+        }
+    }
+
+    conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_1x1_pack8to1_int8.h b/src/layer/mips/convolution_1x1_pack8to1_int8.h
new file mode 100644
index 000000000..398f85e23
--- /dev/null
+++ b/src/layer/mips/convolution_1x1_pack8to1_int8.h
@@ -0,0 +1,65 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    const int size = w * h;
+
+    Mat bottom_im2col = bottom_blob;
+    bottom_im2col.w = size;
+    bottom_im2col.h = 1;
+
+    im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
+
+static void conv1x1s2_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int tailstep = w - 2 * outw + w;
+
+    Mat bottom_blob_shrinked;
+    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < channels; p++)
+    {
+        const int64_t* r0 = bottom_blob.channel(p);
+        int64_t* outptr = bottom_blob_shrinked.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            int j = 0;
+            for (; j < outw; j++)
+            {
+                outptr[0] = r0[0];
+
+                r0 += 2;
+                outptr += 1;
+            }
+
+            r0 += tailstep;
+        }
+    }
+
+    conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_1x1_pack8to4_int8.h b/src/layer/mips/convolution_1x1_pack8to4_int8.h
new file mode 100644
index 000000000..aa38542e4
--- /dev/null
+++ b/src/layer/mips/convolution_1x1_pack8to4_int8.h
@@ -0,0 +1,65 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv1x1s1_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    const int size = w * h;
+
+    Mat bottom_im2col = bottom_blob;
+    bottom_im2col.w = size;
+    bottom_im2col.h = 1;
+
+    im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
+
+static void conv1x1s2_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+
+    const int tailstep = w - 2 * outw + w;
+
+    Mat bottom_blob_shrinked;
+    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < channels; p++)
+    {
+        const int64_t* r0 = bottom_blob.channel(p);
+        int64_t* outptr = bottom_blob_shrinked.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            int j = 0;
+            for (; j < outw; j++)
+            {
+                outptr[0] = r0[0];
+
+                r0 += 2;
+                outptr += 1;
+            }
+
+            r0 += tailstep;
+        }
+    }
+
+    conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_3x3_pack8to1_int8.h b/src/layer/mips/convolution_3x3_pack8to1_int8.h
new file mode 100644
index 000000000..0f18f20bf
--- /dev/null
+++ b/src/layer/mips/convolution_3x3_pack8to1_int8.h
@@ -0,0 +1,731 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt)
+{
+    // winograd42 transform kernel
+    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);
+
+    const short ktm[6][3] = {
+        {6, 0, 0},
+        {-4, -4, -4},
+        {-4, 4, -4},
+        {1, 2, 4},
+        {1, -2, 4},
+        {0, 0, 6}
+    };
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        for (int q = 0; q < inch; q++)
+        {
+            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
+            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);
+
+            // transform kernel
+            const signed char* k0 = kernel0;
+            const signed char* k1 = kernel0 + 3;
+            const signed char* k2 = kernel0 + 6;
+
+            // h
+            short tmp[6][3];
+            for (int i = 0; i < 6; i++)
+            {
+                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+            }
+
+            // U
+            for (int j = 0; j < 6; j++)
+            {
+                short* tmpp = &tmp[j][0];
+
+                for (int i = 0; i < 6; i++)
+                {
+                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
+                }
+            }
+        }
+    }
+
+    // interleave
+    // src = 36-inch-outch
+    // dst = 4b-8a-inch/8a-36-outch/4b
+    kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);
+
+    int p = 0;
+    for (; p + 3 < outch; p += 4)
+    {
+        const Mat k0 = kernel_tm.channel(p);
+        const Mat k1 = kernel_tm.channel(p + 1);
+        const Mat k2 = kernel_tm.channel(p + 2);
+        const Mat k3 = kernel_tm.channel(p + 3);
+
+        Mat g0 = kernel_tm_pack8to1.channel(p / 4);
+
+        for (int k = 0; k < 36; k++)
+        {
+            short* g00 = g0.row<short>(k);
+
+            for (int q = 0; q + 7 < inch; q += 8)
+            {
+                for (int i = 0; i < 8; i++)
+                {
+                    g00[0] = k0.row<const short>(q + i)[k];
+                    g00[1] = k1.row<const short>(q + i)[k];
+                    g00[2] = k2.row<const short>(q + i)[k];
+                    g00[3] = k3.row<const short>(q + i)[k];
+
+                    g00 += 4;
+                }
+            }
+        }
+    }
+    for (; p < outch; p++)
+    {
+        const Mat k0 = kernel_tm.channel(p);
+
+        Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4);
+
+        for (int k = 0; k < 36; k++)
+        {
+            short* g00 = g0.row<short>(k);
+
+            for (int q = 0; q + 7 < inch; q += 8)
+            {
+                for (int i = 0; i < 8; i++)
+                {
+                    g00[0] = k0.row<const short>(q + i)[k];
+
+                    g00 += 1;
+                }
+            }
+        }
+    }
+}
+
+static void conv3x3s1_winograd42_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+    //     size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    // pad to 4n+2
+    Mat bottom_blob_bordered = bottom_blob;
+
+    outw = (outw + 3) / 4 * 4;
+    outh = (outh + 3) / 4 * 4;
+
+    w = outw + 2;
+    h = outh + 2;
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
+
+    // BEGIN transform input
+    Mat bottom_blob_tm;
+    {
+        int w_tm = outw / 4 * 6;
+        int h_tm = outh / 4 * 6;
+
+        const int tiles = w_tm / 6 * h_tm / 6;
+
+        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
+
+        // const float itm[4][4] = {
+        //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
+        //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
+        //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
+        //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
+        //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
+        //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
+        // };
+
+        // 0 =  4 * r00 - 5 * r02 + r04
+        // 1 = -4 * (r01 + r02) + r04 + r03
+        // 2 =  4 * (r01 - r02) + r04 - r03
+        // 3 = -2 * (r01 - r03) + r04 - r02
+        // 4 =  2 * (r01 - r03) + r04 - r02
+        // 5 =  4 * r01 - 5 * r03 + r05
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < inch; q++)
+        {
+            const Mat img0 = bottom_blob_bordered.channel(q);
+            Mat img0_tm = bottom_blob_tm.channel(q);
+
+            short tmp[6][6][8];
+
+            // tile
+            for (int i = 0; i < h_tm / 6; i++)
+            {
+                for (int j = 0; j < w_tm / 6; j++)
+                {
+                    const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;
+
+                    for (int m = 0; m < 6; m++)
+                    {
+                        v16i8 _r00_01 = __msa_ld_b(r0, 0);
+                        v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0);
+                        v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0);
+                        v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0);
+                        v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0);
+                        v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0);
+                        v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01);
+                        v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01);
+                        v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03);
+                        v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03);
+                        v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05);
+                        v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05);
+
+                        v8i16 _v5 = __msa_fill_h(5);
+
+                        v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5));
+                        v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2));
+                        v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2));
+                        v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
+                        v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
+                        v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5));
+
+                        __msa_st_h(_tmp0m, tmp[0][m], 0);
+                        __msa_st_h(_tmp1m, tmp[1][m], 0);
+                        __msa_st_h(_tmp2m, tmp[2][m], 0);
+                        __msa_st_h(_tmp3m, tmp[3][m], 0);
+                        __msa_st_h(_tmp4m, tmp[4][m], 0);
+                        __msa_st_h(_tmp5m, tmp[5][m], 0);
+
+                        r0 += w * 8;
+                    }
+
+                    short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8;
+                    short* r0_tm_1 = r0_tm_0 + tiles * 8;
+                    short* r0_tm_2 = r0_tm_0 + tiles * 16;
+                    short* r0_tm_3 = r0_tm_0 + tiles * 24;
+                    short* r0_tm_4 = r0_tm_0 + tiles * 32;
+                    short* r0_tm_5 = r0_tm_0 + tiles * 40;
+
+                    for (int m = 0; m < 6; m++)
+                    {
+                        v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0);
+                        v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0);
+                        v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0);
+                        v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0);
+                        v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0);
+                        v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0);
+
+                        v8i16 _v5 = __msa_fill_h(5);
+
+                        v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5));
+                        v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2));
+                        v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2));
+                        v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
+                        v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
+                        v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5));
+
+                        __msa_st_h(_r0tm0, r0_tm_0, 0);
+                        __msa_st_h(_r0tm1, r0_tm_1, 0);
+                        __msa_st_h(_r0tm2, r0_tm_2, 0);
+                        __msa_st_h(_r0tm3, r0_tm_3, 0);
+                        __msa_st_h(_r0tm4, r0_tm_4, 0);
+                        __msa_st_h(_r0tm5, r0_tm_5, 0);
+
+                        r0_tm_0 += tiles * 48;
+                        r0_tm_1 += tiles * 48;
+                        r0_tm_2 += tiles * 48;
+                        r0_tm_3 += tiles * 48;
+                        r0_tm_4 += tiles * 48;
+                        r0_tm_5 += tiles * 48;
+                    }
+                }
+            }
+        }
+    }
+    bottom_blob_bordered = Mat();
+    // END transform input
+
+    // BEGIN dot
+    Mat top_blob_tm;
+    {
+        int w_tm = outw / 4 * 6;
+        int h_tm = outh / 4 * 6;
+
+        const int tiles = h_tm / 6 * w_tm / 6;
+
+        // permute
+        //         bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
+        Mat bottom_blob_tm2;
+        if (tiles >= 2)
+            bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator);
+        else // if (tiles >= 1)
+            bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int r = 0; r < 36; r++)
+        {
+            Mat tm2 = bottom_blob_tm2.channel(r);
+
+            // tile
+            int i = 0;
+            for (; i + 1 < tiles; i += 2)
+            {
+                short* tmpptr = tm2.row<short>(i / 2);
+
+                const short* r0 = bottom_blob_tm;
+
+                r0 += (r * tiles + i) * 8;
+
+                for (int q = 0; q < inch; q++)
+                {
+                    v8i16 _r0 = __msa_ld_h(r0, 0);
+                    v8i16 _r1 = __msa_ld_h(r0 + 8, 0);
+                    __msa_st_h(_r0, tmpptr, 0);
+                    __msa_st_h(_r1, tmpptr + 8, 0);
+                    r0 += bottom_blob_tm.cstep * 8;
+                    tmpptr += 16;
+                }
+            }
+            for (; i < tiles; i++)
+            {
+                short* tmpptr = tm2.row<short>(i / 2 + i % 2);
+
+                const short* r0 = bottom_blob_tm;
+
+                r0 += (r * tiles + i) * 8;
+
+                for (int q = 0; q < inch; q++)
+                {
+                    v8i16 _r0 = __msa_ld_h(r0, 0);
+                    __msa_st_h(_r0, tmpptr, 0);
+                    r0 += bottom_blob_tm.cstep * 8;
+                    tmpptr += 8;
+                }
+            }
+        }
+
+        bottom_blob_tm = Mat();
+        // permute end
+
+        top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator);
+
+        int nn_outch = 0;
+        int remain_outch_start = 0;
+
+        nn_outch = outch >> 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp = 0; pp < nn_outch; pp++)
+        {
+            int p = pp * 4;
+
+            int* output0_tm = top_blob_tm.channel(p);
+            int* output1_tm = top_blob_tm.channel(p + 1);
+            int* output2_tm = top_blob_tm.channel(p + 2);
+            int* output3_tm = top_blob_tm.channel(p + 3);
+
+            const Mat kernel0_tm = kernel_tm.channel(p / 4);
+
+            for (int r = 0; r < 36; r++)
+            {
+                const Mat bb2 = bottom_blob_tm2.channel(r);
+
+                int i = 0;
+                for (; i + 1 < tiles; i += 2)
+                {
+                    const short* r0 = bb2.row<const short>(i / 2);
+                    const short* k0 = kernel0_tm.row<const short>(r);
+
+                    int nn = inch; // inch always > 0
+
+                    v4i32 _sum0 = __msa_fill_w(0);
+                    v4i32 _sum1 = __msa_fill_w(0);
+                    v4i32 _sum2 = __msa_fill_w(0);
+                    v4i32 _sum3 = __msa_fill_w(0);
+
+                    for (int j = 0; j < nn; j++)
+                    {
+                        v8i16 _w0 = __msa_ld_h(k0, 0);
+                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
+                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
+                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);
+
+                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
+                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
+                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
+                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);
+
+                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
+                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
+                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
+                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
+                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
+                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
+                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
+                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);
+
+                        v4i32 _val0_0 = __msa_fill_w(r0[0]);
+                        v4i32 _val0_1 = __msa_fill_w(r0[1]);
+                        v4i32 _val0_2 = __msa_fill_w(r0[2]);
+                        v4i32 _val0_3 = __msa_fill_w(r0[3]);
+                        v4i32 _val0_4 = __msa_fill_w(r0[4]);
+                        v4i32 _val0_5 = __msa_fill_w(r0[5]);
+                        v4i32 _val0_6 = __msa_fill_w(r0[6]);
+                        v4i32 _val0_7 = __msa_fill_w(r0[7]);
+                        v4i32 _val1_0 = __msa_fill_w(r0[8]);
+                        v4i32 _val1_1 = __msa_fill_w(r0[9]);
+                        v4i32 _val1_2 = __msa_fill_w(r0[10]);
+                        v4i32 _val1_3 = __msa_fill_w(r0[11]);
+                        v4i32 _val1_4 = __msa_fill_w(r0[12]);
+                        v4i32 _val1_5 = __msa_fill_w(r0[13]);
+                        v4i32 _val1_6 = __msa_fill_w(r0[14]);
+                        v4i32 _val1_7 = __msa_fill_w(r0[15]);
+
+                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0);
+                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1);
+                        _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0);
+                        _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1);
+                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2);
+                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3);
+                        _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2);
+                        _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3);
+                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4);
+                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5);
+                        _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4);
+                        _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5);
+                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6);
+                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7);
+                        _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6);
+                        _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7);
+
+                        r0 += 16;
+                        k0 += 32;
+                    }
+
+                    _sum0 = __msa_addv_w(_sum0, _sum1);
+                    _sum2 = __msa_addv_w(_sum2, _sum3);
+
+                    int sum[8];
+                    __msa_st_w(_sum0, sum, 0);
+                    __msa_st_w(_sum2, sum + 4, 0);
+
+                    output0_tm[0] = sum[0];
+                    output1_tm[0] = sum[1];
+                    output2_tm[0] = sum[2];
+                    output3_tm[0] = sum[3];
+                    output0_tm[1] = sum[4];
+                    output1_tm[1] = sum[5];
+                    output2_tm[1] = sum[6];
+                    output3_tm[1] = sum[7];
+                    output0_tm += 2;
+                    output1_tm += 2;
+                    output2_tm += 2;
+                    output3_tm += 2;
+                }
+                for (; i < tiles; i++)
+                {
+                    const short* r0 = bb2.row<const short>(i / 2 + i % 2);
+                    const short* k0 = kernel0_tm.row<const short>(r);
+
+                    int nn = inch; // inch always > 0
+
+                    v4i32 _sum0 = __msa_fill_w(0);
+                    v4i32 _sum1 = __msa_fill_w(0);
+
+                    for (int j = 0; j < nn; j++)
+                    {
+                        v8i16 _w0 = __msa_ld_h(k0, 0);
+                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
+                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
+                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);
+
+                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
+                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
+                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
+                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);
+
+                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
+                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
+                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
+                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
+                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
+                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
+                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
+                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);
+
+                        v4i32 _val0 = __msa_fill_w(r0[0]);
+                        v4i32 _val1 = __msa_fill_w(r0[1]);
+                        v4i32 _val2 = __msa_fill_w(r0[2]);
+                        v4i32 _val3 = __msa_fill_w(r0[3]);
+                        v4i32 _val4 = __msa_fill_w(r0[4]);
+                        v4i32 _val5 = __msa_fill_w(r0[5]);
+                        v4i32 _val6 = __msa_fill_w(r0[6]);
+                        v4i32 _val7 = __msa_fill_w(r0[7]);
+
+                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0);
+                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val1);
+                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val2);
+                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val3);
+                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val4);
+                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val5);
+                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val6);
+                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val7);
+
+                        r0 += 8;
+                        k0 += 32;
+                    }
+
+                    _sum0 = __msa_addv_w(_sum0, _sum1);
+
+                    int sum[4];
+                    __msa_st_w(_sum0, sum, 0);
+
+                    output0_tm[0] = sum[0];
+                    output1_tm[0] = sum[1];
+                    output2_tm[0] = sum[2];
+                    output3_tm[0] = sum[3];
+                    output0_tm += 1;
+                    output1_tm += 1;
+                    output2_tm += 1;
+                    output3_tm += 1;
+                }
+            }
+        }
+
+        remain_outch_start += nn_outch << 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = remain_outch_start; p < outch; p++)
+        {
+            int* output0_tm = top_blob_tm.channel(p);
+
+            const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);
+
+            for (int r = 0; r < 36; r++)
+            {
+                const Mat bb2 = bottom_blob_tm2.channel(r);
+
+                int i = 0;
+                for (; i + 1 < tiles; i += 2)
+                {
+                    const short* r0 = bb2.row<const short>(i / 2);
+                    const short* k0 = kernel0_tm.row<const short>(r);
+
+                    v4i32 _sum0 = __msa_fill_w(0);
+                    v4i32 _sum1 = __msa_fill_w(0);
+                    v4i32 _sum2 = __msa_fill_w(0);
+                    v4i32 _sum3 = __msa_fill_w(0);
+
+                    for (int q = 0; q < inch; q++)
+                    {
+                        v8i16 _val0 = __msa_ld_h(r0, 0);
+                        v8i16 _val1 = __msa_ld_h(r0 + 8, 0);
+
+                        v8i16 _extval0 = __msa_clti_s_h(_val0, 0);
+                        v8i16 _extval1 = __msa_clti_s_h(_val1, 0);
+                        v4i32 _val0l = (v4i32)__msa_ilvr_h(_extval0, _val0);
+                        v4i32 _val0h = (v4i32)__msa_ilvl_h(_extval0, _val0);
+                        v4i32 _val1l = (v4i32)__msa_ilvr_h(_extval1, _val1);
+                        v4i32 _val1h = (v4i32)__msa_ilvl_h(_extval1, _val1);
+
+                        v8i16 _w0 = __msa_ld_h(k0, 0);
+
+                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
+                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
+                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
+
+                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0l);
+                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val0h);
+                        _sum2 = __msa_maddv_w(_sum2, _w0l, _val1l);
+                        _sum3 = __msa_maddv_w(_sum3, _w0h, _val1h);
+
+                        k0 += 8;
+                        r0 += 16;
+                    }
+
+                    _sum0 = __msa_addv_w(_sum0, _sum1);
+                    _sum2 = __msa_addv_w(_sum2, _sum3);
+
+                    output0_tm[0] = __msa_reduce_add_w(_sum0);
+                    output0_tm[1] = __msa_reduce_add_w(_sum2);
+                    output0_tm += 2;
+                }
+                for (; i < tiles; i++)
+                {
+                    const short* r0 = bb2.row<const short>(i / 2 + i % 2);
+                    const short* k0 = kernel0_tm.row<const short>(r);
+
+                    v4i32 _sum0 = __msa_fill_w(0);
+                    v4i32 _sum1 = __msa_fill_w(0);
+
+                    for (int q = 0; q < inch; q++)
+                    {
+                        v8i16 _val = __msa_ld_h(r0, 0);
+
+                        v8i16 _extval = __msa_clti_s_h(_val, 0);
+                        v4i32 _vall = (v4i32)__msa_ilvr_h(_extval, _val);
+                        v4i32 _valh = (v4i32)__msa_ilvl_h(_extval, _val);
+
+                        v8i16 _w0 = __msa_ld_h(k0, 0);
+
+                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
+                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
+                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
+
+                        _sum0 = __msa_maddv_w(_sum0, _w0l, _vall);
+                        _sum1 = __msa_maddv_w(_sum1, _w0h, _valh);
+
+                        k0 += 8;
+                        r0 += 8;
+                    }
+
+                    _sum0 = __msa_addv_w(_sum0, _sum1);
+
+                    output0_tm[0] = __msa_reduce_add_w(_sum0);
+                    output0_tm++;
+                }
+            }
+        }
+    }
+    bottom_blob_tm = Mat();
+    // END dot
+
+    // BEGIN transform output
+    Mat top_blob_bordered;
+    if (outw == top_blob.w && outh == top_blob.h)
+    {
+        top_blob_bordered = top_blob;
+    }
+    else
+    {
+        top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
+    }
+    {
+        // const float otm[4][6] = {
+        //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
+        //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
+        //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
+        //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
+        // };
+
+        // 0 = r00 + (r01 + r02) + (r03 + r04)
+        // 1 =       (r01 - r02) + (r03 - r04) * 2
+        // 2 =       (r01 + r02) + (r03 + r04) * 4
+        // 3 = r05 + (r01 - r02) + (r03 - r04) * 8
+
+        int w_tm = outw / 4 * 6;
+        int h_tm = outh / 4 * 6;
+        const int tiles = w_tm / 6 * h_tm / 6;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outch; p++)
+        {
+            const Mat out0_tm = top_blob_tm.channel(p);
+            Mat out0 = top_blob_bordered.channel(p);
+
+            int tmp[4][6];
+
+            // tile
+            for (int i = 0; i < outh / 4; i++)
+            {
+                for (int j = 0; j < outw / 4; j++)
+                {
+                    // top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator);
+
+                    const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 1;
+                    const int* output0_tm_1 = output0_tm_0 + tiles * 1;
+                    const int* output0_tm_2 = output0_tm_0 + tiles * 2;
+                    const int* output0_tm_3 = output0_tm_0 + tiles * 3;
+                    const int* output0_tm_4 = output0_tm_0 + tiles * 4;
+                    const int* output0_tm_5 = output0_tm_0 + tiles * 5;
+
+                    int* output0 = out0.row<int>(i * 4) + j * 4;
+
+                    // 0 = r00 + (r01 + r02) + (r03 + r04)
+                    // 1 =       (r01 - r02) + (r03 - r04) * 2
+                    // 2 =       (r01 + r02) + (r03 + r04) * 4
+                    // 3 = r05 + (r01 - r02) + (r03 - r04) * 8
+
+                    // TODO msa optimize
+                    for (int m = 0; m < 5; m++)
+                    {
+                        int tmp02a = output0_tm_1[0] + output0_tm_2[0];
+                        int tmp13a = output0_tm_1[0] - output0_tm_2[0];
+
+                        int tmp02b = output0_tm_3[0] + output0_tm_4[0];
+                        int tmp13b = output0_tm_3[0] - output0_tm_4[0];
+
+                        tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b;
+                        tmp[1][m] = tmp13a + tmp13b * 2;
+                        tmp[2][m] = tmp02a + tmp02b * 4;
+                        tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8;
+
+                        output0_tm_0 += tiles * 6;
+                        output0_tm_1 += tiles * 6;
+                        output0_tm_2 += tiles * 6;
+                        output0_tm_3 += tiles * 6;
+                        output0_tm_4 += tiles * 6;
+                        output0_tm_5 += tiles * 6;
+                    }
+                    for (int m = 5; m < 6; m++)
+                    {
+                        int tmp02a = output0_tm_1[0] + output0_tm_2[0];
+                        int tmp13a = output0_tm_1[0] - output0_tm_2[0];
+
+                        int tmp02b = output0_tm_3[0] + output0_tm_4[0];
+                        int tmp13b = output0_tm_3[0] - output0_tm_4[0];
+
+                        tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4;
+                        tmp[1][m] = (tmp13a + tmp13b * 2) * 4;
+                        tmp[2][m] = (tmp02a + tmp02b * 4) * 4;
+                        tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4;
+
+                        output0_tm_0 += tiles * 6;
+                        output0_tm_1 += tiles * 6;
+                        output0_tm_2 += tiles * 6;
+                        output0_tm_3 += tiles * 6;
+                        output0_tm_4 += tiles * 6;
+                        output0_tm_5 += tiles * 6;
+                    }
+
+                    for (int m = 0; m < 4; m++)
+                    {
+                        const int* tmp0 = tmp[m];
+
+                        int tmp02a = tmp0[1] + tmp0[2];
+                        int tmp13a = tmp0[1] - tmp0[2];
+
+                        int tmp02b = tmp0[3] + tmp0[4];
+                        int tmp13b = tmp0[3] - tmp0[4];
+
+                        output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576;
+                        output0[1] = (tmp13a + tmp13b * 2) / 576;
+                        output0[2] = (tmp02a + tmp02b * 4) / 576;
+                        output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576;
+
+                        output0 += outw;
+                    }
+                }
+            }
+        }
+    }
+    // END transform output
+
+    // cut result pad
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
+}
diff --git a/src/layer/mips/convolution_3x3_pack8to4_int8.h b/src/layer/mips/convolution_3x3_pack8to4_int8.h
new file mode 100644
index 000000000..9060f421c
--- /dev/null
+++ b/src/layer/mips/convolution_3x3_pack8to4_int8.h
@@ -0,0 +1,629 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
+{
+    // winograd42 transform kernel
+    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);
+
+    const short ktm[6][3] = {
+        {6, 0, 0},
+        {-4, -4, -4},
+        {-4, 4, -4},
+        {1, 2, 4},
+        {1, -2, 4},
+        {0, 0, 6}
+    };
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        for (int q = 0; q < inch; q++)
+        {
+            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
+            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);
+
+            // transform kernel
+            const signed char* k0 = kernel0;
+            const signed char* k1 = kernel0 + 3;
+            const signed char* k2 = kernel0 + 6;
+
+            // h
+            short tmp[6][3];
+            for (int i = 0; i < 6; i++)
+            {
+                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+            }
+
+            // U
+            for (int j = 0; j < 6; j++)
+            {
+                short* tmpp = &tmp[j][0];
+
+                for (int i = 0; i < 6; i++)
+                {
+                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
+                }
+            }
+        }
+    }
+
+    // interleave
+    // src = 36-inch-outch
+    // dst = 4b-8a-inch/8a-36-outch/4b
+    kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32);
+
+    int q = 0;
+    for (; q + 3 < outch; q += 4)
+    {
+        const Mat k0 = kernel_tm.channel(q);
+        const Mat k1 = kernel_tm.channel(q + 1);
+        const Mat k2 = kernel_tm.channel(q + 2);
+        const Mat k3 = kernel_tm.channel(q + 3);
+
+        Mat kernel_tm = kernel_tm_pack8.channel(q / 4);
+
+        for (int k = 0; k < 36; k++)
+        {
+            short* g00 = kernel_tm.row<short>(k);
+
+            for (int p = 0; p + 7 < inch; p += 8)
+            {
+                for (int i = 0; i < 8; i++)
+                {
+                    const short* k00 = k0.row<const short>(p + i);
+                    const short* k10 = k1.row<const short>(p + i);
+                    const short* k20 = k2.row<const short>(p + i);
+                    const short* k30 = k3.row<const short>(p + i);
+
+                    g00[0] = k00[k];
+                    g00[1] = k10[k];
+                    g00[2] = k20[k];
+                    g00[3] = k30[k];
+
+                    g00 += 4;
+                }
+            }
+        }
+    }
+}
+
+static void conv3x3s1_winograd42_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+    //     size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    // pad to 4n+2
+    Mat bottom_blob_bordered = bottom_blob;
+
+    outw = (outw + 3) / 4 * 4;
+    outh = (outh + 3) / 4 * 4;
+
+    w = outw + 2;
+    h = outh + 2;
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
+
+    // BEGIN transform input
+    Mat bottom_blob_tm;
+    {
+        int w_tm = outw / 4 * 6;
+        int h_tm = outh / 4 * 6;
+
+        const int tiles = w_tm / 6 * h_tm / 6;
+
+        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
+
+        // const float itm[4][4] = {
+        //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
+        //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
+        //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
+        //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
+        //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
+        //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
+        // };
+
+        // 0 =  4 * r00 - 5 * r02 + r04
+        // 1 = -4 * (r01 + r02) + r04 + r03
+        // 2 =  4 * (r01 - r02) + r04 - r03
+        // 3 = -2 * (r01 - r03) + r04 - r02
+        // 4 =  2 * (r01 - r03) + r04 - r02
+        // 5 =  4 * r01 - 5 * r03 + r05
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < inch; q++)
+        {
+            const Mat img0 = bottom_blob_bordered.channel(q);
+            Mat img0_tm = bottom_blob_tm.channel(q);
+
+            short tmp[6][6][8];
+
+            // tile
+            for (int i = 0; i < h_tm / 6; i++)
+            {
+                for (int j = 0; j < w_tm / 6; j++)
+                {
+                    const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;
+
+                    for (int m = 0; m < 6; m++)
+                    {
+                        v16i8 _r00_01 = __msa_ld_b(r0, 0);
+                        v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0);
+                        v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0);
+                        v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0);
+                        v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0);
+                        v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0);
+                        v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01);
+                        v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01);
+                        v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03);
+                        v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03);
+                        v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05);
+                        v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05);
+
+                        v8i16 _v5 = __msa_fill_h(5);
+
+                        v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5));
+                        v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2));
+                        v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2));
+                        v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
+                        v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
+                        v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5));
+
+                        __msa_st_h(_tmp0m, tmp[0][m], 0);
+                        __msa_st_h(_tmp1m, tmp[1][m], 0);
+                        __msa_st_h(_tmp2m, tmp[2][m], 0);
+                        __msa_st_h(_tmp3m, tmp[3][m], 0);
+                        __msa_st_h(_tmp4m, tmp[4][m], 0);
+                        __msa_st_h(_tmp5m, tmp[5][m], 0);
+
+                        r0 += w * 8;
+                    }
+
+                    short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8;
+                    short* r0_tm_1 = r0_tm_0 + tiles * 8;
+                    short* r0_tm_2 = r0_tm_0 + tiles * 16;
+                    short* r0_tm_3 = r0_tm_0 + tiles * 24;
+                    short* r0_tm_4 = r0_tm_0 + tiles * 32;
+                    short* r0_tm_5 = r0_tm_0 + tiles * 40;
+
+                    for (int m = 0; m < 6; m++)
+                    {
+                        v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0);
+                        v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0);
+                        v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0);
+                        v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0);
+                        v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0);
+                        v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0);
+
+                        v8i16 _v5 = __msa_fill_h(5);
+
+                        v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5));
+                        v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2));
+                        v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2));
+                        v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
+                        v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
+                        v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5));
+
+                        __msa_st_h(_r0tm0, r0_tm_0, 0);
+                        __msa_st_h(_r0tm1, r0_tm_1, 0);
+                        __msa_st_h(_r0tm2, r0_tm_2, 0);
+                        __msa_st_h(_r0tm3, r0_tm_3, 0);
+                        __msa_st_h(_r0tm4, r0_tm_4, 0);
+                        __msa_st_h(_r0tm5, r0_tm_5, 0);
+
+                        r0_tm_0 += tiles * 48;
+                        r0_tm_1 += tiles * 48;
+                        r0_tm_2 += tiles * 48;
+                        r0_tm_3 += tiles * 48;
+                        r0_tm_4 += tiles * 48;
+                        r0_tm_5 += tiles * 48;
+                    }
+                }
+            }
+        }
+    }
+    bottom_blob_bordered = Mat();
+    // END transform input
+
+    // BEGIN dot
+    Mat top_blob_tm;
+    {
+        int w_tm = outw / 4 * 6;
+        int h_tm = outh / 4 * 6;
+
+        const int tiles = h_tm / 6 * w_tm / 6;
+
+        // permute
+        //         bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
+        Mat bottom_blob_tm2;
+        if (tiles >= 2)
+            bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator);
+        else // if (tiles >= 1)
+            bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int r = 0; r < 36; r++)
+        {
+            Mat tm2 = bottom_blob_tm2.channel(r);
+
+            // tile
+            int i = 0;
+            for (; i + 1 < tiles; i += 2)
+            {
+                short* tmpptr = tm2.row<short>(i / 2);
+
+                const short* r0 = bottom_blob_tm;
+
+                r0 += (r * tiles + i) * 8;
+
+                for (int q = 0; q < inch; q++)
+                {
+                    v8i16 _r0 = __msa_ld_h(r0, 0);
+                    v8i16 _r1 = __msa_ld_h(r0 + 8, 0);
+                    __msa_st_h(_r0, tmpptr, 0);
+                    __msa_st_h(_r1, tmpptr + 8, 0);
+                    r0 += bottom_blob_tm.cstep * 8;
+                    tmpptr += 16;
+                }
+            }
+            for (; i < tiles; i++)
+            {
+                short* tmpptr = tm2.row<short>(i / 2 + i % 2);
+
+                const short* r0 = bottom_blob_tm;
+
+                r0 += (r * tiles + i) * 8;
+
+                for (int q = 0; q < inch; q++)
+                {
+                    v8i16 _r0 = __msa_ld_h(r0, 0);
+                    __msa_st_h(_r0, tmpptr, 0);
+                    r0 += bottom_blob_tm.cstep * 8;
+                    tmpptr += 8;
+                }
+            }
+        }
+
+        bottom_blob_tm = Mat();
+        // permute end
+
+        top_blob_tm.create(tiles, 36, outch, 4u * 4, 4, opt.workspace_allocator);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outch; p++)
+        {
+            int* output0_tm = top_blob_tm.channel(p);
+
+            const Mat kernel0_tm = kernel_tm.channel(p);
+
+            for (int r = 0; r < 36; r++)
+            {
+                const Mat bb2 = bottom_blob_tm2.channel(r);
+
+                int i = 0;
+                for (; i + 1 < tiles; i += 2)
+                {
+                    const short* r0 = bb2.row<const short>(i / 2);
+                    const short* k0 = kernel0_tm.row<const short>(r);
+
+                    int nn = inch; // inch always > 0
+
+                    v4i32 _sum0 = __msa_fill_w(0);
+                    v4i32 _sum1 = __msa_fill_w(0);
+                    v4i32 _sum2 = __msa_fill_w(0);
+                    v4i32 _sum3 = __msa_fill_w(0);
+
+                    for (int j = 0; j < nn; j++)
+                    {
+                        v8i16 _w0 = __msa_ld_h(k0, 0);
+                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
+                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
+                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);
+
+                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
+                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
+                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
+                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);
+
+                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
+                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
+                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
+                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
+                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
+                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
+                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
+                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);
+
+                        v4i32 _val0_0 = __msa_fill_w(r0[0]);
+                        v4i32 _val0_1 = __msa_fill_w(r0[1]);
+                        v4i32 _val0_2 = __msa_fill_w(r0[2]);
+                        v4i32 _val0_3 = __msa_fill_w(r0[3]);
+                        v4i32 _val0_4 = __msa_fill_w(r0[4]);
+                        v4i32 _val0_5 = __msa_fill_w(r0[5]);
+                        v4i32 _val0_6 = __msa_fill_w(r0[6]);
+                        v4i32 _val0_7 = __msa_fill_w(r0[7]);
+                        v4i32 _val1_0 = __msa_fill_w(r0[8]);
+                        v4i32 _val1_1 = __msa_fill_w(r0[9]);
+                        v4i32 _val1_2 = __msa_fill_w(r0[10]);
+                        v4i32 _val1_3 = __msa_fill_w(r0[11]);
+                        v4i32 _val1_4 = __msa_fill_w(r0[12]);
+                        v4i32 _val1_5 = __msa_fill_w(r0[13]);
+                        v4i32 _val1_6 = __msa_fill_w(r0[14]);
+                        v4i32 _val1_7 = __msa_fill_w(r0[15]);
+
+                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0);
+                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1);
+                        _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0);
+                        _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1);
+                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2);
+                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3);
+                        _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2);
+                        _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3);
+                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4);
+                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5);
+                        _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4);
+                        _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5);
+                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6);
+                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7);
+                        _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6);
+                        _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7);
+
+                        r0 += 16;
+                        k0 += 32;
+                    }
+
+                    _sum0 = __msa_addv_w(_sum0, _sum1);
+                    _sum2 = __msa_addv_w(_sum2, _sum3);
+
+                    __msa_st_w(_sum0, output0_tm, 0);
+                    __msa_st_w(_sum2, output0_tm + 4, 0);
+
+                    output0_tm += 8;
+                }
+                for (; i < tiles; i++)
+                {
+                    const short* r0 = bb2.row<const short>(i / 2 + i % 2);
+                    const short* k0 = kernel0_tm.row<const short>(r);
+
+                    int nn = inch; // inch always > 0
+
+                    v4i32 _sum0 = __msa_fill_w(0);
+                    v4i32 _sum1 = __msa_fill_w(0);
+
+                    for (int j = 0; j < nn; j++)
+                    {
+                        v8i16 _w0 = __msa_ld_h(k0, 0);
+                        v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
+                        v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
+                        v8i16 _w3 = __msa_ld_h(k0 + 24, 0);
+
+                        v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
+                        v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
+                        v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
+                        v8i16 _extw3 = __msa_clti_s_h(_w3, 0);
+
+                        v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
+                        v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
+                        v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
+                        v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
+                        v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
+                        v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
+                        v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
+                        v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);
+
+                        v4i32 _val0 = __msa_fill_w(r0[0]);
+                        v4i32 _val1 = __msa_fill_w(r0[1]);
+                        v4i32 _val2 = __msa_fill_w(r0[2]);
+                        v4i32 _val3 = __msa_fill_w(r0[3]);
+                        v4i32 _val4 = __msa_fill_w(r0[4]);
+                        v4i32 _val5 = __msa_fill_w(r0[5]);
+                        v4i32 _val6 = __msa_fill_w(r0[6]);
+                        v4i32 _val7 = __msa_fill_w(r0[7]);
+
+                        _sum0 = __msa_maddv_w(_sum0, _w0l, _val0);
+                        _sum1 = __msa_maddv_w(_sum1, _w0h, _val1);
+                        _sum0 = __msa_maddv_w(_sum0, _w1l, _val2);
+                        _sum1 = __msa_maddv_w(_sum1, _w1h, _val3);
+                        _sum0 = __msa_maddv_w(_sum0, _w2l, _val4);
+                        _sum1 = __msa_maddv_w(_sum1, _w2h, _val5);
+                        _sum0 = __msa_maddv_w(_sum0, _w3l, _val6);
+                        _sum1 = __msa_maddv_w(_sum1, _w3h, _val7);
+
+                        r0 += 8;
+                        k0 += 32;
+                    }
+
+                    _sum0 = __msa_addv_w(_sum0, _sum1);
+
+                    __msa_st_w(_sum0, output0_tm, 0);
+                    output0_tm += 4;
+                }
+            }
+        }
+    }
+    bottom_blob_tm = Mat();
+    // END dot
+
+    // BEGIN transform output
+    Mat top_blob_bordered;
+    if (outw == top_blob.w && outh == top_blob.h)
+    {
+        top_blob_bordered = top_blob;
+    }
+    else
+    {
+        top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator);
+    }
+    {
+        // const float otm[4][6] = {
+        //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
+        //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
+        //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
+        //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
+        // };
+
+        // 0 = r00 + (r01 + r02) + (r03 + r04)
+        // 1 =       (r01 - r02) + (r03 - r04) * 2
+        // 2 =       (r01 + r02) + (r03 + r04) * 4
+        // 3 = r05 + (r01 - r02) + (r03 - r04) * 8
+
+        int w_tm = outw / 4 * 6;
+        int h_tm = outh / 4 * 6;
+        const int tiles = w_tm / 6 * h_tm / 6;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outch; p++)
+        {
+            const Mat out0_tm = top_blob_tm.channel(p);
+            Mat out0 = top_blob_bordered.channel(p);
+
+            int tmp[4][6][4];
+
+            // tile
+            for (int i = 0; i < outh / 4; i++)
+            {
+                for (int j = 0; j < outw / 4; j++)
+                {
+                    // top_blob_tm.create(tiles, 36, outch, elemsize, elempack);
+
+                    const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 4;
+                    const int* output0_tm_1 = output0_tm_0 + tiles * 4;
+                    const int* output0_tm_2 = output0_tm_0 + tiles * 8;
+                    const int* output0_tm_3 = output0_tm_0 + tiles * 12;
+                    const int* output0_tm_4 = output0_tm_0 + tiles * 16;
+                    const int* output0_tm_5 = output0_tm_0 + tiles * 20;
+
+                    int* output0 = out0.row<int>(i * 4) + (j * 4) * 4;
+
+                    for (int m = 0; m < 5; m++)
+                    {
+                        v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0);
+                        v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0);
+                        v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0);
+                        v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0);
+                        v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0);
+                        v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0);
+
+                        v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2);
+                        v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2);
+
+                        v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4);
+                        v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4);
+
+                        v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b);
+                        v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
+                        v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
+                        v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3));
+
+                        __msa_st_w(_tmp0m, tmp[0][m], 0);
+                        __msa_st_w(_tmp1m, tmp[1][m], 0);
+                        __msa_st_w(_tmp2m, tmp[2][m], 0);
+                        __msa_st_w(_tmp3m, tmp[3][m], 0);
+
+                        output0_tm_0 += tiles * 24;
+                        output0_tm_1 += tiles * 24;
+                        output0_tm_2 += tiles * 24;
+                        output0_tm_3 += tiles * 24;
+                        output0_tm_4 += tiles * 24;
+                        output0_tm_5 += tiles * 24;
+                    }
+                    for (int m = 5; m < 6; m++)
+                    {
+                        v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0);
+                        v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0);
+                        v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0);
+                        v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0);
+                        v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0);
+                        v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0);
+
+                        v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2);
+                        v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2);
+
+                        v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4);
+                        v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4);
+
+                        v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b);
+                        v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
+                        v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
+                        v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3));
+
+                        _tmp0m = __msa_slli_w(_tmp0m, 2);
+                        _tmp1m = __msa_slli_w(_tmp1m, 2);
+                        _tmp2m = __msa_slli_w(_tmp2m, 2);
+                        _tmp3m = __msa_slli_w(_tmp3m, 2);
+
+                        __msa_st_w(_tmp0m, tmp[0][m], 0);
+                        __msa_st_w(_tmp1m, tmp[1][m], 0);
+                        __msa_st_w(_tmp2m, tmp[2][m], 0);
+                        __msa_st_w(_tmp3m, tmp[3][m], 0);
+
+                        output0_tm_0 += tiles * 24;
+                        output0_tm_1 += tiles * 24;
+                        output0_tm_2 += tiles * 24;
+                        output0_tm_3 += tiles * 24;
+                        output0_tm_4 += tiles * 24;
+                        output0_tm_5 += tiles * 24;
+                    }
+
+                    for (int m = 0; m < 4; m++)
+                    {
+                        v4i32 _tmp00 = __msa_ld_w(tmp[m][0], 0);
+                        v4i32 _tmp01 = __msa_ld_w(tmp[m][1], 0);
+                        v4i32 _tmp02 = __msa_ld_w(tmp[m][2], 0);
+                        v4i32 _tmp03 = __msa_ld_w(tmp[m][3], 0);
+                        v4i32 _tmp04 = __msa_ld_w(tmp[m][4], 0);
+                        v4i32 _tmp05 = __msa_ld_w(tmp[m][5], 0);
+
+                        v4i32 _tmp02a = __msa_addv_w(_tmp01, _tmp02);
+                        v4i32 _tmp13a = __msa_subv_w(_tmp01, _tmp02);
+
+                        v4i32 _tmp02b = __msa_addv_w(_tmp03, _tmp04);
+                        v4i32 _tmp13b = __msa_subv_w(_tmp03, _tmp04);
+
+                        v4i32 _out00 = __msa_addv_w(__msa_addv_w(_tmp00, _tmp02a), _tmp02b);
+                        v4i32 _out01 = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
+                        v4i32 _out02 = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
+                        v4i32 _out03 = __msa_addv_w(__msa_addv_w(_tmp05, _tmp13a), __msa_slli_w(_tmp13b, 3));
+
+                        // TODO use integer trick for division by 576
+                        v4f32 _v576 = __msa_fill_w_f32(1.0 / 576);
+                        _out00 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out00), _v576));
+                        _out01 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out01), _v576));
+                        _out02 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out02), _v576));
+                        _out03 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out03), _v576));
+
+                        __msa_st_w(_out00, output0, 0);
+                        __msa_st_w(_out01, output0 + 4, 0);
+                        __msa_st_w(_out02, output0 + 8, 0);
+                        __msa_st_w(_out03, output0 + 12, 0);
+
+                        output0 += outw * 4;
+                    }
+                }
+            }
+        }
+    }
+    // END transform output
+
+    // cut result pad
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
+}
diff --git a/src/layer/mips/convolution_int8.h b/src/layer/mips/convolution_int8.h
new file mode 100644
index 000000000..6b56d11eb
--- /dev/null
+++ b/src/layer/mips/convolution_int8.h
@@ -0,0 +1,82 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w * dilation_h - kernel_w * dilation_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2 += dilation_w;
+            }
+            p2 += gap;
+        }
+    }
+
+    // num_output
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        int* outptr = top_blob.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            for (int j = 0; j < outw; j++)
+            {
+                int sum = 0;
+
+                //                 const signed char* kptr = weight_data_int8.channel(p);
+                const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;
+
+                // channels
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob.channel(q);
+                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        signed char val = sptr[space_ofs[k]];
+                        signed char w = kptr[k];
+                        sum += val * w;
+                    }
+
+                    kptr += maxk;
+                }
+
+                outptr[j] = sum;
+            }
+
+            outptr += outw;
+        }
+    }
+}
diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp
index f9d37028d..a3e012c57 100644
--- a/src/layer/mips/convolution_mips.cpp
+++ b/src/layer/mips/convolution_mips.cpp
@@ -32,6 +32,12 @@ namespace ncnn {
 #include "convolution_sgemm.h"
 #include "convolution_1x1.h"
 
+#if NCNN_INT8
+#include "convolution_sgemm_int8.h"
+#include "convolution_1x1_int8.h"
+#include "convolution_int8.h"
+#endif // NCNN_INT8
+
 #if __mips_msa
 #include "convolution_pack4.h"
 #include "convolution_pack1to4.h"
@@ -44,6 +50,20 @@ namespace ncnn {
 #include "convolution_3x3_pack4.h"
 #include "convolution_3x3_pack1to4.h"
 #include "convolution_7x7_pack1to4.h"
+
+#if NCNN_INT8
+#include "convolution_pack8to4_int8.h"
+#include "convolution_pack1to4_int8.h"
+#include "convolution_pack8to1_int8.h"
+#include "convolution_sgemm_pack8to4_int8.h"
+#include "convolution_sgemm_pack1to4_int8.h"
+#include "convolution_sgemm_pack8to1_int8.h"
+#include "convolution_1x1_pack8to4_int8.h"
+#include "convolution_1x1_pack1to4_int8.h"
+#include "convolution_1x1_pack8to1_int8.h"
+#include "convolution_3x3_pack8to4_int8.h"
+#include "convolution_3x3_pack8to1_int8.h"
+#endif // NCNN_INT8
 #endif // __mips_msa
 
 Convolution_mips::Convolution_mips()
@@ -98,6 +118,13 @@ int Convolution_mips::create_pipeline(const Option& opt)
 
     activation = create_activation_layer(activation_type, activation_params, opt);
 
+#if NCNN_INT8
+    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
+    {
+        return create_pipeline_int8_mips(opt);
+    }
+#endif
+
     const int maxk = kernel_w * kernel_h;
     const int num_input = weight_data_size / maxk / num_output;
 
@@ -117,8 +144,8 @@ int Convolution_mips::create_pipeline(const Option& opt)
     {
         if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
         {
-            conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_data_packed, num_input, num_output, opt);
-            conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data_packed, num_input, num_output, opt);
+            conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd64_data, num_input, num_output, opt);
+            conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
         }
         else
         {
@@ -187,27 +214,7 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 #if NCNN_INT8
     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
     {
-        Mat bottom_blob_unpacked = bottom_blob;
-        if (bottom_blob.elempack != 1)
-        {
-            Option opt_pack1 = opt;
-            opt_pack1.blob_allocator = opt.workspace_allocator;
-
-            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
-        }
-
-        Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
-        if (bottom_blob_unpacked.elembits() == 16)
-        {
-            Option opt_pack1 = opt;
-            opt_pack1.blob_allocator = opt.workspace_allocator;
-
-            cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
-        }
-
-        Option opt_unpacked = opt;
-        opt_unpacked.use_packing_layout = false;
-        return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
+        return forward_int8_mips(bottom_blob, top_blob, opt);
     }
 #endif
 
@@ -278,11 +285,11 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
             // we need more proper conditions
             if ((w <= 10 || (w >= 15 && w <= 18) || w == 21 || w == 22) && (h <= 10 || (h >= 15 && h <= 18) || h == 21 || h == 22))
             {
-                conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data_packed, bias_data, opt);
+                conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data, bias_data, opt);
             }
             else
             {
-                conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
+                conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
             }
 
             if (activation)
@@ -542,4 +549,408 @@ int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<
     return 0;
 }
 
+#if NCNN_INT8
+static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_int8, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // src = kw-kh-inch-outch
+    // dst = pa-pb-kw-kh-inch/pa-outch/pb
+    {
+        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+        weight_data_int8.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);
+
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
+        {
+            signed char* g00 = weight_data_int8.channel(q / out_elempack);
+
+            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
+            {
+                for (int k = 0; k < maxk; k++)
+                {
+                    for (int i = 0; i < out_elempack; i++)
+                    {
+                        for (int j = 0; j < elempack; j++)
+                        {
+                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);
+
+                            g00[0] = k00[k];
+
+                            g00++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
+{
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+
+    int elempack = 1;
+    int out_elempack = 1;
+#if __mips_msa
+    if (opt.use_packing_layout)
+    {
+        elempack = num_input % 8 == 0 ? 8 : 1;
+        out_elempack = num_output % 4 == 0 ? 4 : 1;
+    }
+#endif // __mips_msa
+
+#if __mips_msa
+    if (elempack == 8 && out_elempack == 4)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
+        }
+        else if (opt.use_sgemm_convolution)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else
+        {
+            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        }
+    }
+
+    if (elempack == 1 && out_elempack == 4)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else
+        {
+            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        }
+    }
+
+    if (elempack == 8 && out_elempack == 1)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
+        }
+        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
+        {
+            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else
+        {
+            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        }
+    }
+#endif // __mips_msa
+
+    if (elempack == 1 && out_elempack == 1)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+        else if (opt.use_sgemm_convolution)
+        {
+            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
+        }
+    }
+
+    return 0;
+}
+
+int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int elembits = bottom_blob.elembits();
+
+    Mat bottom_blob_int8 = bottom_blob;
+    if (elembits != 8)
+    {
+        Option opt_q = opt;
+        opt_q.blob_allocator = opt.workspace_allocator;
+        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
+    }
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    int w = bottom_blob_bordered.w;
+    int h = bottom_blob_bordered.h;
+    int channels = bottom_blob_bordered.c;
+    int elempack = bottom_blob_bordered.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    int outw = (w - kernel_extent_w) / stride_w + 1;
+    int outh = (h - kernel_extent_h) / stride_h + 1;
+
+    bool use_int8_requantize = int8_scale_term > 100;
+    int out_elempack = 1;
+#if __mips_msa
+    if (opt.use_packing_layout)
+    {
+        if (use_int8_requantize)
+            out_elempack = num_output % 8 == 0 ? 8 : 1;
+        else
+            out_elempack = num_output % 4 == 0 ? 4 : 1;
+    }
+#endif // __mips_msa
+    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
+
+    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const int num_input = channels * elempack;
+
+    int out_elempack_int32 = 1;
+#if __mips_msa
+    if (opt.use_packing_layout)
+    {
+        out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
+    }
+#endif // __mips_msa
+
+    Mat top_blob_int32;
+    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
+    if (top_blob_int32.empty())
+        return -100;
+
+#if __mips_msa
+    if (elempack == 8 && out_elempack_int32 == 4)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv3x3s1_winograd42_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
+        }
+        else if (opt.use_sgemm_convolution)
+        {
+            convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+        else
+        {
+            convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+
+        Mat scale_in_data(num_output);
+        for (int p = 0; p < num_output; p++)
+        {
+            // requantize and relu
+            float scale_in;
+            if (weight_data_int8_scales[p] == 0)
+                scale_in = 0;
+            else
+                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
+
+            scale_in_data[p] = scale_in;
+        }
+
+        if (use_int8_requantize)
+        {
+            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
+        }
+        else
+        {
+            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+    }
+
+    if (elempack == 1 && out_elempack_int32 == 4)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
+        {
+            convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+        else
+        {
+            convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+
+        Mat scale_in_data(num_output);
+        for (int p = 0; p < num_output; p++)
+        {
+            // requantize and relu
+            float scale_in;
+            if (weight_data_int8_scales[p] == 0)
+                scale_in = 0;
+            else
+                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
+
+            scale_in_data[p] = scale_in;
+        }
+
+        if (use_int8_requantize)
+        {
+            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
+        }
+        else
+        {
+            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+    }
+
+    if (elempack == 8 && out_elempack_int32 == 1)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv3x3s1_winograd42_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
+        }
+        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
+        {
+            convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+        else
+        {
+            convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+
+        Mat scale_in_data(num_output);
+        for (int p = 0; p < num_output; p++)
+        {
+            // requantize and relu
+            float scale_in;
+            if (weight_data_int8_scales[p] == 0)
+                scale_in = 0;
+            else
+                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
+
+            scale_in_data[p] = scale_in;
+        }
+
+        if (use_int8_requantize)
+        {
+            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
+        }
+        else
+        {
+            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+    }
+#endif // __mips_msa
+
+    if (elempack == 1 && out_elempack_int32 == 1)
+    {
+        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        {
+            conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+        {
+            conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
+        }
+        else if (opt.use_sgemm_convolution)
+        {
+            convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+        else
+        {
+            //         convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        }
+
+        Mat scale_in_data(num_output);
+        for (int p = 0; p < num_output; p++)
+        {
+            // requantize and relu
+            float scale_in;
+            if (weight_data_int8_scales[p] == 0)
+                scale_in = 0;
+            else
+                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
+
+            scale_in_data[p] = scale_in;
+        }
+
+        if (use_int8_requantize)
+        {
+            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
+        }
+        else
+        {
+            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
+
+            if (activation)
+            {
+                activation->forward_inplace(top_blob, opt);
+            }
+        }
+    }
+
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/mips/convolution_mips.h b/src/layer/mips/convolution_mips.h
index acb5394d8..69d442d2d 100644
--- a/src/layer/mips/convolution_mips.h
+++ b/src/layer/mips/convolution_mips.h
@@ -31,12 +31,27 @@ public:
 
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
+protected:
+#if NCNN_INT8
+    int create_pipeline_int8_mips(const Option& opt);
+    int forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+#endif
+
 public:
     Layer* activation;
 
-    // packn
+    Mat weight_sgemm_data;
+
+    Mat weight_3x3_winograd42_data;
+    Mat weight_3x3_winograd64_data;
+
+    // pack4
     Mat weight_data_packed;
-    Mat weight_3x3_winograd42_data_packed;
+
+#if NCNN_INT8
+    // int8
+    Mat weight_data_int8;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/mips/convolution_pack1to4_int8.h b/src/layer/mips/convolution_pack1to4_int8.h
new file mode 100644
index 000000000..b752bda0b
--- /dev/null
+++ b/src/layer/mips/convolution_pack1to4_int8.h
@@ -0,0 +1,87 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convolution_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w * dilation_h - kernel_w * dilation_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2 += dilation_w;
+            }
+            p2 += gap;
+        }
+    }
+
+    // num_output
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        int* outptr = top_blob.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            for (int j = 0; j < outw; j++)
+            {
+                v4i32 _sum = __msa_fill_w(0);
+
+                const signed char* kptr = weight_data_int8.channel(p);
+
+                // channels
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob.channel(q);
+                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        v8i16 _val = __msa_fill_h((short)sptr[space_ofs[k]]);
+
+                        v16i8 _w = __msa_ld_b(kptr, 0);
+                        v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                        v8i16 _s0 = __msa_mulv_h(_val, _w16);
+                        v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);
+
+                        _sum = __msa_addv_w(_sum, _s032);
+
+                        kptr += 4;
+                    }
+                }
+
+                __msa_st_w(_sum, outptr + j * 4, 0);
+            }
+
+            outptr += outw * 4;
+        }
+    }
+}
diff --git a/src/layer/mips/convolution_pack4to1.h b/src/layer/mips/convolution_pack4to1.h
index efdea27d0..8269860b2 100644
--- a/src/layer/mips/convolution_pack4to1.h
+++ b/src/layer/mips/convolution_pack4to1.h
@@ -81,7 +81,7 @@ static void convolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, cons
                     }
                 }
 
-                sum += __msa_fhadd_w(_sum);
+                sum += __msa_reduce_fadd_w(_sum);
 
                 sum = activation_ss(sum, activation_type, activation_params);
 
diff --git a/src/layer/mips/convolution_pack8to1_int8.h b/src/layer/mips/convolution_pack8to1_int8.h
new file mode 100644
index 000000000..1f9ad5a6c
--- /dev/null
+++ b/src/layer/mips/convolution_pack8to1_int8.h
@@ -0,0 +1,87 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convolution_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w * dilation_h - kernel_w * dilation_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2 += dilation_w;
+            }
+            p2 += gap;
+        }
+    }
+
+    // num_output
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        int* outptr = top_blob.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            for (int j = 0; j < outw; j++)
+            {
+                v4i32 _sum = __msa_fill_w(0);
+
+                const signed char* kptr = weight_data_int8.channel(p);
+
+                // channels
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob.channel(q);
+                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
+                        v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                        v16i8 _w = __msa_ld_b(kptr, 0);
+                        v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                        v8i16 _s0 = __msa_mulv_h(_val16, _w16);
+
+                        _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0));
+
+                        kptr += 8;
+                    }
+                }
+
+                outptr[j] = __msa_reduce_add_w(_sum);
+            }
+
+            outptr += outw;
+        }
+    }
+}
diff --git a/src/layer/mips/convolution_pack8to4_int8.h b/src/layer/mips/convolution_pack8to4_int8.h
new file mode 100644
index 000000000..50636f82d
--- /dev/null
+++ b/src/layer/mips/convolution_pack8to4_int8.h
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convolution_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // kernel offsets
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w * dilation_h - kernel_w * dilation_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2;
+                p1++;
+                p2 += dilation_w;
+            }
+            p2 += gap;
+        }
+    }
+
+    // num_output
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        int* outptr = top_blob.channel(p);
+
+        for (int i = 0; i < outh; i++)
+        {
+            for (int j = 0; j < outw; j++)
+            {
+                v4i32 _sum0 = __msa_fill_w(0);
+                v4i32 _sum1 = __msa_fill_w(0);
+                v4i32 _sum2 = __msa_fill_w(0);
+                v4i32 _sum3 = __msa_fill_w(0);
+
+                const signed char* kptr = weight_data_int8.channel(p);
+
+                // channels
+                for (int q = 0; q < channels; q++)
+                {
+                    const Mat m = bottom_blob.channel(q);
+                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;
+
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
+                        v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                        v16i8 _w01 = __msa_ld_b(kptr, 0);
+                        v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
+                        v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                        v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
+                        v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                        v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+                        v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
+                        v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);
+
+                        v8i16 _s0 = __msa_mulv_h(_val16, _w0);
+                        v8i16 _s1 = __msa_mulv_h(_val16, _w1);
+                        v8i16 _s2 = __msa_mulv_h(_val16, _w2);
+                        v8i16 _s3 = __msa_mulv_h(_val16, _w3);
+
+                        _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
+                        _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
+                        _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
+                        _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));
+
+                        kptr += 32;
+                    }
+                }
+
+                // transpose 4x4
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
+                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
+                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
+                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
+                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+
+                _sum0 = __msa_addv_w(_sum0, _sum1);
+                _sum2 = __msa_addv_w(_sum2, _sum3);
+
+                _sum0 = __msa_addv_w(_sum0, _sum2);
+
+                __msa_st_w(_sum0, outptr + j * 4, 0);
+            }
+
+            outptr += outw * 4;
+        }
+    }
+}
diff --git a/src/layer/mips/convolution_sgemm_int8.h b/src/layer/mips/convolution_sgemm_int8.h
new file mode 100644
index 000000000..297c4b1b0
--- /dev/null
+++ b/src/layer/mips/convolution_sgemm_int8.h
@@ -0,0 +1,731 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void im2col_sgemm_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
+
+    const int size = bottom_im2col.w;
+    const int maxk = bottom_im2col.h;
+    const int inch = bottom_im2col.c;
+
+    const int outch = top_blob.c;
+
+    // permute
+    Mat tmp;
+#if __mips_msa
+    if (inch >= 4)
+    {
+        if (size >= 2)
+            tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
+        else
+            tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
+    }
+    else
+    {
+        if (size >= 2)
+            tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
+        else
+            tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
+    }
+    {
+        int remain_size_start = 0;
+        int nn_size = (size - remain_size_start) >> 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_size; ii++)
+        {
+            int i = remain_size_start + ii * 2;
+
+            signed char* tmpptr = tmp.channel(i / 2);
+
+            int q = 0;
+            for (; q + 3 < inch; q += 4)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
+                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
+                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr[1] = img1[0];
+                    tmpptr[2] = img2[0];
+                    tmpptr[3] = img3[0];
+                    tmpptr[4] = img0[1];
+                    tmpptr[5] = img1[1];
+                    tmpptr[6] = img2[1];
+                    tmpptr[7] = img3[1];
+                    tmpptr += 8;
+
+                    img0 += size;
+                    img1 += size;
+                    img2 += size;
+                    img3 += size;
+                }
+            }
+            for (; q < inch; q++)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr[1] = img0[1];
+
+                    tmpptr += 2;
+
+                    img0 += size;
+                }
+            }
+        }
+
+        remain_size_start += nn_size << 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = remain_size_start; i < size; i++)
+        {
+            signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+
+            int q = 0;
+            for (; q + 3 < inch; q += 4)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
+                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
+                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr[1] = img1[0];
+                    tmpptr[2] = img2[0];
+                    tmpptr[3] = img3[0];
+                    tmpptr += 4;
+
+                    img0 += size;
+                    img1 += size;
+                    img2 += size;
+                    img3 += size;
+                }
+            }
+            for (; q < inch; q++)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+
+                    tmpptr += 1;
+
+                    img0 += size;
+                }
+            }
+        }
+    }
+#else // __mips_msa
+    tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < size; i++)
+        {
+            signed char* tmpptr = tmp.channel(i);
+
+            int q = 0;
+            for (; q < inch; q++)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+
+                    tmpptr += 1;
+
+                    img0 += size;
+                }
+            }
+        }
+    }
+#endif // __mips_msa
+
+    int nn_outch = 0;
+    int remain_outch_start = 0;
+
+#if __mips_msa
+    nn_outch = outch >> 2;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp = 0; pp < nn_outch; pp++)
+    {
+        int p = pp * 4;
+
+        int* outptr0 = top_blob.channel(p);
+        int* outptr1 = top_blob.channel(p + 1);
+        int* outptr2 = top_blob.channel(p + 2);
+        int* outptr3 = top_blob.channel(p + 3);
+
+        int i = 0;
+        for (; i + 1 < size; i += 2)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2);
+            const signed char* kptr = kernel.channel(p / 4);
+
+            int nn4 = (inch / 4) * maxk;
+            int nn1 = (inch % 4) * maxk;
+
+            v4i32 _sum00 = __msa_fill_w(0);
+            v4i32 _sum10 = __msa_fill_w(0);
+
+            if (nn4 > 0)
+            {
+                v4i32 _sum01 = __msa_fill_w(0);
+                v4i32 _sum02 = __msa_fill_w(0);
+                v4i32 _sum03 = __msa_fill_w(0);
+                v4i32 _sum11 = __msa_fill_w(0);
+                v4i32 _sum12 = __msa_fill_w(0);
+                v4i32 _sum13 = __msa_fill_w(0);
+
+                int j = 0;
+                for (; j < nn4; j++)
+                {
+                    v16i8 _val = __msa_ld_b(tmpptr, 0);
+                    v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                    v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01);
+                    v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01);
+
+                    v16i8 _w01 = __msa_ld_b(kptr, 0);
+                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+
+                    v8i16 _s00 = __msa_mulv_h(_val0, _w0);
+                    v8i16 _s01 = __msa_mulv_h(_val0, _w1);
+                    v8i16 _s10 = __msa_mulv_h(_val1, _w0);
+                    v8i16 _s11 = __msa_mulv_h(_val1, _w1);
+
+                    v8i16 _exts00 = __msa_clti_s_h(_s00, 0);
+                    v8i16 _exts01 = __msa_clti_s_h(_s01, 0);
+                    v8i16 _exts10 = __msa_clti_s_h(_s10, 0);
+                    v8i16 _exts11 = __msa_clti_s_h(_s11, 0);
+                    v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00);
+                    v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00);
+                    v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01);
+                    v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01);
+                    v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10);
+                    v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10);
+                    v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11);
+                    v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11);
+
+                    _sum00 = __msa_addv_w(_sum00, _s00l);
+                    _sum01 = __msa_addv_w(_sum01, _s00h);
+                    _sum02 = __msa_addv_w(_sum02, _s01l);
+                    _sum03 = __msa_addv_w(_sum03, _s01h);
+                    _sum10 = __msa_addv_w(_sum10, _s10l);
+                    _sum11 = __msa_addv_w(_sum11, _s10h);
+                    _sum12 = __msa_addv_w(_sum12, _s11l);
+                    _sum13 = __msa_addv_w(_sum13, _s11h);
+
+                    tmpptr += 8;
+                    kptr += 16;
+                }
+
+                // transpose 4x4
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum01, _sum00);
+                    _tmp1 = __msa_ilvr_w(_sum03, _sum02);
+                    _tmp2 = __msa_ilvl_w(_sum01, _sum00);
+                    _tmp3 = __msa_ilvl_w(_sum03, _sum02);
+                    _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum11, _sum10);
+                    _tmp1 = __msa_ilvr_w(_sum13, _sum12);
+                    _tmp2 = __msa_ilvl_w(_sum11, _sum10);
+                    _tmp3 = __msa_ilvl_w(_sum13, _sum12);
+                    _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+
+                _sum00 = __msa_addv_w(_sum00, _sum01);
+                _sum02 = __msa_addv_w(_sum02, _sum03);
+                _sum10 = __msa_addv_w(_sum10, _sum11);
+                _sum12 = __msa_addv_w(_sum12, _sum13);
+
+                _sum00 = __msa_addv_w(_sum00, _sum02);
+                _sum10 = __msa_addv_w(_sum10, _sum12);
+            }
+
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                v8i16 _val0 = __msa_fill_h(tmpptr[0]);
+                v8i16 _val1 = __msa_fill_h(tmpptr[1]);
+                v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0);
+
+                v16i8 _w = __msa_ld_b(kptr, 0);
+                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);
+
+                v8i16 _s0 = __msa_mulv_h(_val, _w16);
+                v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
+                v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
+                v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
+
+                _sum00 = __msa_addv_w(_sum00, _s0l);
+                _sum10 = __msa_addv_w(_sum10, _s0h);
+
+                tmpptr += 2;
+                kptr += 4;
+            }
+
+            int sum[8];
+            __msa_st_w(_sum00, sum, 0);
+            __msa_st_w(_sum10, sum + 4, 0);
+
+            outptr0[0] = sum[0];
+            outptr1[0] = sum[1];
+            outptr2[0] = sum[2];
+            outptr3[0] = sum[3];
+            outptr0[1] = sum[4];
+            outptr1[1] = sum[5];
+            outptr2[1] = sum[6];
+            outptr3[1] = sum[7];
+            outptr0 += 2;
+            outptr1 += 2;
+            outptr2 += 2;
+            outptr3 += 2;
+        }
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+            const signed char* kptr = kernel.channel(p / 4);
+
+            int nn4 = (inch / 4) * maxk;
+            int nn1 = (inch % 4) * maxk;
+
+            v4i32 _sum0 = __msa_fill_w(0);
+
+            if (nn4 > 0)
+            {
+                v4i32 _sum1 = __msa_fill_w(0);
+                v4i32 _sum2 = __msa_fill_w(0);
+                v4i32 _sum3 = __msa_fill_w(0);
+
+                int j = 0;
+                for (; j < nn4; j++)
+                {
+                    v16i8 _val = __msa_ld_b(tmpptr, 0);
+                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                    _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16);
+
+                    v16i8 _w01 = __msa_ld_b(kptr, 0);
+                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+
+                    v8i16 _s0 = __msa_mulv_h(_val16, _w0);
+                    v8i16 _s1 = __msa_mulv_h(_val16, _w1);
+
+                    v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
+                    v8i16 _exts1 = __msa_clti_s_h(_s1, 0);
+                    v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
+                    v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
+                    v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1);
+                    v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1);
+
+                    _sum0 = __msa_addv_w(_sum0, _s0l);
+                    _sum1 = __msa_addv_w(_sum1, _s0h);
+                    _sum2 = __msa_addv_w(_sum2, _s1l);
+                    _sum3 = __msa_addv_w(_sum3, _s1h);
+
+                    tmpptr += 4;
+                    kptr += 16;
+                }
+
+                // transpose 4x4
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
+                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
+                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
+                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
+                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+
+                _sum0 = __msa_addv_w(_sum0, _sum1);
+                _sum2 = __msa_addv_w(_sum2, _sum3);
+                _sum0 = __msa_addv_w(_sum0, _sum2);
+            }
+
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                v8i16 _val = __msa_fill_h(tmpptr[0]);
+
+                v16i8 _w = __msa_ld_b(kptr, 0);
+                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                v8i16 _s0 = __msa_mulv_h(_val, _w16);
+                v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);
+
+                _sum0 = __msa_addv_w(_sum0, _s032);
+
+                tmpptr += 1;
+                kptr += 4;
+            }
+
+            int sum[4];
+            __msa_st_w(_sum0, sum, 0);
+
+            outptr0[0] = sum[0];
+            outptr1[0] = sum[1];
+            outptr2[0] = sum[2];
+            outptr3[0] = sum[3];
+            outptr0 += 1;
+            outptr1 += 1;
+            outptr2 += 1;
+            outptr3 += 1;
+        }
+    }
+
+    remain_outch_start += nn_outch << 2;
+#endif // __mips_msa
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = remain_outch_start; p < outch; p++)
+    {
+        int* outptr0 = top_blob.channel(p);
+
+        int i = 0;
+#if __mips_msa
+        for (; i + 1 < size; i += 2)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2);
+            const signed char* kptr = kernel.channel(p / 4 + p % 4);
+
+            int nn4 = (inch / 4) * maxk;
+            int nn1 = (inch % 4) * maxk;
+
+            int sum0 = 0;
+            int sum1 = 0;
+
+            if (nn4 > 0)
+            {
+                v4i32 _sum0 = __msa_fill_w(0);
+                v4i32 _sum1 = __msa_fill_w(0);
+
+                int j = 0;
+                for (; j < nn4; j++)
+                {
+                    v16i8 _val = __msa_ld_b(tmpptr, 0);
+                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                    v16i8 _w = __msa_ld_b(kptr, 0);
+                    v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                    _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);
+
+                    v8i16 _s0 = __msa_mulv_h(_val16, _w16);
+                    v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
+                    v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
+                    v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
+
+                    _sum0 = __msa_addv_w(_sum0, _s0l);
+                    _sum1 = __msa_addv_w(_sum1, _s0h);
+
+                    tmpptr += 8;
+                    kptr += 4;
+                }
+
+                sum0 = _sum0[0] + _sum0[1] + _sum0[2] + _sum0[3];
+                sum1 = _sum1[0] + _sum1[1] + _sum1[2] + _sum1[3];
+            }
+
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                signed char val0 = tmpptr[0];
+                signed char val1 = tmpptr[1];
+                signed char w = kptr[0];
+
+                sum0 += val0 * w;
+                sum1 += val1 * w;
+
+                tmpptr += 2;
+                kptr += 1;
+            }
+
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0 += 2;
+        }
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+            const signed char* kptr = kernel.channel(p / 4 + p % 4);
+
+            int nn4 = (inch / 4) * maxk;
+            int nn1 = (inch % 4) * maxk;
+
+            int sum = 0;
+
+            if (nn4 > 0)
+            {
+                v4i32 _sum = __msa_fill_w(0);
+
+                int j = 0;
+                for (; j < nn4; j++)
+                {
+                    v16i8 _val = __msa_ld_b(tmpptr, 0);
+                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                    v16i8 _w = __msa_ld_b(kptr, 0);
+                    v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                    v8i16 _s0 = __msa_mulv_h(_val16, _w16);
+                    v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);
+
+                    _sum = __msa_addv_w(_sum, _s032);
+
+                    tmpptr += 4;
+                    kptr += 4;
+                }
+
+                sum = _sum[0] + _sum[1] + _sum[2] + _sum[3];
+            }
+
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                signed char val = tmpptr[0];
+                signed char w = kptr[0];
+
+                sum += val * w;
+
+                tmpptr += 1;
+                kptr += 1;
+            }
+
+            outptr0[0] = sum;
+            outptr0 += 1;
+        }
+#else  // __mips_msa
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i);
+            const signed char* kptr = kernel.channel(p);
+
+            int nn1 = inch * maxk;
+
+            int sum = 0;
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                signed char val = tmpptr[0];
+                signed char w = kptr[0];
+
+                sum += val * w;
+
+                tmpptr += 1;
+                kptr += 1;
+            }
+
+            outptr0[0] = sum;
+            outptr0 += 1;
+        }
+#endif // __mips_msa
+    }
+}
+
+static void convolution_im2col_sgemm_transform_kernel_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int maxk = kernel_w * kernel_h;
+
+#if __mips_msa
+    // interleave
+    // src = maxk-inch-outch
+    // dst = 4a-4b-maxk-inch/4a-outch/4b
+    Mat kernel = _kernel.reshape(maxk, inch, outch);
+    if (outch >= 4)
+    {
+        if (inch >= 4)
+            kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u);
+        else
+            kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u);
+    }
+    else
+    {
+        if (inch >= 4)
+            kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u);
+        else
+            kernel_tm.create(1 * maxk, inch, outch, (size_t)1u);
+    }
+
+    int q = 0;
+    for (; q + 3 < outch; q += 4)
+    {
+        signed char* g00 = kernel_tm.channel(q / 4);
+
+        int p = 0;
+        for (; p + 3 < inch; p += 4)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    for (int j = 0; j < 4; j++)
+                    {
+                        const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);
+
+                        g00[0] = k00[k];
+
+                        g00++;
+                    }
+                }
+            }
+        }
+        for (; p < inch; p++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);
+
+                    g00[0] = k00[k];
+
+                    g00++;
+                }
+            }
+        }
+    }
+    // TODO unroll 2
+    for (; q < outch; q++)
+    {
+        signed char* g00 = kernel_tm.channel(q / 4 + q % 4);
+
+        int p = 0;
+        for (; p + 3 < inch; p += 4)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int j = 0; j < 4; j++)
+                {
+                    const signed char* k00 = kernel.channel(q).row<const signed char>(p + j);
+
+                    g00[0] = k00[k];
+
+                    g00++;
+                }
+            }
+        }
+        for (; p < inch; p++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                const signed char* k00 = kernel.channel(q).row<const signed char>(p);
+
+                g00[0] = k00[k];
+
+                g00++;
+            }
+        }
+    }
+#else  // __mips_msa
+    kernel_tm = _kernel.reshape(maxk, inch, outch);
+#endif // __mips_msa
+}
+
+static void convolution_im2col_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    const int size = outw * outh;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // im2col
+    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
+    {
+        const int gap = w * stride_h - outw * stride_w;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < inch; p++)
+        {
+            const Mat img = bottom_blob.channel(p);
+            signed char* ptr = bottom_im2col.channel(p);
+
+            for (int u = 0; u < kernel_h; u++)
+            {
+                for (int v = 0; v < kernel_w; v++)
+                {
+                    const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v;
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int j = 0;
+                        for (; j + 3 < outw; j += 4)
+                        {
+                            ptr[0] = sptr[0];
+                            ptr[1] = sptr[stride_w];
+                            ptr[2] = sptr[stride_w * 2];
+                            ptr[3] = sptr[stride_w * 3];
+
+                            sptr += stride_w * 4;
+                            ptr += 4;
+                        }
+                        for (; j + 1 < outw; j += 2)
+                        {
+                            ptr[0] = sptr[0];
+                            ptr[1] = sptr[stride_w];
+
+                            sptr += stride_w * 2;
+                            ptr += 2;
+                        }
+                        for (; j < outw; j++)
+                        {
+                            ptr[0] = sptr[0];
+
+                            sptr += stride_w;
+                            ptr += 1;
+                        }
+
+                        sptr += gap;
+                    }
+                }
+            }
+        }
+    }
+
+    im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_sgemm_pack1to4_int8.h b/src/layer/mips/convolution_sgemm_pack1to4_int8.h
new file mode 100644
index 000000000..34ea59f58
--- /dev/null
+++ b/src/layer/mips/convolution_sgemm_pack1to4_int8.h
@@ -0,0 +1,477 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
+
+    const int size = bottom_im2col.w;
+    const int maxk = bottom_im2col.h;
+    const int inch = bottom_im2col.c;
+
+    const int outch = top_blob.c;
+
+    // permute
+    Mat tmp;
+    if (inch >= 4)
+    {
+        if (size >= 2)
+            tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
+        else
+            tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
+    }
+    else
+    {
+        if (size >= 2)
+            tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
+        else
+            tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
+    }
+    {
+        int remain_size_start = 0;
+        int nn_size = (size - remain_size_start) >> 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_size; ii++)
+        {
+            int i = remain_size_start + ii * 2;
+
+            signed char* tmpptr = tmp.channel(i / 2);
+
+            int q = 0;
+            for (; q + 3 < inch; q += 4)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
+                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
+                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr[1] = img1[0];
+                    tmpptr[2] = img2[0];
+                    tmpptr[3] = img3[0];
+                    tmpptr[4] = img0[1];
+                    tmpptr[5] = img1[1];
+                    tmpptr[6] = img2[1];
+                    tmpptr[7] = img3[1];
+                    tmpptr += 8;
+
+                    img0 += size;
+                    img1 += size;
+                    img2 += size;
+                    img3 += size;
+                }
+            }
+            for (; q < inch; q++)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr[1] = img0[1];
+
+                    tmpptr += 2;
+
+                    img0 += size;
+                }
+            }
+        }
+
+        remain_size_start += nn_size << 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = remain_size_start; i < size; i++)
+        {
+            signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+
+            int q = 0;
+            for (; q + 3 < inch; q += 4)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
+                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
+                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr[1] = img1[0];
+                    tmpptr[2] = img2[0];
+                    tmpptr[3] = img3[0];
+                    tmpptr += 4;
+
+                    img0 += size;
+                    img1 += size;
+                    img2 += size;
+                    img3 += size;
+                }
+            }
+            for (; q < inch; q++)
+            {
+                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+
+                    tmpptr += 1;
+
+                    img0 += size;
+                }
+            }
+        }
+    }
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        int* outptr0 = top_blob.channel(p);
+
+        int i = 0;
+        for (; i + 1 < size; i += 2)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2);
+            const signed char* kptr = kernel.channel(p);
+
+            int nn4 = (inch / 4) * maxk;
+            int nn1 = (inch % 4) * maxk;
+
+            v4i32 _sum00 = __msa_fill_w(0);
+            v4i32 _sum10 = __msa_fill_w(0);
+
+            if (nn4 > 0)
+            {
+                v4i32 _sum01 = __msa_fill_w(0);
+                v4i32 _sum02 = __msa_fill_w(0);
+                v4i32 _sum03 = __msa_fill_w(0);
+                v4i32 _sum11 = __msa_fill_w(0);
+                v4i32 _sum12 = __msa_fill_w(0);
+                v4i32 _sum13 = __msa_fill_w(0);
+
+                int j = 0;
+                for (; j < nn4; j++)
+                {
+                    v16i8 _val = __msa_ld_b(tmpptr, 0);
+                    v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                    v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01);
+                    v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01);
+
+                    v16i8 _w01 = __msa_ld_b(kptr, 0);
+                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+
+                    v8i16 _s00 = __msa_mulv_h(_val0, _w0);
+                    v8i16 _s01 = __msa_mulv_h(_val0, _w1);
+                    v8i16 _s10 = __msa_mulv_h(_val1, _w0);
+                    v8i16 _s11 = __msa_mulv_h(_val1, _w1);
+
+                    v8i16 _exts00 = __msa_clti_s_h(_s00, 0);
+                    v8i16 _exts01 = __msa_clti_s_h(_s01, 0);
+                    v8i16 _exts10 = __msa_clti_s_h(_s10, 0);
+                    v8i16 _exts11 = __msa_clti_s_h(_s11, 0);
+                    v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00);
+                    v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00);
+                    v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01);
+                    v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01);
+                    v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10);
+                    v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10);
+                    v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11);
+                    v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11);
+
+                    _sum00 = __msa_addv_w(_sum00, _s00l);
+                    _sum01 = __msa_addv_w(_sum01, _s00h);
+                    _sum02 = __msa_addv_w(_sum02, _s01l);
+                    _sum03 = __msa_addv_w(_sum03, _s01h);
+                    _sum10 = __msa_addv_w(_sum10, _s10l);
+                    _sum11 = __msa_addv_w(_sum11, _s10h);
+                    _sum12 = __msa_addv_w(_sum12, _s11l);
+                    _sum13 = __msa_addv_w(_sum13, _s11h);
+
+                    tmpptr += 8;
+                    kptr += 16;
+                }
+
+                // transpose 4x4
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum01, _sum00);
+                    _tmp1 = __msa_ilvr_w(_sum03, _sum02);
+                    _tmp2 = __msa_ilvl_w(_sum01, _sum00);
+                    _tmp3 = __msa_ilvl_w(_sum03, _sum02);
+                    _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum11, _sum10);
+                    _tmp1 = __msa_ilvr_w(_sum13, _sum12);
+                    _tmp2 = __msa_ilvl_w(_sum11, _sum10);
+                    _tmp3 = __msa_ilvl_w(_sum13, _sum12);
+                    _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+
+                _sum00 = __msa_addv_w(_sum00, _sum01);
+                _sum02 = __msa_addv_w(_sum02, _sum03);
+                _sum10 = __msa_addv_w(_sum10, _sum11);
+                _sum12 = __msa_addv_w(_sum12, _sum13);
+
+                _sum00 = __msa_addv_w(_sum00, _sum02);
+                _sum10 = __msa_addv_w(_sum10, _sum12);
+            }
+
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                v8i16 _val0 = __msa_fill_h(tmpptr[0]);
+                v8i16 _val1 = __msa_fill_h(tmpptr[1]);
+                v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0);
+
+                v16i8 _w = __msa_ld_b(kptr, 0);
+                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);
+
+                v8i16 _s0 = __msa_mulv_h(_val, _w16);
+                v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
+                v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
+                v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
+
+                _sum00 = __msa_addv_w(_sum00, _s0l);
+                _sum10 = __msa_addv_w(_sum10, _s0h);
+
+                tmpptr += 2;
+                kptr += 4;
+            }
+
+            __msa_st_w(_sum00, outptr0, 0);
+            __msa_st_w(_sum10, outptr0 + 4, 0);
+            outptr0 += 8;
+        }
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+            const signed char* kptr = kernel.channel(p);
+
+            int nn4 = (inch / 4) * maxk;
+            int nn1 = (inch % 4) * maxk;
+
+            v4i32 _sum0 = __msa_fill_w(0);
+
+            if (nn4 > 0)
+            {
+                v4i32 _sum1 = __msa_fill_w(0);
+                v4i32 _sum2 = __msa_fill_w(0);
+                v4i32 _sum3 = __msa_fill_w(0);
+
+                int j = 0;
+                for (; j < nn4; j++)
+                {
+                    v16i8 _val = __msa_ld_b(tmpptr, 0);
+                    v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                    _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16);
+
+                    v16i8 _w01 = __msa_ld_b(kptr, 0);
+                    v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                    v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                    v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+
+                    v8i16 _s0 = __msa_mulv_h(_val16, _w0);
+                    v8i16 _s1 = __msa_mulv_h(_val16, _w1);
+
+                    v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
+                    v8i16 _exts1 = __msa_clti_s_h(_s1, 0);
+                    v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
+                    v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
+                    v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1);
+                    v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1);
+
+                    _sum0 = __msa_addv_w(_sum0, _s0l);
+                    _sum1 = __msa_addv_w(_sum1, _s0h);
+                    _sum2 = __msa_addv_w(_sum2, _s1l);
+                    _sum3 = __msa_addv_w(_sum3, _s1h);
+
+                    tmpptr += 4;
+                    kptr += 16;
+                }
+
+                // transpose 4x4
+                {
+                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
+                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
+                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
+                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
+                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+                }
+
+                _sum0 = __msa_addv_w(_sum0, _sum1);
+                _sum2 = __msa_addv_w(_sum2, _sum3);
+                _sum0 = __msa_addv_w(_sum0, _sum2);
+            }
+
+            int j = 0;
+            for (; j < nn1; j++)
+            {
+                v8i16 _val = __msa_fill_h(tmpptr[0]);
+
+                v16i8 _w = __msa_ld_b(kptr, 0);
+                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                v8i16 _s0 = __msa_mulv_h(_val, _w16);
+                v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);
+
+                _sum0 = __msa_addv_w(_sum0, _s032);
+
+                tmpptr += 1;
+                kptr += 4;
+            }
+
+            __msa_st_w(_sum0, outptr0, 0);
+            outptr0 += 4;
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // interleave
+    // src = maxk-inch-outch
+    // dst = 4a-4b-maxk-inch/4a-outch/4b
+    Mat kernel = _kernel.reshape(maxk, inch, outch);
+    if (inch >= 4)
+        kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u);
+    else
+        kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u);
+
+    for (int q = 0; q + 3 < outch; q += 4)
+    {
+        signed char* g00 = kernel_tm.channel(q / 4);
+
+        int p = 0;
+        for (; p + 3 < inch; p += 4)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    for (int j = 0; j < 4; j++)
+                    {
+                        const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);
+
+                        g00[0] = k00[k];
+
+                        g00++;
+                    }
+                }
+            }
+        }
+        for (; p < inch; p++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);
+
+                    g00[0] = k00[k];
+
+                    g00++;
+                }
+            }
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    const int size = outw * outh;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // im2col
+    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
+    {
+        const int gap = w * stride_h - outw * stride_w;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < inch; p++)
+        {
+            const Mat img = bottom_blob.channel(p);
+            signed char* ptr = bottom_im2col.channel(p);
+
+            for (int u = 0; u < kernel_h; u++)
+            {
+                for (int v = 0; v < kernel_w; v++)
+                {
+                    const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v;
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int j = 0;
+                        for (; j + 3 < outw; j += 4)
+                        {
+                            ptr[0] = sptr[0];
+                            ptr[1] = sptr[stride_w];
+                            ptr[2] = sptr[stride_w * 2];
+                            ptr[3] = sptr[stride_w * 3];
+
+                            sptr += stride_w * 4;
+                            ptr += 4;
+                        }
+                        for (; j + 1 < outw; j += 2)
+                        {
+                            ptr[0] = sptr[0];
+                            ptr[1] = sptr[stride_w];
+
+                            sptr += stride_w * 2;
+                            ptr += 2;
+                        }
+                        for (; j < outw; j++)
+                        {
+                            ptr[0] = sptr[0];
+
+                            sptr += stride_w;
+                            ptr += 1;
+                        }
+
+                        sptr += gap;
+                    }
+                }
+            }
+        }
+    }
+
+    im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_sgemm_pack4to1.h b/src/layer/mips/convolution_sgemm_pack4to1.h
index 2ceef6c06..281293602 100644
--- a/src/layer/mips/convolution_sgemm_pack4to1.h
+++ b/src/layer/mips/convolution_sgemm_pack4to1.h
@@ -550,7 +550,7 @@ static void im2col_sgemm_pack4to1_msa(const Mat& bottom_im2col, Mat& top_blob, c
                 kptr0 += 4;
             }
 
-            sum0 += __msa_fhadd_w(_sum0);
+            sum0 += __msa_reduce_fadd_w(_sum0);
 
             outptr0[0] = sum0;
 
diff --git a/src/layer/mips/convolution_sgemm_pack8to1_int8.h b/src/layer/mips/convolution_sgemm_pack8to1_int8.h
new file mode 100644
index 000000000..536517cfd
--- /dev/null
+++ b/src/layer/mips/convolution_sgemm_pack8to1_int8.h
@@ -0,0 +1,450 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
+
+    const int size = bottom_im2col.w;
+    const int maxk = bottom_im2col.h;
+    const int inch = bottom_im2col.c;
+
+    const int outch = top_blob.c;
+
+    // permute
+    Mat tmp;
+    if (size >= 2)
+        tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
+    else
+        tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
+    {
+        int remain_size_start = 0;
+        int nn_size = (size - remain_size_start) >> 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_size; ii++)
+        {
+            int i = remain_size_start + ii * 2;
+
+            int64_t* tmpptr = tmp.channel(i / 2);
+
+            for (int q = 0; q < inch; q++)
+            {
+                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    v16i8 _v = __msa_ld_b(img0, 0);
+                    __msa_st_b(_v, tmpptr, 0);
+                    tmpptr += 2;
+                    img0 += size;
+                }
+            }
+        }
+
+        remain_size_start += nn_size << 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = remain_size_start; i < size; i++)
+        {
+            int64_t* tmpptr = tmp.channel(i / 2 + i % 2);
+
+            for (int q = 0; q < inch; q++)
+            {
+                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr += 1;
+                    img0 += size;
+                }
+            }
+        }
+    }
+
+    int nn_outch = 0;
+    int remain_outch_start = 0;
+
+    nn_outch = outch >> 2;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp = 0; pp < nn_outch; pp++)
+    {
+        int p = pp * 4;
+
+        int* outptr0 = top_blob.channel(p);
+        int* outptr1 = top_blob.channel(p + 1);
+        int* outptr2 = top_blob.channel(p + 2);
+        int* outptr3 = top_blob.channel(p + 3);
+
+        int i = 0;
+        for (; i + 1 < size; i += 2)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2);
+            const signed char* kptr = kernel.channel(p / 4);
+
+            int nn = inch * maxk; // inch always > 0
+
+            v4i32 _sum00 = __msa_fill_w(0);
+            v4i32 _sum01 = __msa_fill_w(0);
+            v4i32 _sum02 = __msa_fill_w(0);
+            v4i32 _sum03 = __msa_fill_w(0);
+            v4i32 _sum10 = __msa_fill_w(0);
+            v4i32 _sum11 = __msa_fill_w(0);
+            v4i32 _sum12 = __msa_fill_w(0);
+            v4i32 _sum13 = __msa_fill_w(0);
+
+            int j = 0;
+            for (; j < nn; j++)
+            {
+                v16i8 _val01 = __msa_ld_b(tmpptr, 0);
+                v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
+                v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
+                v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);
+
+                v16i8 _w01 = __msa_ld_b(kptr, 0);
+                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
+                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
+                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
+                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);
+
+                v8i16 _s00 = __msa_mulv_h(_val0, _w0);
+                v8i16 _s01 = __msa_mulv_h(_val0, _w1);
+                v8i16 _s02 = __msa_mulv_h(_val0, _w2);
+                v8i16 _s03 = __msa_mulv_h(_val0, _w3);
+                v8i16 _s10 = __msa_mulv_h(_val1, _w0);
+                v8i16 _s11 = __msa_mulv_h(_val1, _w1);
+                v8i16 _s12 = __msa_mulv_h(_val1, _w2);
+                v8i16 _s13 = __msa_mulv_h(_val1, _w3);
+
+                _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00));
+                _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01));
+                _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02));
+                _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03));
+                _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10));
+                _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11));
+                _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12));
+                _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13));
+
+                tmpptr += 16;
+                kptr += 32;
+            }
+
+            // transpose 4x4
+            {
+                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                _tmp0 = __msa_ilvr_w(_sum01, _sum00);
+                _tmp1 = __msa_ilvr_w(_sum03, _sum02);
+                _tmp2 = __msa_ilvl_w(_sum01, _sum00);
+                _tmp3 = __msa_ilvl_w(_sum03, _sum02);
+                _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+            }
+            {
+                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                _tmp0 = __msa_ilvr_w(_sum11, _sum10);
+                _tmp1 = __msa_ilvr_w(_sum13, _sum12);
+                _tmp2 = __msa_ilvl_w(_sum11, _sum10);
+                _tmp3 = __msa_ilvl_w(_sum13, _sum12);
+                _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+            }
+
+            _sum00 = __msa_addv_w(_sum00, _sum01);
+            _sum02 = __msa_addv_w(_sum02, _sum03);
+            _sum10 = __msa_addv_w(_sum10, _sum11);
+            _sum12 = __msa_addv_w(_sum12, _sum13);
+
+            _sum00 = __msa_addv_w(_sum00, _sum02);
+            _sum10 = __msa_addv_w(_sum10, _sum12);
+
+            int sum[8];
+            __msa_st_w(_sum00, sum, 0);
+            __msa_st_w(_sum10, sum + 4, 0);
+
+            outptr0[0] = sum[0];
+            outptr1[0] = sum[1];
+            outptr2[0] = sum[2];
+            outptr3[0] = sum[3];
+            outptr0[1] = sum[4];
+            outptr1[1] = sum[5];
+            outptr2[1] = sum[6];
+            outptr3[1] = sum[7];
+            outptr0 += 2;
+            outptr1 += 2;
+            outptr2 += 2;
+            outptr3 += 2;
+        }
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+            const signed char* kptr = kernel.channel(p / 4);
+
+            int nn = inch * maxk; // inch always > 0
+
+            v4i32 _sum0 = __msa_fill_w(0);
+            v4i32 _sum1 = __msa_fill_w(0);
+            v4i32 _sum2 = __msa_fill_w(0);
+            v4i32 _sum3 = __msa_fill_w(0);
+
+            int j = 0;
+            for (; j < nn; j++)
+            {
+                v16i8 _val = __msa_ld_b(tmpptr, 0);
+                v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                v16i8 _w01 = __msa_ld_b(kptr, 0);
+                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
+                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
+                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
+                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);
+
+                v8i16 _s0 = __msa_mulv_h(_val16, _w0);
+                v8i16 _s1 = __msa_mulv_h(_val16, _w1);
+                v8i16 _s2 = __msa_mulv_h(_val16, _w2);
+                v8i16 _s3 = __msa_mulv_h(_val16, _w3);
+
+                _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
+                _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
+                _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
+                _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));
+
+                tmpptr += 8;
+                kptr += 32;
+            }
+
+            // transpose 4x4
+            {
+                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                _tmp0 = __msa_ilvr_w(_sum1, _sum0);
+                _tmp1 = __msa_ilvr_w(_sum3, _sum2);
+                _tmp2 = __msa_ilvl_w(_sum1, _sum0);
+                _tmp3 = __msa_ilvl_w(_sum3, _sum2);
+                _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+            }
+
+            _sum0 = __msa_addv_w(_sum0, _sum1);
+            _sum2 = __msa_addv_w(_sum2, _sum3);
+
+            _sum0 = __msa_addv_w(_sum0, _sum2);
+
+            int sum[4];
+            __msa_st_w(_sum0, sum, 0);
+
+            outptr0[0] = sum[0];
+            outptr1[0] = sum[1];
+            outptr2[0] = sum[2];
+            outptr3[0] = sum[3];
+            outptr0 += 1;
+            outptr1 += 1;
+            outptr2 += 1;
+            outptr3 += 1;
+        }
+    }
+
+    remain_outch_start += nn_outch << 2;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = remain_outch_start; p < outch; p++)
+    {
+        int* outptr0 = top_blob.channel(p);
+
+        int i = 0;
+        for (; i + 1 < size; i += 2)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2);
+            const signed char* kptr = kernel.channel(p / 4 + p % 4);
+
+            int nn = inch * maxk; // inch always > 0
+
+            v4i32 _sum0 = __msa_fill_w(0);
+            v4i32 _sum1 = __msa_fill_w(0);
+
+            int j = 0;
+            for (; j < nn; j++)
+            {
+                v16i8 _val01 = __msa_ld_b(tmpptr, 0);
+                v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
+                v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
+                v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);
+
+                v16i8 _w = __msa_ld_b(kptr, 0);
+                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                v8i16 _s0 = __msa_mulv_h(_val0, _w16);
+                v8i16 _s1 = __msa_mulv_h(_val1, _w16);
+
+                _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
+                _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
+
+                tmpptr += 16;
+                kptr += 8;
+            }
+
+            outptr0[0] = __msa_reduce_add_w(_sum0);
+            outptr0[1] = __msa_reduce_add_w(_sum1);
+            outptr0 += 2;
+        }
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+            const signed char* kptr = kernel.channel(p / 4 + p % 4);
+
+            int nn = inch * maxk; // inch always > 0
+
+            v4i32 _sum = __msa_fill_w(0);
+
+            int j = 0;
+            for (; j < nn; j++)
+            {
+                v16i8 _val = __msa_ld_b(tmpptr, 0);
+                v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                v16i8 _w = __msa_ld_b(kptr, 0);
+                v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);
+
+                v8i16 _s0 = __msa_mulv_h(_val16, _w16);
+
+                _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0));
+
+                tmpptr += 8;
+                kptr += 8;
+            }
+
+            outptr0[0] = __msa_reduce_add_w(_sum);
+            outptr0 += 1;
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // interleave
+    // src = maxk-inch-outch
+    // dst = 8a-4b-maxk-inch/8a-outch/4b
+    Mat kernel = _kernel.reshape(maxk, inch, outch);
+    if (outch >= 4)
+        kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u);
+    else
+        kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u);
+
+    int q = 0;
+    for (; q + 3 < outch; q += 4)
+    {
+        signed char* g00 = kernel_tm.channel(q / 4);
+
+        for (int p = 0; p + 7 < inch; p += 8)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    for (int j = 0; j < 8; j++)
+                    {
+                        const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);
+
+                        g00[0] = k00[k];
+
+                        g00++;
+                    }
+                }
+            }
+        }
+    }
+    // TODO unroll 2
+    for (; q < outch; q++)
+    {
+        signed char* g00 = kernel_tm.channel(q / 4 + q % 4);
+
+        for (int p = 0; p + 7 < inch; p += 8)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int j = 0; j < 8; j++)
+                {
+                    const signed char* k00 = kernel.channel(q).row<const signed char>(p + j);
+
+                    g00[0] = k00[k];
+
+                    g00++;
+                }
+            }
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    const int size = outw * outh;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // im2col
+    Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
+    {
+        const int gap = w * stride_h - outw * stride_w;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < inch; p++)
+        {
+            const Mat img = bottom_blob.channel(p);
+            int64_t* ptr = bottom_im2col.channel(p);
+
+            for (int u = 0; u < kernel_h; u++)
+            {
+                for (int v = 0; v < kernel_w; v++)
+                {
+                    const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int j = 0;
+                        for (; j < outw; j++)
+                        {
+                            ptr[0] = sptr[0];
+
+                            sptr += stride_w;
+                            ptr += 1;
+                        }
+
+                        sptr += gap;
+                    }
+                }
+            }
+        }
+    }
+
+    im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/convolution_sgemm_pack8to4_int8.h b/src/layer/mips/convolution_sgemm_pack8to4_int8.h
new file mode 100644
index 000000000..fdfa442b0
--- /dev/null
+++ b/src/layer/mips/convolution_sgemm_pack8to4_int8.h
@@ -0,0 +1,320 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
+{
+    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
+
+    const int size = bottom_im2col.w;
+    const int maxk = bottom_im2col.h;
+    const int inch = bottom_im2col.c;
+
+    const int outch = top_blob.c;
+
+    // permute
+    Mat tmp;
+    if (size >= 2)
+        tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
+    else
+        tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
+    {
+        int remain_size_start = 0;
+        int nn_size = size >> 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii = 0; ii < nn_size; ii++)
+        {
+            int i = remain_size_start + ii * 2;
+
+            int64_t* tmpptr = tmp.channel(i / 2);
+
+            for (int q = 0; q < inch; q++)
+            {
+                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    v16i8 _v = __msa_ld_b(img0, 0);
+                    __msa_st_b(_v, tmpptr, 0);
+                    tmpptr += 2;
+                    img0 += size;
+                }
+            }
+        }
+
+        remain_size_start += nn_size << 1;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = remain_size_start; i < size; i++)
+        {
+            int64_t* tmpptr = tmp.channel(i / 2 + i % 2);
+
+            for (int q = 0; q < inch; q++)
+            {
+                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    tmpptr[0] = img0[0];
+                    tmpptr += 1;
+                    img0 += size;
+                }
+            }
+        }
+    }
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        int* outptr0 = top_blob.channel(p);
+
+        int i = 0;
+        for (; i + 1 < size; i += 2)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2);
+            const signed char* kptr = kernel.channel(p);
+
+            int nn = inch * maxk; // inch always > 0
+
+            v4i32 _sum00 = __msa_fill_w(0);
+            v4i32 _sum01 = __msa_fill_w(0);
+            v4i32 _sum02 = __msa_fill_w(0);
+            v4i32 _sum03 = __msa_fill_w(0);
+            v4i32 _sum10 = __msa_fill_w(0);
+            v4i32 _sum11 = __msa_fill_w(0);
+            v4i32 _sum12 = __msa_fill_w(0);
+            v4i32 _sum13 = __msa_fill_w(0);
+
+            int j = 0;
+            for (; j < nn; j++)
+            {
+                v16i8 _val01 = __msa_ld_b(tmpptr, 0);
+                v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
+                v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
+                v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);
+
+                v16i8 _w01 = __msa_ld_b(kptr, 0);
+                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
+                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
+                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
+                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);
+
+                v8i16 _s00 = __msa_mulv_h(_val0, _w0);
+                v8i16 _s01 = __msa_mulv_h(_val0, _w1);
+                v8i16 _s02 = __msa_mulv_h(_val0, _w2);
+                v8i16 _s03 = __msa_mulv_h(_val0, _w3);
+                v8i16 _s10 = __msa_mulv_h(_val1, _w0);
+                v8i16 _s11 = __msa_mulv_h(_val1, _w1);
+                v8i16 _s12 = __msa_mulv_h(_val1, _w2);
+                v8i16 _s13 = __msa_mulv_h(_val1, _w3);
+
+                _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00));
+                _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01));
+                _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02));
+                _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03));
+                _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10));
+                _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11));
+                _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12));
+                _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13));
+
+                tmpptr += 16;
+                kptr += 32;
+            }
+
+            // transpose 4x4
+            {
+                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                _tmp0 = __msa_ilvr_w(_sum01, _sum00);
+                _tmp1 = __msa_ilvr_w(_sum03, _sum02);
+                _tmp2 = __msa_ilvl_w(_sum01, _sum00);
+                _tmp3 = __msa_ilvl_w(_sum03, _sum02);
+                _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+            }
+            {
+                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                _tmp0 = __msa_ilvr_w(_sum11, _sum10);
+                _tmp1 = __msa_ilvr_w(_sum13, _sum12);
+                _tmp2 = __msa_ilvl_w(_sum11, _sum10);
+                _tmp3 = __msa_ilvl_w(_sum13, _sum12);
+                _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+            }
+
+            _sum00 = __msa_addv_w(_sum00, _sum01);
+            _sum02 = __msa_addv_w(_sum02, _sum03);
+            _sum10 = __msa_addv_w(_sum10, _sum11);
+            _sum12 = __msa_addv_w(_sum12, _sum13);
+
+            _sum00 = __msa_addv_w(_sum00, _sum02);
+            _sum10 = __msa_addv_w(_sum10, _sum12);
+
+            __msa_st_w(_sum00, outptr0, 0);
+            __msa_st_w(_sum10, outptr0 + 4, 0);
+            outptr0 += 8;
+        }
+        for (; i < size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
+            const signed char* kptr = kernel.channel(p);
+
+            int nn = inch * maxk; // inch always > 0
+
+            v4i32 _sum0 = __msa_fill_w(0);
+            v4i32 _sum1 = __msa_fill_w(0);
+            v4i32 _sum2 = __msa_fill_w(0);
+            v4i32 _sum3 = __msa_fill_w(0);
+
+            int j = 0;
+            for (; j < nn; j++)
+            {
+                v16i8 _val = __msa_ld_b(tmpptr, 0);
+                v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);
+
+                v16i8 _w01 = __msa_ld_b(kptr, 0);
+                v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
+                v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
+                v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
+                v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
+                v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
+                v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
+                v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);
+
+                v8i16 _s0 = __msa_mulv_h(_val16, _w0);
+                v8i16 _s1 = __msa_mulv_h(_val16, _w1);
+                v8i16 _s2 = __msa_mulv_h(_val16, _w2);
+                v8i16 _s3 = __msa_mulv_h(_val16, _w3);
+
+                _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
+                _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
+                _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
+                _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));
+
+                tmpptr += 8;
+                kptr += 32;
+            }
+
+            // transpose 4x4
+            {
+                v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
+                _tmp0 = __msa_ilvr_w(_sum1, _sum0);
+                _tmp1 = __msa_ilvr_w(_sum3, _sum2);
+                _tmp2 = __msa_ilvl_w(_sum1, _sum0);
+                _tmp3 = __msa_ilvl_w(_sum3, _sum2);
+                _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
+                _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
+                _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
+            }
+
+            _sum0 = __msa_addv_w(_sum0, _sum1);
+            _sum2 = __msa_addv_w(_sum2, _sum3);
+
+            _sum0 = __msa_addv_w(_sum0, _sum2);
+
+            __msa_st_w(_sum0, outptr0, 0);
+            outptr0 += 4;
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // interleave
+    // src = maxk-inch-outch
+    // dst = 8a-4b-maxk-inch/8a-outch/4b
+    Mat kernel = _kernel.reshape(maxk, inch, outch);
+    kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u);
+
+    for (int q = 0; q + 3 < outch; q += 4)
+    {
+        signed char* g00 = kernel_tm.channel(q / 4);
+
+        for (int p = 0; p + 7 < inch; p += 8)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    for (int j = 0; j < 8; j++)
+                    {
+                        const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);
+
+                        g00[0] = k00[k];
+
+                        g00++;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void convolution_im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    const int size = outw * outh;
+
+    const int maxk = kernel_w * kernel_h;
+
+    // im2col
+    Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
+    {
+        const int gap = w * stride_h - outw * stride_w;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < inch; p++)
+        {
+            const Mat img = bottom_blob.channel(p);
+            int64_t* ptr = bottom_im2col.channel(p);
+
+            for (int u = 0; u < kernel_h; u++)
+            {
+                for (int v = 0; v < kernel_w; v++)
+                {
+                    const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        int j = 0;
+                        for (; j < outw; j++)
+                        {
+                            ptr[0] = sptr[0];
+
+                            sptr += stride_w;
+                            ptr += 1;
+                        }
+
+                        sptr += gap;
+                    }
+                }
+            }
+        }
+    }
+
+    im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
+}
diff --git a/src/layer/mips/deconvolution_pack4to1.h b/src/layer/mips/deconvolution_pack4to1.h
index 655d39f0a..b825e1469 100644
--- a/src/layer/mips/deconvolution_pack4to1.h
+++ b/src/layer/mips/deconvolution_pack4to1.h
@@ -88,7 +88,7 @@ static void deconvolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, co
                     kptr += maxk * 4;
                 }
 
-                sum += __msa_fhadd_w(_sum);
+                sum += __msa_reduce_fadd_w(_sum);
 
                 sum = activation_ss(sum, activation_type, activation_params);
 
diff --git a/src/layer/mips/innerproduct_mips.cpp b/src/layer/mips/innerproduct_mips.cpp
index 9a3ce1875..a3ee4b40f 100644
--- a/src/layer/mips/innerproduct_mips.cpp
+++ b/src/layer/mips/innerproduct_mips.cpp
@@ -300,10 +300,10 @@ int InnerProduct_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
         }
 
 #if __mips_msa
-        sum0 += __msa_fhadd_w(_sum0);
-        sum1 += __msa_fhadd_w(_sum1);
-        sum2 += __msa_fhadd_w(_sum2);
-        sum3 += __msa_fhadd_w(_sum3);
+        sum0 += __msa_reduce_fadd_w(_sum0);
+        sum1 += __msa_reduce_fadd_w(_sum1);
+        sum2 += __msa_reduce_fadd_w(_sum2);
+        sum3 += __msa_reduce_fadd_w(_sum3);
 #endif // __mips_msa
 
         sum0 = activation_ss(sum0, activation_type, activation_params);
diff --git a/src/layer/mips/mips_usability.h b/src/layer/mips/mips_usability.h
index 40acd2f07..662320ee7 100644
--- a/src/layer/mips/mips_usability.h
+++ b/src/layer/mips/mips_usability.h
@@ -38,19 +38,25 @@ typedef union
     static const ncnn::FloatInt Name = {.f = Val}
 
 /* float type data load instructions */
-static inline v4f32 __msa_fill_w_f32(float val)
+static NCNN_FORCEINLINE v4f32 __msa_fill_w_f32(float val)
 {
     ncnn::FloatInt fi_tmpval = {.f = val};
     return (v4f32)__msa_fill_w(fi_tmpval.i);
 }
 
-static inline float __msa_fhadd_w(v4f32 _v)
+static NCNN_FORCEINLINE float __msa_reduce_fadd_w(v4f32 _v)
 {
     // TODO find a more efficient way
     return _v[0] + _v[1] + _v[2] + _v[3];
 }
 
-static inline int __msa_cfcmsa_msacsr()
+static NCNN_FORCEINLINE int __msa_reduce_add_w(v4i32 _v)
+{
+    // TODO find a more efficient way
+    return _v[0] + _v[1] + _v[2] + _v[3];
+}
+
+static NCNN_FORCEINLINE int __msa_cfcmsa_msacsr()
 {
     int v;
     asm volatile("cfcmsa %0, $1 \n"
@@ -60,7 +66,7 @@ static inline int __msa_cfcmsa_msacsr()
     return v;
 }
 
-static inline void __msa_ctcmsa_msacsr(int v)
+static NCNN_FORCEINLINE void __msa_ctcmsa_msacsr(int v)
 {
     asm volatile("ctcmsa $1, %0 \n"
                  :
@@ -69,7 +75,7 @@ static inline void __msa_ctcmsa_msacsr(int v)
 }
 #endif // __mips_msa
 
-static inline signed char float2int8(float v)
+static NCNN_FORCEINLINE signed char float2int8(float v)
 {
     int int32 = round(v);
     if (int32 > 127) return 127;
@@ -78,7 +84,7 @@ static inline signed char float2int8(float v)
 }
 
 #if __mips_msa
-static inline v16i8 float2int8(v4f32 _v)
+static NCNN_FORCEINLINE v16i8 float2int8(v4f32 _v)
 {
     // simulate round to nearest via +/-0.5
     v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -98,7 +104,7 @@ static inline v16i8 float2int8(v4f32 _v)
     return _v8;
 }
 
-static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
+static NCNN_FORCEINLINE int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
 {
     // simulate round to nearest via +/-0.5
     v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -123,7 +129,7 @@ static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
     return _v8[0];
 }
 
-static inline v16i8 float2int8relu(v4f32 _v)
+static NCNN_FORCEINLINE v16i8 float2int8relu(v4f32 _v)
 {
     // simulate round to nearest via +/-0.5
     v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -143,7 +149,7 @@ static inline v16i8 float2int8relu(v4f32 _v)
     return _v8;
 }
 
-static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
+static NCNN_FORCEINLINE int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
 {
     // simulate round to nearest via +/-0.5
     v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -168,7 +174,7 @@ static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
     return _v8[0];
 }
 
-static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
+static NCNN_FORCEINLINE v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
 {
     v4f32 _v_leaky = __msa_fmul_w(_v, _slope);
 
@@ -199,7 +205,7 @@ static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
     return _v8;
 }
 
-static inline int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope)
+static NCNN_FORCEINLINE int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope)
 {
     v4f32 _vlow_leaky = __msa_fmul_w(_vlow, _slope);
     v4f32 _vhigh_leaky = __msa_fmul_w(_vhigh, _slope);