From c09d7b359116c02891ef0bb678e596d67cb0d363 Mon Sep 17 00:00:00 2001 From: nihui Date: Tue, 5 Apr 2022 12:25:25 +0800 Subject: [PATCH] mips msa optimization for convolution int8 (#3675) * basic mips msa optimization for convolution int8 * mips msa optimization for convolution int8 gemm * mips msa optimization for convolution int8 winograd pack8to4/pack8to1 * mention msa maddv/msubv intrinsics bug --- docs/how-to-build/how-to-build.md | 30 +- src/layer/mips/convolution1d_mips.cpp | 2 +- src/layer/mips/convolution_1x1_int8.h | 83 ++ .../mips/convolution_1x1_pack1to4_int8.h | 83 ++ .../mips/convolution_1x1_pack8to1_int8.h | 65 ++ .../mips/convolution_1x1_pack8to4_int8.h | 65 ++ .../mips/convolution_3x3_pack8to1_int8.h | 731 ++++++++++++++++++ .../mips/convolution_3x3_pack8to4_int8.h | 629 +++++++++++++++ src/layer/mips/convolution_int8.h | 82 ++ src/layer/mips/convolution_mips.cpp | 461 ++++++++++- src/layer/mips/convolution_mips.h | 19 +- src/layer/mips/convolution_pack1to4_int8.h | 87 +++ src/layer/mips/convolution_pack4to1.h | 2 +- src/layer/mips/convolution_pack8to1_int8.h | 87 +++ src/layer/mips/convolution_pack8to4_int8.h | 120 +++ src/layer/mips/convolution_sgemm_int8.h | 731 ++++++++++++++++++ .../mips/convolution_sgemm_pack1to4_int8.h | 477 ++++++++++++ src/layer/mips/convolution_sgemm_pack4to1.h | 2 +- .../mips/convolution_sgemm_pack8to1_int8.h | 450 +++++++++++ .../mips/convolution_sgemm_pack8to4_int8.h | 320 ++++++++ src/layer/mips/deconvolution_pack4to1.h | 2 +- src/layer/mips/innerproduct_mips.cpp | 8 +- src/layer/mips/mips_usability.h | 28 +- 23 files changed, 4516 insertions(+), 48 deletions(-) create mode 100644 src/layer/mips/convolution_1x1_int8.h create mode 100644 src/layer/mips/convolution_1x1_pack1to4_int8.h create mode 100644 src/layer/mips/convolution_1x1_pack8to1_int8.h create mode 100644 src/layer/mips/convolution_1x1_pack8to4_int8.h create mode 100644 src/layer/mips/convolution_3x3_pack8to1_int8.h create mode 100644 src/layer/mips/convolution_3x3_pack8to4_int8.h create mode 100644 src/layer/mips/convolution_int8.h create mode 100644 src/layer/mips/convolution_pack1to4_int8.h create mode 100644 src/layer/mips/convolution_pack8to1_int8.h create mode 100644 src/layer/mips/convolution_pack8to4_int8.h create mode 100644 src/layer/mips/convolution_sgemm_int8.h create mode 100644 src/layer/mips/convolution_sgemm_pack1to4_int8.h create mode 100644 src/layer/mips/convolution_sgemm_pack8to1_int8.h create mode 100644 src/layer/mips/convolution_sgemm_pack8to4_int8.h diff --git a/docs/how-to-build/how-to-build.md b/docs/how-to-build/how-to-build.md index 3df87e870..4d999aab0 100644 --- a/docs/how-to-build/how-to-build.md +++ b/docs/how-to-build/how-to-build.md @@ -578,12 +578,38 @@ You can upload binary inside `build-c906/examples` folder and run on D1 board fo ### Build for Loongson 2K1000 -For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd bug. +For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd/fmsub/maddv/msubv bug. -Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd_w``` and apply changes as the following +Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub``` and apply changes as the following ```c // #define __msa_fmadd_w __builtin_msa_fmadd_w +// #define __msa_fmadd_d __builtin_msa_fmadd_d +// #define __msa_fmsub_w __builtin_msa_fmsub_w +// #define __msa_fmsub_d __builtin_msa_fmsub_d #define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a) +#define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a) +#define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a) +#define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a) +``` + +find ```__msa_maddv``` and ```__msa_msubv``` and apply changes as the following +```c +// #define __msa_maddv_b __builtin_msa_maddv_b +// #define __msa_maddv_h __builtin_msa_maddv_h +// #define __msa_maddv_w __builtin_msa_maddv_w +// #define __msa_maddv_d __builtin_msa_maddv_d +// #define __msa_msubv_b __builtin_msa_msubv_b +// #define __msa_msubv_h __builtin_msa_msubv_h +// #define __msa_msubv_w __builtin_msa_msubv_w +// #define __msa_msubv_d __builtin_msa_msubv_d +#define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a) +#define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a) +#define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a) +#define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a) +#define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a) +#define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a) +#define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a) +#define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a) ``` Build ncnn with mips msa and simpleocv enabled: diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp index 92889a8b7..fc61c9406 100644 --- a/src/layer/mips/convolution1d_mips.cpp +++ b/src/layer/mips/convolution1d_mips.cpp @@ -253,7 +253,7 @@ int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } } - sum += __msa_fhadd_w(_sum); + sum += __msa_reduce_fadd_w(_sum); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/mips/convolution_1x1_int8.h b/src/layer/mips/convolution_1x1_int8.h new file mode 100644 index 000000000..8730041f6 --- /dev/null +++ b/src/layer/mips/convolution_1x1_int8.h @@ -0,0 +1,83 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_1x1_pack1to4_int8.h b/src/layer/mips/convolution_1x1_pack1to4_int8.h new file mode 100644 index 000000000..928e16336 --- /dev/null +++ b/src/layer/mips/convolution_1x1_pack1to4_int8.h @@ -0,0 +1,83 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_1x1_pack8to1_int8.h b/src/layer/mips/convolution_1x1_pack8to1_int8.h new file mode 100644 index 000000000..398f85e23 --- /dev/null +++ b/src/layer/mips/convolution_1x1_pack8to1_int8.h @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_1x1_pack8to4_int8.h b/src/layer/mips/convolution_1x1_pack8to4_int8.h new file mode 100644 index 000000000..aa38542e4 --- /dev/null +++ b/src/layer/mips/convolution_1x1_pack8to4_int8.h @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_3x3_pack8to1_int8.h b/src/layer/mips/convolution_3x3_pack8to1_int8.h new file mode 100644 index 000000000..0f18f20bf --- /dev/null +++ b/src/layer/mips/convolution_3x3_pack8to1_int8.h @@ -0,0 +1,731 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) +{ + // winograd42 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + + int p = 0; + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + g00[1] = k1.row(q + i)[k]; + g00[2] = k2.row(q + i)[k]; + g00[3] = k3.row(q + i)[k]; + + g00 += 4; + } + } + } + } + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + + g00 += 1; + } + } + } + } +} + +static void conv3x3s1_winograd42_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tm = outw / 4 * 6; + int h_tm = outh / 4 * 6; + + const int tiles = w_tm / 6 * h_tm / 6; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + + // const float itm[4][4] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob_bordered.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6][8]; + + // tile + for (int i = 0; i < h_tm / 6; i++) + { + for (int j = 0; j < w_tm / 6; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; + + for (int m = 0; m < 6; m++) + { + v16i8 _r00_01 = __msa_ld_b(r0, 0); + v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0); + v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0); + v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0); + v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0); + v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0); + v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01); + v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01); + v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03); + v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03); + v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05); + v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05); + + v8i16 _v5 = __msa_fill_h(5); + + v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5)); + v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2)); + v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2)); + v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); + v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); + v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5)); + + __msa_st_h(_tmp0m, tmp[0][m], 0); + __msa_st_h(_tmp1m, tmp[1][m], 0); + __msa_st_h(_tmp2m, tmp[2][m], 0); + __msa_st_h(_tmp3m, tmp[3][m], 0); + __msa_st_h(_tmp4m, tmp[4][m], 0); + __msa_st_h(_tmp5m, tmp[5][m], 0); + + r0 += w * 8; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8; + short* r0_tm_1 = r0_tm_0 + tiles * 8; + short* r0_tm_2 = r0_tm_0 + tiles * 16; + short* r0_tm_3 = r0_tm_0 + tiles * 24; + short* r0_tm_4 = r0_tm_0 + tiles * 32; + short* r0_tm_5 = r0_tm_0 + tiles * 40; + + for (int m = 0; m < 6; m++) + { + v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0); + v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0); + v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0); + v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0); + v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0); + v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0); + + v8i16 _v5 = __msa_fill_h(5); + + v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5)); + v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2)); + v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2)); + v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); + v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); + v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5)); + + __msa_st_h(_r0tm0, r0_tm_0, 0); + __msa_st_h(_r0tm1, r0_tm_1, 0); + __msa_st_h(_r0tm2, r0_tm_2, 0); + __msa_st_h(_r0tm3, r0_tm_3, 0); + __msa_st_h(_r0tm4, r0_tm_4, 0); + __msa_st_h(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 48; + r0_tm_1 += tiles * 48; + r0_tm_2 += tiles * 48; + r0_tm_3 += tiles * 48; + r0_tm_4 += tiles * 48; + r0_tm_5 += tiles * 48; + } + } + } + } + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + { + int w_tm = outw / 4 * 6; + int h_tm = outh / 4 * 6; + + const int tiles = h_tm / 6 * w_tm / 6; + + // permute + // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < 36; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + v8i16 _r0 = __msa_ld_h(r0, 0); + v8i16 _r1 = __msa_ld_h(r0 + 8, 0); + __msa_st_h(_r0, tmpptr, 0); + __msa_st_h(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + v8i16 _r0 = __msa_ld_h(r0, 0); + __msa_st_h(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator); + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < 36; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + for (int j = 0; j < nn; j++) + { + v8i16 _w0 = __msa_ld_h(k0, 0); + v8i16 _w1 = __msa_ld_h(k0 + 8, 0); + v8i16 _w2 = __msa_ld_h(k0 + 16, 0); + v8i16 _w3 = __msa_ld_h(k0 + 24, 0); + + v8i16 _extw0 = __msa_clti_s_h(_w0, 0); + v8i16 _extw1 = __msa_clti_s_h(_w1, 0); + v8i16 _extw2 = __msa_clti_s_h(_w2, 0); + v8i16 _extw3 = __msa_clti_s_h(_w3, 0); + + v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); + v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); + v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); + v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); + v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); + v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); + v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); + v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); + + v4i32 _val0_0 = __msa_fill_w(r0[0]); + v4i32 _val0_1 = __msa_fill_w(r0[1]); + v4i32 _val0_2 = __msa_fill_w(r0[2]); + v4i32 _val0_3 = __msa_fill_w(r0[3]); + v4i32 _val0_4 = __msa_fill_w(r0[4]); + v4i32 _val0_5 = __msa_fill_w(r0[5]); + v4i32 _val0_6 = __msa_fill_w(r0[6]); + v4i32 _val0_7 = __msa_fill_w(r0[7]); + v4i32 _val1_0 = __msa_fill_w(r0[8]); + v4i32 _val1_1 = __msa_fill_w(r0[9]); + v4i32 _val1_2 = __msa_fill_w(r0[10]); + v4i32 _val1_3 = __msa_fill_w(r0[11]); + v4i32 _val1_4 = __msa_fill_w(r0[12]); + v4i32 _val1_5 = __msa_fill_w(r0[13]); + v4i32 _val1_6 = __msa_fill_w(r0[14]); + v4i32 _val1_7 = __msa_fill_w(r0[15]); + + _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0); + _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1); + _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0); + _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1); + _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2); + _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3); + _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2); + _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3); + _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4); + _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5); + _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4); + _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5); + _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6); + _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7); + _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6); + _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + + int sum[8]; + __msa_st_w(_sum0, sum, 0); + __msa_st_w(_sum2, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + + for (int j = 0; j < nn; j++) + { + v8i16 _w0 = __msa_ld_h(k0, 0); + v8i16 _w1 = __msa_ld_h(k0 + 8, 0); + v8i16 _w2 = __msa_ld_h(k0 + 16, 0); + v8i16 _w3 = __msa_ld_h(k0 + 24, 0); + + v8i16 _extw0 = __msa_clti_s_h(_w0, 0); + v8i16 _extw1 = __msa_clti_s_h(_w1, 0); + v8i16 _extw2 = __msa_clti_s_h(_w2, 0); + v8i16 _extw3 = __msa_clti_s_h(_w3, 0); + + v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); + v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); + v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); + v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); + v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); + v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); + v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); + v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); + + v4i32 _val0 = __msa_fill_w(r0[0]); + v4i32 _val1 = __msa_fill_w(r0[1]); + v4i32 _val2 = __msa_fill_w(r0[2]); + v4i32 _val3 = __msa_fill_w(r0[3]); + v4i32 _val4 = __msa_fill_w(r0[4]); + v4i32 _val5 = __msa_fill_w(r0[5]); + v4i32 _val6 = __msa_fill_w(r0[6]); + v4i32 _val7 = __msa_fill_w(r0[7]); + + _sum0 = __msa_maddv_w(_sum0, _w0l, _val0); + _sum1 = __msa_maddv_w(_sum1, _w0h, _val1); + _sum0 = __msa_maddv_w(_sum0, _w1l, _val2); + _sum1 = __msa_maddv_w(_sum1, _w1h, _val3); + _sum0 = __msa_maddv_w(_sum0, _w2l, _val4); + _sum1 = __msa_maddv_w(_sum1, _w2h, _val5); + _sum0 = __msa_maddv_w(_sum0, _w3l, _val6); + _sum1 = __msa_maddv_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + + int sum[4]; + __msa_st_w(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); + + for (int r = 0; r < 36; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + for (int q = 0; q < inch; q++) + { + v8i16 _val0 = __msa_ld_h(r0, 0); + v8i16 _val1 = __msa_ld_h(r0 + 8, 0); + + v8i16 _extval0 = __msa_clti_s_h(_val0, 0); + v8i16 _extval1 = __msa_clti_s_h(_val1, 0); + v4i32 _val0l = (v4i32)__msa_ilvr_h(_extval0, _val0); + v4i32 _val0h = (v4i32)__msa_ilvl_h(_extval0, _val0); + v4i32 _val1l = (v4i32)__msa_ilvr_h(_extval1, _val1); + v4i32 _val1h = (v4i32)__msa_ilvl_h(_extval1, _val1); + + v8i16 _w0 = __msa_ld_h(k0, 0); + + v8i16 _extw0 = __msa_clti_s_h(_w0, 0); + v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); + v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); + + _sum0 = __msa_maddv_w(_sum0, _w0l, _val0l); + _sum1 = __msa_maddv_w(_sum1, _w0h, _val0h); + _sum2 = __msa_maddv_w(_sum2, _w0l, _val1l); + _sum3 = __msa_maddv_w(_sum3, _w0h, _val1h); + + k0 += 8; + r0 += 16; + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + + output0_tm[0] = __msa_reduce_add_w(_sum0); + output0_tm[1] = __msa_reduce_add_w(_sum2); + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + + for (int q = 0; q < inch; q++) + { + v8i16 _val = __msa_ld_h(r0, 0); + + v8i16 _extval = __msa_clti_s_h(_val, 0); + v4i32 _vall = (v4i32)__msa_ilvr_h(_extval, _val); + v4i32 _valh = (v4i32)__msa_ilvl_h(_extval, _val); + + v8i16 _w0 = __msa_ld_h(k0, 0); + + v8i16 _extw0 = __msa_clti_s_h(_w0, 0); + v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); + v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); + + _sum0 = __msa_maddv_w(_sum0, _w0l, _vall); + _sum1 = __msa_maddv_w(_sum1, _w0h, _valh); + + k0 += 8; + r0 += 8; + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + + output0_tm[0] = __msa_reduce_add_w(_sum0); + output0_tm++; + } + } + } + } + bottom_blob_tm = Mat(); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + int w_tm = outw / 4 * 6; + int h_tm = outh / 4 * 6; + const int tiles = w_tm / 6 * h_tm / 6; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob_bordered.channel(p); + + int tmp[4][6]; + + // tile + for (int i = 0; i < outh / 4; i++) + { + for (int j = 0; j < outw / 4; j++) + { + // top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator); + + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 1; + const int* output0_tm_1 = output0_tm_0 + tiles * 1; + const int* output0_tm_2 = output0_tm_0 + tiles * 2; + const int* output0_tm_3 = output0_tm_0 + tiles * 3; + const int* output0_tm_4 = output0_tm_0 + tiles * 4; + const int* output0_tm_5 = output0_tm_0 + tiles * 5; + + int* output0 = out0.row(i * 4) + j * 4; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + // TODO msa optimize + for (int m = 0; m < 5; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; + tmp[1][m] = tmp13a + tmp13b * 2; + tmp[2][m] = tmp02a + tmp02b * 4; + tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; + tmp[1][m] = (tmp13a + tmp13b * 2) * 4; + tmp[2][m] = (tmp02a + tmp02b * 4) * 4; + tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + const int* tmp0 = tmp[m]; + + int tmp02a = tmp0[1] + tmp0[2]; + int tmp13a = tmp0[1] - tmp0[2]; + + int tmp02b = tmp0[3] + tmp0[4]; + int tmp13b = tmp0[3] - tmp0[4]; + + output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; + output0[1] = (tmp13a + tmp13b * 2) / 576; + output0[2] = (tmp02a + tmp02b * 4) / 576; + output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; + + output0 += outw; + } + } + } + } + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/mips/convolution_3x3_pack8to4_int8.h b/src/layer/mips/convolution_3x3_pack8to4_int8.h new file mode 100644 index 000000000..9060f421c --- /dev/null +++ b/src/layer/mips/convolution_3x3_pack8to4_int8.h @@ -0,0 +1,629 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) +{ + // winograd42 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel_tm.channel(q); + const Mat k1 = kernel_tm.channel(q + 1); + const Mat k2 = kernel_tm.channel(q + 2); + const Mat k3 = kernel_tm.channel(q + 3); + + Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = kernel_tm.row(k); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int i = 0; i < 8; i++) + { + const short* k00 = k0.row(p + i); + const short* k10 = k1.row(p + i); + const short* k20 = k2.row(p + i); + const short* k30 = k3.row(p + i); + + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } + } +} + +static void conv3x3s1_winograd42_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tm = outw / 4 * 6; + int h_tm = outh / 4 * 6; + + const int tiles = w_tm / 6 * h_tm / 6; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + + // const float itm[4][4] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob_bordered.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6][8]; + + // tile + for (int i = 0; i < h_tm / 6; i++) + { + for (int j = 0; j < w_tm / 6; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; + + for (int m = 0; m < 6; m++) + { + v16i8 _r00_01 = __msa_ld_b(r0, 0); + v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0); + v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0); + v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0); + v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0); + v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0); + v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01); + v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01); + v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03); + v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03); + v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05); + v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05); + + v8i16 _v5 = __msa_fill_h(5); + + v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5)); + v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2)); + v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2)); + v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); + v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); + v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5)); + + __msa_st_h(_tmp0m, tmp[0][m], 0); + __msa_st_h(_tmp1m, tmp[1][m], 0); + __msa_st_h(_tmp2m, tmp[2][m], 0); + __msa_st_h(_tmp3m, tmp[3][m], 0); + __msa_st_h(_tmp4m, tmp[4][m], 0); + __msa_st_h(_tmp5m, tmp[5][m], 0); + + r0 += w * 8; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8; + short* r0_tm_1 = r0_tm_0 + tiles * 8; + short* r0_tm_2 = r0_tm_0 + tiles * 16; + short* r0_tm_3 = r0_tm_0 + tiles * 24; + short* r0_tm_4 = r0_tm_0 + tiles * 32; + short* r0_tm_5 = r0_tm_0 + tiles * 40; + + for (int m = 0; m < 6; m++) + { + v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0); + v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0); + v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0); + v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0); + v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0); + v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0); + + v8i16 _v5 = __msa_fill_h(5); + + v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5)); + v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2)); + v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2)); + v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); + v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); + v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5)); + + __msa_st_h(_r0tm0, r0_tm_0, 0); + __msa_st_h(_r0tm1, r0_tm_1, 0); + __msa_st_h(_r0tm2, r0_tm_2, 0); + __msa_st_h(_r0tm3, r0_tm_3, 0); + __msa_st_h(_r0tm4, r0_tm_4, 0); + __msa_st_h(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 48; + r0_tm_1 += tiles * 48; + r0_tm_2 += tiles * 48; + r0_tm_3 += tiles * 48; + r0_tm_4 += tiles * 48; + r0_tm_5 += tiles * 48; + } + } + } + } + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + { + int w_tm = outw / 4 * 6; + int h_tm = outh / 4 * 6; + + const int tiles = h_tm / 6 * w_tm / 6; + + // permute + // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < 36; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + v8i16 _r0 = __msa_ld_h(r0, 0); + v8i16 _r1 = __msa_ld_h(r0 + 8, 0); + __msa_st_h(_r0, tmpptr, 0); + __msa_st_h(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + v8i16 _r0 = __msa_ld_h(r0, 0); + __msa_st_h(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, 36, outch, 4u * 4, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < 36; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + for (int j = 0; j < nn; j++) + { + v8i16 _w0 = __msa_ld_h(k0, 0); + v8i16 _w1 = __msa_ld_h(k0 + 8, 0); + v8i16 _w2 = __msa_ld_h(k0 + 16, 0); + v8i16 _w3 = __msa_ld_h(k0 + 24, 0); + + v8i16 _extw0 = __msa_clti_s_h(_w0, 0); + v8i16 _extw1 = __msa_clti_s_h(_w1, 0); + v8i16 _extw2 = __msa_clti_s_h(_w2, 0); + v8i16 _extw3 = __msa_clti_s_h(_w3, 0); + + v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); + v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); + v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); + v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); + v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); + v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); + v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); + v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); + + v4i32 _val0_0 = __msa_fill_w(r0[0]); + v4i32 _val0_1 = __msa_fill_w(r0[1]); + v4i32 _val0_2 = __msa_fill_w(r0[2]); + v4i32 _val0_3 = __msa_fill_w(r0[3]); + v4i32 _val0_4 = __msa_fill_w(r0[4]); + v4i32 _val0_5 = __msa_fill_w(r0[5]); + v4i32 _val0_6 = __msa_fill_w(r0[6]); + v4i32 _val0_7 = __msa_fill_w(r0[7]); + v4i32 _val1_0 = __msa_fill_w(r0[8]); + v4i32 _val1_1 = __msa_fill_w(r0[9]); + v4i32 _val1_2 = __msa_fill_w(r0[10]); + v4i32 _val1_3 = __msa_fill_w(r0[11]); + v4i32 _val1_4 = __msa_fill_w(r0[12]); + v4i32 _val1_5 = __msa_fill_w(r0[13]); + v4i32 _val1_6 = __msa_fill_w(r0[14]); + v4i32 _val1_7 = __msa_fill_w(r0[15]); + + _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0); + _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1); + _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0); + _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1); + _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2); + _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3); + _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2); + _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3); + _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4); + _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5); + _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4); + _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5); + _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6); + _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7); + _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6); + _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + + __msa_st_w(_sum0, output0_tm, 0); + __msa_st_w(_sum2, output0_tm + 4, 0); + + output0_tm += 8; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + + for (int j = 0; j < nn; j++) + { + v8i16 _w0 = __msa_ld_h(k0, 0); + v8i16 _w1 = __msa_ld_h(k0 + 8, 0); + v8i16 _w2 = __msa_ld_h(k0 + 16, 0); + v8i16 _w3 = __msa_ld_h(k0 + 24, 0); + + v8i16 _extw0 = __msa_clti_s_h(_w0, 0); + v8i16 _extw1 = __msa_clti_s_h(_w1, 0); + v8i16 _extw2 = __msa_clti_s_h(_w2, 0); + v8i16 _extw3 = __msa_clti_s_h(_w3, 0); + + v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); + v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); + v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); + v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); + v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); + v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); + v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); + v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); + + v4i32 _val0 = __msa_fill_w(r0[0]); + v4i32 _val1 = __msa_fill_w(r0[1]); + v4i32 _val2 = __msa_fill_w(r0[2]); + v4i32 _val3 = __msa_fill_w(r0[3]); + v4i32 _val4 = __msa_fill_w(r0[4]); + v4i32 _val5 = __msa_fill_w(r0[5]); + v4i32 _val6 = __msa_fill_w(r0[6]); + v4i32 _val7 = __msa_fill_w(r0[7]); + + _sum0 = __msa_maddv_w(_sum0, _w0l, _val0); + _sum1 = __msa_maddv_w(_sum1, _w0h, _val1); + _sum0 = __msa_maddv_w(_sum0, _w1l, _val2); + _sum1 = __msa_maddv_w(_sum1, _w1h, _val3); + _sum0 = __msa_maddv_w(_sum0, _w2l, _val4); + _sum1 = __msa_maddv_w(_sum1, _w2h, _val5); + _sum0 = __msa_maddv_w(_sum0, _w3l, _val6); + _sum1 = __msa_maddv_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + + __msa_st_w(_sum0, output0_tm, 0); + output0_tm += 4; + } + } + } + } + bottom_blob_tm = Mat(); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); + } + { + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + int w_tm = outw / 4 * 6; + int h_tm = outh / 4 * 6; + const int tiles = w_tm / 6 * h_tm / 6; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob_bordered.channel(p); + + int tmp[4][6][4]; + + // tile + for (int i = 0; i < outh / 4; i++) + { + for (int j = 0; j < outw / 4; j++) + { + // top_blob_tm.create(tiles, 36, outch, elemsize, elempack); + + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 4; + const int* output0_tm_1 = output0_tm_0 + tiles * 4; + const int* output0_tm_2 = output0_tm_0 + tiles * 8; + const int* output0_tm_3 = output0_tm_0 + tiles * 12; + const int* output0_tm_4 = output0_tm_0 + tiles * 16; + const int* output0_tm_5 = output0_tm_0 + tiles * 20; + + int* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 5; m++) + { + v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0); + v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0); + v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0); + v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0); + v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0); + v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0); + + v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2); + v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2); + + v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4); + v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4); + + v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b); + v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1)); + v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2)); + v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3)); + + __msa_st_w(_tmp0m, tmp[0][m], 0); + __msa_st_w(_tmp1m, tmp[1][m], 0); + __msa_st_w(_tmp2m, tmp[2][m], 0); + __msa_st_w(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + for (int m = 5; m < 6; m++) + { + v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0); + v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0); + v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0); + v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0); + v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0); + v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0); + + v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2); + v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2); + + v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4); + v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4); + + v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b); + v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1)); + v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2)); + v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3)); + + _tmp0m = __msa_slli_w(_tmp0m, 2); + _tmp1m = __msa_slli_w(_tmp1m, 2); + _tmp2m = __msa_slli_w(_tmp2m, 2); + _tmp3m = __msa_slli_w(_tmp3m, 2); + + __msa_st_w(_tmp0m, tmp[0][m], 0); + __msa_st_w(_tmp1m, tmp[1][m], 0); + __msa_st_w(_tmp2m, tmp[2][m], 0); + __msa_st_w(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + + for (int m = 0; m < 4; m++) + { + v4i32 _tmp00 = __msa_ld_w(tmp[m][0], 0); + v4i32 _tmp01 = __msa_ld_w(tmp[m][1], 0); + v4i32 _tmp02 = __msa_ld_w(tmp[m][2], 0); + v4i32 _tmp03 = __msa_ld_w(tmp[m][3], 0); + v4i32 _tmp04 = __msa_ld_w(tmp[m][4], 0); + v4i32 _tmp05 = __msa_ld_w(tmp[m][5], 0); + + v4i32 _tmp02a = __msa_addv_w(_tmp01, _tmp02); + v4i32 _tmp13a = __msa_subv_w(_tmp01, _tmp02); + + v4i32 _tmp02b = __msa_addv_w(_tmp03, _tmp04); + v4i32 _tmp13b = __msa_subv_w(_tmp03, _tmp04); + + v4i32 _out00 = __msa_addv_w(__msa_addv_w(_tmp00, _tmp02a), _tmp02b); + v4i32 _out01 = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1)); + v4i32 _out02 = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2)); + v4i32 _out03 = __msa_addv_w(__msa_addv_w(_tmp05, _tmp13a), __msa_slli_w(_tmp13b, 3)); + + // TODO use integer trick for division by 576 + v4f32 _v576 = __msa_fill_w_f32(1.0 / 576); + _out00 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out00), _v576)); + _out01 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out01), _v576)); + _out02 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out02), _v576)); + _out03 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out03), _v576)); + + __msa_st_w(_out00, output0, 0); + __msa_st_w(_out01, output0 + 4, 0); + __msa_st_w(_out02, output0 + 8, 0); + __msa_st_w(_out03, output0 + 12, 0); + + output0 += outw * 4; + } + } + } + } + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/mips/convolution_int8.h b/src/layer/mips/convolution_int8.h new file mode 100644 index 000000000..6b56d11eb --- /dev/null +++ b/src/layer/mips/convolution_int8.h @@ -0,0 +1,82 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + // const signed char* kptr = weight_data_int8.channel(p); + const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + kptr += maxk; + } + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp index f9d37028d..a3e012c57 100644 --- a/src/layer/mips/convolution_mips.cpp +++ b/src/layer/mips/convolution_mips.cpp @@ -32,6 +32,12 @@ namespace ncnn { #include "convolution_sgemm.h" #include "convolution_1x1.h" +#if NCNN_INT8 +#include "convolution_sgemm_int8.h" +#include "convolution_1x1_int8.h" +#include "convolution_int8.h" +#endif // NCNN_INT8 + #if __mips_msa #include "convolution_pack4.h" #include "convolution_pack1to4.h" @@ -44,6 +50,20 @@ namespace ncnn { #include "convolution_3x3_pack4.h" #include "convolution_3x3_pack1to4.h" #include "convolution_7x7_pack1to4.h" + +#if NCNN_INT8 +#include "convolution_pack8to4_int8.h" +#include "convolution_pack1to4_int8.h" +#include "convolution_pack8to1_int8.h" +#include "convolution_sgemm_pack8to4_int8.h" +#include "convolution_sgemm_pack1to4_int8.h" +#include "convolution_sgemm_pack8to1_int8.h" +#include "convolution_1x1_pack8to4_int8.h" +#include "convolution_1x1_pack1to4_int8.h" +#include "convolution_1x1_pack8to1_int8.h" +#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_pack8to1_int8.h" +#endif // NCNN_INT8 #endif // __mips_msa Convolution_mips::Convolution_mips() @@ -98,6 +118,13 @@ int Convolution_mips::create_pipeline(const Option& opt) activation = create_activation_layer(activation_type, activation_params, opt); +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_mips(opt); + } +#endif + const int maxk = kernel_w * kernel_h; const int num_input = weight_data_size / maxk / num_output; @@ -117,8 +144,8 @@ int Convolution_mips::create_pipeline(const Option& opt) { if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16) { - conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_data_packed, num_input, num_output, opt); - conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data_packed, num_input, num_output, opt); + conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd64_data, num_input, num_output, opt); + conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); } else { @@ -187,27 +214,7 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8_mips(bottom_blob, top_blob, opt); } #endif @@ -278,11 +285,11 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio // we need more proper conditions if ((w <= 10 || (w >= 15 && w <= 18) || w == 21 || w == 22) && (h <= 10 || (h >= 15 && h <= 18) || h == 21 || h == 22)) { - conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data_packed, bias_data, opt); + conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data, bias_data, opt); } else { - conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); + conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt); } if (activation) @@ -542,4 +549,408 @@ int Convolution_mips::forward(const std::vector& bottom_blobs, std::vector< return 0; } +#if NCNN_INT8 +static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_int8, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pa-pb-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_int8.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g00 = weight_data_int8.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + for (int j = 0; j < elempack; j++) + { + const signed char* k00 = weight_data_r2.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_mips::create_pipeline_int8_mips(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + elempack = num_input % 8 == 0 ? 8 : 1; + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __mips_msa + +#if __mips_msa + if (elempack == 8 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __mips_msa + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + } + + return 0; +} + +int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __mips_msa + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; +#if __mips_msa + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 4 == 0 ? 4 : 1; + } +#endif // __mips_msa + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + +#if __mips_msa + if (elempack == 8 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd42_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + Mat scale_in_data(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + + if (elempack == 1 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + Mat scale_in_data(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + + if (elempack == 8 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd42_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + Mat scale_in_data(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } +#endif // __mips_msa + + if (elempack == 1 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + // convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + Mat scale_in_data(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/mips/convolution_mips.h b/src/layer/mips/convolution_mips.h index acb5394d8..69d442d2d 100644 --- a/src/layer/mips/convolution_mips.h +++ b/src/layer/mips/convolution_mips.h @@ -31,12 +31,27 @@ public: virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +protected: +#if NCNN_INT8 + int create_pipeline_int8_mips(const Option& opt); + int forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; - // packn + Mat weight_sgemm_data; + + Mat weight_3x3_winograd42_data; + Mat weight_3x3_winograd64_data; + + // pack4 Mat weight_data_packed; - Mat weight_3x3_winograd42_data_packed; + +#if NCNN_INT8 + // int8 + Mat weight_data_int8; +#endif }; } // namespace ncnn diff --git a/src/layer/mips/convolution_pack1to4_int8.h b/src/layer/mips/convolution_pack1to4_int8.h new file mode 100644 index 000000000..b752bda0b --- /dev/null +++ b/src/layer/mips/convolution_pack1to4_int8.h @@ -0,0 +1,87 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + v4i32 _sum = __msa_fill_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + v8i16 _val = __msa_fill_h((short)sptr[space_ofs[k]]); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val, _w16); + v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); + + _sum = __msa_addv_w(_sum, _s032); + + kptr += 4; + } + } + + __msa_st_w(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/mips/convolution_pack4to1.h b/src/layer/mips/convolution_pack4to1.h index efdea27d0..8269860b2 100644 --- a/src/layer/mips/convolution_pack4to1.h +++ b/src/layer/mips/convolution_pack4to1.h @@ -81,7 +81,7 @@ static void convolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, cons } } - sum += __msa_fhadd_w(_sum); + sum += __msa_reduce_fadd_w(_sum); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/mips/convolution_pack8to1_int8.h b/src/layer/mips/convolution_pack8to1_int8.h new file mode 100644 index 000000000..1f9ad5a6c --- /dev/null +++ b/src/layer/mips/convolution_pack8to1_int8.h @@ -0,0 +1,87 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + v4i32 _sum = __msa_fill_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val16, _w16); + + _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0)); + + kptr += 8; + } + } + + outptr[j] = __msa_reduce_add_w(_sum); + } + + outptr += outw; + } + } +} diff --git a/src/layer/mips/convolution_pack8to4_int8.h b/src/layer/mips/convolution_pack8to4_int8.h new file mode 100644 index 000000000..50636f82d --- /dev/null +++ b/src/layer/mips/convolution_pack8to4_int8.h @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _w23 = __msa_ld_b(kptr + 16, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v16i8 _extw23 = __msa_clti_s_b(_w23, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); + v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); + + v8i16 _s0 = __msa_mulv_h(_val16, _w0); + v8i16 _s1 = __msa_mulv_h(_val16, _w1); + v8i16 _s2 = __msa_mulv_h(_val16, _w2); + v8i16 _s3 = __msa_mulv_h(_val16, _w3); + + _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); + _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); + _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2)); + _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3)); + + kptr += 32; + } + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum1, _sum0); + _tmp1 = __msa_ilvr_w(_sum3, _sum2); + _tmp2 = __msa_ilvl_w(_sum1, _sum0); + _tmp3 = __msa_ilvl_w(_sum3, _sum2); + _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + + _sum0 = __msa_addv_w(_sum0, _sum2); + + __msa_st_w(_sum0, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/mips/convolution_sgemm_int8.h b/src/layer/mips/convolution_sgemm_int8.h new file mode 100644 index 000000000..297c4b1b0 --- /dev/null +++ b/src/layer/mips/convolution_sgemm_int8.h @@ -0,0 +1,731 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; +#if __mips_msa + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } +#else // __mips_msa + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < size; i++) + { + signed char* tmpptr = tmp.channel(i); + + int q = 0; + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } +#endif // __mips_msa + + int nn_outch = 0; + int remain_outch_start = 0; + +#if __mips_msa + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + v4i32 _sum00 = __msa_fill_w(0); + v4i32 _sum10 = __msa_fill_w(0); + + if (nn4 > 0) + { + v4i32 _sum01 = __msa_fill_w(0); + v4i32 _sum02 = __msa_fill_w(0); + v4i32 _sum03 = __msa_fill_w(0); + v4i32 _sum11 = __msa_fill_w(0); + v4i32 _sum12 = __msa_fill_w(0); + v4i32 _sum13 = __msa_fill_w(0); + + int j = 0; + for (; j < nn4; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01); + v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + + v8i16 _s00 = __msa_mulv_h(_val0, _w0); + v8i16 _s01 = __msa_mulv_h(_val0, _w1); + v8i16 _s10 = __msa_mulv_h(_val1, _w0); + v8i16 _s11 = __msa_mulv_h(_val1, _w1); + + v8i16 _exts00 = __msa_clti_s_h(_s00, 0); + v8i16 _exts01 = __msa_clti_s_h(_s01, 0); + v8i16 _exts10 = __msa_clti_s_h(_s10, 0); + v8i16 _exts11 = __msa_clti_s_h(_s11, 0); + v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00); + v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00); + v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01); + v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01); + v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10); + v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10); + v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11); + v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11); + + _sum00 = __msa_addv_w(_sum00, _s00l); + _sum01 = __msa_addv_w(_sum01, _s00h); + _sum02 = __msa_addv_w(_sum02, _s01l); + _sum03 = __msa_addv_w(_sum03, _s01h); + _sum10 = __msa_addv_w(_sum10, _s10l); + _sum11 = __msa_addv_w(_sum11, _s10h); + _sum12 = __msa_addv_w(_sum12, _s11l); + _sum13 = __msa_addv_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum01, _sum00); + _tmp1 = __msa_ilvr_w(_sum03, _sum02); + _tmp2 = __msa_ilvl_w(_sum01, _sum00); + _tmp3 = __msa_ilvl_w(_sum03, _sum02); + _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum11, _sum10); + _tmp1 = __msa_ilvr_w(_sum13, _sum12); + _tmp2 = __msa_ilvl_w(_sum11, _sum10); + _tmp3 = __msa_ilvl_w(_sum13, _sum12); + _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum00 = __msa_addv_w(_sum00, _sum01); + _sum02 = __msa_addv_w(_sum02, _sum03); + _sum10 = __msa_addv_w(_sum10, _sum11); + _sum12 = __msa_addv_w(_sum12, _sum13); + + _sum00 = __msa_addv_w(_sum00, _sum02); + _sum10 = __msa_addv_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + v8i16 _val0 = __msa_fill_h(tmpptr[0]); + v8i16 _val1 = __msa_fill_h(tmpptr[1]); + v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16); + + v8i16 _s0 = __msa_mulv_h(_val, _w16); + v8i16 _exts0 = __msa_clti_s_h(_s0, 0); + v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); + v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); + + _sum00 = __msa_addv_w(_sum00, _s0l); + _sum10 = __msa_addv_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + int sum[8]; + __msa_st_w(_sum00, sum, 0); + __msa_st_w(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + v4i32 _sum0 = __msa_fill_w(0); + + if (nn4 > 0) + { + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + int j = 0; + for (; j < nn4; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + + v8i16 _s0 = __msa_mulv_h(_val16, _w0); + v8i16 _s1 = __msa_mulv_h(_val16, _w1); + + v8i16 _exts0 = __msa_clti_s_h(_s0, 0); + v8i16 _exts1 = __msa_clti_s_h(_s1, 0); + v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); + v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); + v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1); + v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1); + + _sum0 = __msa_addv_w(_sum0, _s0l); + _sum1 = __msa_addv_w(_sum1, _s0h); + _sum2 = __msa_addv_w(_sum2, _s1l); + _sum3 = __msa_addv_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum1, _sum0); + _tmp1 = __msa_ilvr_w(_sum3, _sum2); + _tmp2 = __msa_ilvl_w(_sum1, _sum0); + _tmp3 = __msa_ilvl_w(_sum3, _sum2); + _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + _sum0 = __msa_addv_w(_sum0, _sum2); + } + + int j = 0; + for (; j < nn1; j++) + { + v8i16 _val = __msa_fill_h(tmpptr[0]); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val, _w16); + v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); + + _sum0 = __msa_addv_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + int sum[4]; + __msa_st_w(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + remain_outch_start += nn_outch << 2; +#endif // __mips_msa + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; +#if __mips_msa + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + int sum0 = 0; + int sum1 = 0; + + if (nn4 > 0) + { + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + + int j = 0; + for (; j < nn4; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16); + + v8i16 _s0 = __msa_mulv_h(_val16, _w16); + v8i16 _exts0 = __msa_clti_s_h(_s0, 0); + v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); + v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); + + _sum0 = __msa_addv_w(_sum0, _s0l); + _sum1 = __msa_addv_w(_sum1, _s0h); + + tmpptr += 8; + kptr += 4; + } + + sum0 = _sum0[0] + _sum0[1] + _sum0[2] + _sum0[3]; + sum1 = _sum1[0] + _sum1[1] + _sum1[2] + _sum1[3]; + } + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w = kptr[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + tmpptr += 2; + kptr += 1; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + int sum = 0; + + if (nn4 > 0) + { + v4i32 _sum = __msa_fill_w(0); + + int j = 0; + for (; j < nn4; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val16, _w16); + v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); + + _sum = __msa_addv_w(_sum, _s032); + + tmpptr += 4; + kptr += 4; + } + + sum = _sum[0] + _sum[1] + _sum[2] + _sum[3]; + } + + int j = 0; + for (; j < nn1; j++) + { + signed char val = tmpptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + tmpptr += 1; + kptr += 1; + } + + outptr0[0] = sum; + outptr0 += 1; + } +#else // __mips_msa + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i); + const signed char* kptr = kernel.channel(p); + + int nn1 = inch * maxk; + + int sum = 0; + int j = 0; + for (; j < nn1; j++) + { + signed char val = tmpptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + tmpptr += 1; + kptr += 1; + } + + outptr0[0] = sum; + outptr0 += 1; + } +#endif // __mips_msa + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + +#if __mips_msa + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (outch >= 4) + { + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u); + } + else + { + if (inch >= 4) + kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u); + else + kernel_tm.create(1 * maxk, inch, outch, (size_t)1u); + } + + int q = 0; + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + // TODO unroll 2 + for (; q < outch; q++) + { + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k00 = kernel.channel(q).row(p); + + g00[0] = k00[k]; + + g00++; + } + } + } +#else // __mips_msa + kernel_tm = _kernel.reshape(maxk, inch, outch); +#endif // __mips_msa +} + +static void convolution_im2col_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_sgemm_pack1to4_int8.h b/src/layer/mips/convolution_sgemm_pack1to4_int8.h new file mode 100644 index 000000000..34ea59f58 --- /dev/null +++ b/src/layer/mips/convolution_sgemm_pack1to4_int8.h @@ -0,0 +1,477 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + v4i32 _sum00 = __msa_fill_w(0); + v4i32 _sum10 = __msa_fill_w(0); + + if (nn4 > 0) + { + v4i32 _sum01 = __msa_fill_w(0); + v4i32 _sum02 = __msa_fill_w(0); + v4i32 _sum03 = __msa_fill_w(0); + v4i32 _sum11 = __msa_fill_w(0); + v4i32 _sum12 = __msa_fill_w(0); + v4i32 _sum13 = __msa_fill_w(0); + + int j = 0; + for (; j < nn4; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01); + v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + + v8i16 _s00 = __msa_mulv_h(_val0, _w0); + v8i16 _s01 = __msa_mulv_h(_val0, _w1); + v8i16 _s10 = __msa_mulv_h(_val1, _w0); + v8i16 _s11 = __msa_mulv_h(_val1, _w1); + + v8i16 _exts00 = __msa_clti_s_h(_s00, 0); + v8i16 _exts01 = __msa_clti_s_h(_s01, 0); + v8i16 _exts10 = __msa_clti_s_h(_s10, 0); + v8i16 _exts11 = __msa_clti_s_h(_s11, 0); + v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00); + v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00); + v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01); + v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01); + v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10); + v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10); + v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11); + v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11); + + _sum00 = __msa_addv_w(_sum00, _s00l); + _sum01 = __msa_addv_w(_sum01, _s00h); + _sum02 = __msa_addv_w(_sum02, _s01l); + _sum03 = __msa_addv_w(_sum03, _s01h); + _sum10 = __msa_addv_w(_sum10, _s10l); + _sum11 = __msa_addv_w(_sum11, _s10h); + _sum12 = __msa_addv_w(_sum12, _s11l); + _sum13 = __msa_addv_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum01, _sum00); + _tmp1 = __msa_ilvr_w(_sum03, _sum02); + _tmp2 = __msa_ilvl_w(_sum01, _sum00); + _tmp3 = __msa_ilvl_w(_sum03, _sum02); + _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum11, _sum10); + _tmp1 = __msa_ilvr_w(_sum13, _sum12); + _tmp2 = __msa_ilvl_w(_sum11, _sum10); + _tmp3 = __msa_ilvl_w(_sum13, _sum12); + _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum00 = __msa_addv_w(_sum00, _sum01); + _sum02 = __msa_addv_w(_sum02, _sum03); + _sum10 = __msa_addv_w(_sum10, _sum11); + _sum12 = __msa_addv_w(_sum12, _sum13); + + _sum00 = __msa_addv_w(_sum00, _sum02); + _sum10 = __msa_addv_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + v8i16 _val0 = __msa_fill_h(tmpptr[0]); + v8i16 _val1 = __msa_fill_h(tmpptr[1]); + v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16); + + v8i16 _s0 = __msa_mulv_h(_val, _w16); + v8i16 _exts0 = __msa_clti_s_h(_s0, 0); + v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); + v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); + + _sum00 = __msa_addv_w(_sum00, _s0l); + _sum10 = __msa_addv_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + __msa_st_w(_sum00, outptr0, 0); + __msa_st_w(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + v4i32 _sum0 = __msa_fill_w(0); + + if (nn4 > 0) + { + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + int j = 0; + for (; j < nn4; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + + v8i16 _s0 = __msa_mulv_h(_val16, _w0); + v8i16 _s1 = __msa_mulv_h(_val16, _w1); + + v8i16 _exts0 = __msa_clti_s_h(_s0, 0); + v8i16 _exts1 = __msa_clti_s_h(_s1, 0); + v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); + v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); + v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1); + v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1); + + _sum0 = __msa_addv_w(_sum0, _s0l); + _sum1 = __msa_addv_w(_sum1, _s0h); + _sum2 = __msa_addv_w(_sum2, _s1l); + _sum3 = __msa_addv_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum1, _sum0); + _tmp1 = __msa_ilvr_w(_sum3, _sum2); + _tmp2 = __msa_ilvl_w(_sum1, _sum0); + _tmp3 = __msa_ilvl_w(_sum3, _sum2); + _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + _sum0 = __msa_addv_w(_sum0, _sum2); + } + + int j = 0; + for (; j < nn1; j++) + { + v8i16 _val = __msa_fill_h(tmpptr[0]); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val, _w16); + v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); + + _sum0 = __msa_addv_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + __msa_st_w(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row(p); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_sgemm_pack4to1.h b/src/layer/mips/convolution_sgemm_pack4to1.h index 2ceef6c06..281293602 100644 --- a/src/layer/mips/convolution_sgemm_pack4to1.h +++ b/src/layer/mips/convolution_sgemm_pack4to1.h @@ -550,7 +550,7 @@ static void im2col_sgemm_pack4to1_msa(const Mat& bottom_im2col, Mat& top_blob, c kptr0 += 4; } - sum0 += __msa_fhadd_w(_sum0); + sum0 += __msa_reduce_fadd_w(_sum0); outptr0[0] = sum0; diff --git a/src/layer/mips/convolution_sgemm_pack8to1_int8.h b/src/layer/mips/convolution_sgemm_pack8to1_int8.h new file mode 100644 index 000000000..536517cfd --- /dev/null +++ b/src/layer/mips/convolution_sgemm_pack8to1_int8.h @@ -0,0 +1,450 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + v16i8 _v = __msa_ld_b(img0, 0); + __msa_st_b(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + v4i32 _sum00 = __msa_fill_w(0); + v4i32 _sum01 = __msa_fill_w(0); + v4i32 _sum02 = __msa_fill_w(0); + v4i32 _sum03 = __msa_fill_w(0); + v4i32 _sum10 = __msa_fill_w(0); + v4i32 _sum11 = __msa_fill_w(0); + v4i32 _sum12 = __msa_fill_w(0); + v4i32 _sum13 = __msa_fill_w(0); + + int j = 0; + for (; j < nn; j++) + { + v16i8 _val01 = __msa_ld_b(tmpptr, 0); + v16i8 _extval01 = __msa_clti_s_b(_val01, 0); + v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01); + v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _w23 = __msa_ld_b(kptr + 16, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v16i8 _extw23 = __msa_clti_s_b(_w23, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); + v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); + + v8i16 _s00 = __msa_mulv_h(_val0, _w0); + v8i16 _s01 = __msa_mulv_h(_val0, _w1); + v8i16 _s02 = __msa_mulv_h(_val0, _w2); + v8i16 _s03 = __msa_mulv_h(_val0, _w3); + v8i16 _s10 = __msa_mulv_h(_val1, _w0); + v8i16 _s11 = __msa_mulv_h(_val1, _w1); + v8i16 _s12 = __msa_mulv_h(_val1, _w2); + v8i16 _s13 = __msa_mulv_h(_val1, _w3); + + _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00)); + _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01)); + _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02)); + _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03)); + _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10)); + _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11)); + _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12)); + _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum01, _sum00); + _tmp1 = __msa_ilvr_w(_sum03, _sum02); + _tmp2 = __msa_ilvl_w(_sum01, _sum00); + _tmp3 = __msa_ilvl_w(_sum03, _sum02); + _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum11, _sum10); + _tmp1 = __msa_ilvr_w(_sum13, _sum12); + _tmp2 = __msa_ilvl_w(_sum11, _sum10); + _tmp3 = __msa_ilvl_w(_sum13, _sum12); + _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum00 = __msa_addv_w(_sum00, _sum01); + _sum02 = __msa_addv_w(_sum02, _sum03); + _sum10 = __msa_addv_w(_sum10, _sum11); + _sum12 = __msa_addv_w(_sum12, _sum13); + + _sum00 = __msa_addv_w(_sum00, _sum02); + _sum10 = __msa_addv_w(_sum10, _sum12); + + int sum[8]; + __msa_st_w(_sum00, sum, 0); + __msa_st_w(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + int j = 0; + for (; j < nn; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _w23 = __msa_ld_b(kptr + 16, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v16i8 _extw23 = __msa_clti_s_b(_w23, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); + v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); + + v8i16 _s0 = __msa_mulv_h(_val16, _w0); + v8i16 _s1 = __msa_mulv_h(_val16, _w1); + v8i16 _s2 = __msa_mulv_h(_val16, _w2); + v8i16 _s3 = __msa_mulv_h(_val16, _w3); + + _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); + _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); + _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2)); + _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum1, _sum0); + _tmp1 = __msa_ilvr_w(_sum3, _sum2); + _tmp2 = __msa_ilvl_w(_sum1, _sum0); + _tmp3 = __msa_ilvl_w(_sum3, _sum2); + _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + + _sum0 = __msa_addv_w(_sum0, _sum2); + + int sum[4]; + __msa_st_w(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + + int j = 0; + for (; j < nn; j++) + { + v16i8 _val01 = __msa_ld_b(tmpptr, 0); + v16i8 _extval01 = __msa_clti_s_b(_val01, 0); + v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01); + v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val0, _w16); + v8i16 _s1 = __msa_mulv_h(_val1, _w16); + + _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); + _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); + + tmpptr += 16; + kptr += 8; + } + + outptr0[0] = __msa_reduce_add_w(_sum0); + outptr0[1] = __msa_reduce_add_w(_sum1); + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + v4i32 _sum = __msa_fill_w(0); + + int j = 0; + for (; j < nn; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w = __msa_ld_b(kptr, 0); + v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); + + v8i16 _s0 = __msa_mulv_h(_val16, _w16); + + _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0)); + + tmpptr += 8; + kptr += 8; + } + + outptr0[0] = __msa_reduce_add_w(_sum); + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (outch >= 4) + kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + // TODO unroll 2 + for (; q < outch; q++) + { + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/mips/convolution_sgemm_pack8to4_int8.h b/src/layer/mips/convolution_sgemm_pack8to4_int8.h new file mode 100644 index 000000000..fdfa442b0 --- /dev/null +++ b/src/layer/mips/convolution_sgemm_pack8to4_int8.h @@ -0,0 +1,320 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + v16i8 _v = __msa_ld_b(img0, 0); + __msa_st_b(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + v4i32 _sum00 = __msa_fill_w(0); + v4i32 _sum01 = __msa_fill_w(0); + v4i32 _sum02 = __msa_fill_w(0); + v4i32 _sum03 = __msa_fill_w(0); + v4i32 _sum10 = __msa_fill_w(0); + v4i32 _sum11 = __msa_fill_w(0); + v4i32 _sum12 = __msa_fill_w(0); + v4i32 _sum13 = __msa_fill_w(0); + + int j = 0; + for (; j < nn; j++) + { + v16i8 _val01 = __msa_ld_b(tmpptr, 0); + v16i8 _extval01 = __msa_clti_s_b(_val01, 0); + v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01); + v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _w23 = __msa_ld_b(kptr + 16, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v16i8 _extw23 = __msa_clti_s_b(_w23, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); + v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); + + v8i16 _s00 = __msa_mulv_h(_val0, _w0); + v8i16 _s01 = __msa_mulv_h(_val0, _w1); + v8i16 _s02 = __msa_mulv_h(_val0, _w2); + v8i16 _s03 = __msa_mulv_h(_val0, _w3); + v8i16 _s10 = __msa_mulv_h(_val1, _w0); + v8i16 _s11 = __msa_mulv_h(_val1, _w1); + v8i16 _s12 = __msa_mulv_h(_val1, _w2); + v8i16 _s13 = __msa_mulv_h(_val1, _w3); + + _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00)); + _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01)); + _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02)); + _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03)); + _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10)); + _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11)); + _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12)); + _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum01, _sum00); + _tmp1 = __msa_ilvr_w(_sum03, _sum02); + _tmp2 = __msa_ilvl_w(_sum01, _sum00); + _tmp3 = __msa_ilvl_w(_sum03, _sum02); + _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum11, _sum10); + _tmp1 = __msa_ilvr_w(_sum13, _sum12); + _tmp2 = __msa_ilvl_w(_sum11, _sum10); + _tmp3 = __msa_ilvl_w(_sum13, _sum12); + _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum00 = __msa_addv_w(_sum00, _sum01); + _sum02 = __msa_addv_w(_sum02, _sum03); + _sum10 = __msa_addv_w(_sum10, _sum11); + _sum12 = __msa_addv_w(_sum12, _sum13); + + _sum00 = __msa_addv_w(_sum00, _sum02); + _sum10 = __msa_addv_w(_sum10, _sum12); + + __msa_st_w(_sum00, outptr0, 0); + __msa_st_w(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + v4i32 _sum0 = __msa_fill_w(0); + v4i32 _sum1 = __msa_fill_w(0); + v4i32 _sum2 = __msa_fill_w(0); + v4i32 _sum3 = __msa_fill_w(0); + + int j = 0; + for (; j < nn; j++) + { + v16i8 _val = __msa_ld_b(tmpptr, 0); + v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); + + v16i8 _w01 = __msa_ld_b(kptr, 0); + v16i8 _w23 = __msa_ld_b(kptr + 16, 0); + v16i8 _extw01 = __msa_clti_s_b(_w01, 0); + v16i8 _extw23 = __msa_clti_s_b(_w23, 0); + v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); + v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); + v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); + v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); + + v8i16 _s0 = __msa_mulv_h(_val16, _w0); + v8i16 _s1 = __msa_mulv_h(_val16, _w1); + v8i16 _s2 = __msa_mulv_h(_val16, _w2); + v8i16 _s3 = __msa_mulv_h(_val16, _w3); + + _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); + _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); + _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2)); + _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + v4i32 _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __msa_ilvr_w(_sum1, _sum0); + _tmp1 = __msa_ilvr_w(_sum3, _sum2); + _tmp2 = __msa_ilvl_w(_sum1, _sum0); + _tmp3 = __msa_ilvl_w(_sum3, _sum2); + _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); + _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); + _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); + } + + _sum0 = __msa_addv_w(_sum0, _sum1); + _sum2 = __msa_addv_w(_sum2, _sum3); + + _sum0 = __msa_addv_w(_sum0, _sum2); + + __msa_st_w(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/mips/deconvolution_pack4to1.h b/src/layer/mips/deconvolution_pack4to1.h index 655d39f0a..b825e1469 100644 --- a/src/layer/mips/deconvolution_pack4to1.h +++ b/src/layer/mips/deconvolution_pack4to1.h @@ -88,7 +88,7 @@ static void deconvolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, co kptr += maxk * 4; } - sum += __msa_fhadd_w(_sum); + sum += __msa_reduce_fadd_w(_sum); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/mips/innerproduct_mips.cpp b/src/layer/mips/innerproduct_mips.cpp index 9a3ce1875..a3ee4b40f 100644 --- a/src/layer/mips/innerproduct_mips.cpp +++ b/src/layer/mips/innerproduct_mips.cpp @@ -300,10 +300,10 @@ int InnerProduct_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opti } #if __mips_msa - sum0 += __msa_fhadd_w(_sum0); - sum1 += __msa_fhadd_w(_sum1); - sum2 += __msa_fhadd_w(_sum2); - sum3 += __msa_fhadd_w(_sum3); + sum0 += __msa_reduce_fadd_w(_sum0); + sum1 += __msa_reduce_fadd_w(_sum1); + sum2 += __msa_reduce_fadd_w(_sum2); + sum3 += __msa_reduce_fadd_w(_sum3); #endif // __mips_msa sum0 = activation_ss(sum0, activation_type, activation_params); diff --git a/src/layer/mips/mips_usability.h b/src/layer/mips/mips_usability.h index 40acd2f07..662320ee7 100644 --- a/src/layer/mips/mips_usability.h +++ b/src/layer/mips/mips_usability.h @@ -38,19 +38,25 @@ typedef union static const ncnn::FloatInt Name = {.f = Val} /* float type data load instructions */ -static inline v4f32 __msa_fill_w_f32(float val) +static NCNN_FORCEINLINE v4f32 __msa_fill_w_f32(float val) { ncnn::FloatInt fi_tmpval = {.f = val}; return (v4f32)__msa_fill_w(fi_tmpval.i); } -static inline float __msa_fhadd_w(v4f32 _v) +static NCNN_FORCEINLINE float __msa_reduce_fadd_w(v4f32 _v) { // TODO find a more efficient way return _v[0] + _v[1] + _v[2] + _v[3]; } -static inline int __msa_cfcmsa_msacsr() +static NCNN_FORCEINLINE int __msa_reduce_add_w(v4i32 _v) +{ + // TODO find a more efficient way + return _v[0] + _v[1] + _v[2] + _v[3]; +} + +static NCNN_FORCEINLINE int __msa_cfcmsa_msacsr() { int v; asm volatile("cfcmsa %0, $1 \n" @@ -60,7 +66,7 @@ static inline int __msa_cfcmsa_msacsr() return v; } -static inline void __msa_ctcmsa_msacsr(int v) +static NCNN_FORCEINLINE void __msa_ctcmsa_msacsr(int v) { asm volatile("ctcmsa $1, %0 \n" : @@ -69,7 +75,7 @@ static inline void __msa_ctcmsa_msacsr(int v) } #endif // __mips_msa -static inline signed char float2int8(float v) +static NCNN_FORCEINLINE signed char float2int8(float v) { int int32 = round(v); if (int32 > 127) return 127; @@ -78,7 +84,7 @@ static inline signed char float2int8(float v) } #if __mips_msa -static inline v16i8 float2int8(v4f32 _v) +static NCNN_FORCEINLINE v16i8 float2int8(v4f32 _v) { // simulate round to nearest via +/-0.5 v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); @@ -98,7 +104,7 @@ static inline v16i8 float2int8(v4f32 _v) return _v8; } -static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh) +static NCNN_FORCEINLINE int64_t float2int8(v4f32 _vlow, v4f32 _vhigh) { // simulate round to nearest via +/-0.5 v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); @@ -123,7 +129,7 @@ static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh) return _v8[0]; } -static inline v16i8 float2int8relu(v4f32 _v) +static NCNN_FORCEINLINE v16i8 float2int8relu(v4f32 _v) { // simulate round to nearest via +/-0.5 v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); @@ -143,7 +149,7 @@ static inline v16i8 float2int8relu(v4f32 _v) return _v8; } -static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh) +static NCNN_FORCEINLINE int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh) { // simulate round to nearest via +/-0.5 v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); @@ -168,7 +174,7 @@ static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh) return _v8[0]; } -static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope) +static NCNN_FORCEINLINE v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope) { v4f32 _v_leaky = __msa_fmul_w(_v, _slope); @@ -199,7 +205,7 @@ static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope) return _v8; } -static inline int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope) +static NCNN_FORCEINLINE int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope) { v4f32 _vlow_leaky = __msa_fmul_w(_vlow, _slope); v4f32 _vhigh_leaky = __msa_fmul_w(_vhigh, _slope);