* basic mips msa optimization for convolution int8 * mips msa optimization for convolution int8 gemm * mips msa optimization for convolution int8 winograd pack8to4/pack8to1 * mention msa maddv/msubv intrinsics bugtags/20220420
| @@ -578,12 +578,38 @@ You can upload binary inside `build-c906/examples` folder and run on D1 board fo | |||
| ### Build for Loongson 2K1000 | |||
| For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd bug. | |||
| For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd/fmsub/maddv/msubv bug. | |||
| Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd_w``` and apply changes as the following | |||
| Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub``` and apply changes as the following | |||
| ```c | |||
| // #define __msa_fmadd_w __builtin_msa_fmadd_w | |||
| // #define __msa_fmadd_d __builtin_msa_fmadd_d | |||
| // #define __msa_fmsub_w __builtin_msa_fmsub_w | |||
| // #define __msa_fmsub_d __builtin_msa_fmsub_d | |||
| #define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a) | |||
| #define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a) | |||
| #define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a) | |||
| #define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a) | |||
| ``` | |||
| find ```__msa_maddv``` and ```__msa_msubv``` and apply changes as the following | |||
| ```c | |||
| // #define __msa_maddv_b __builtin_msa_maddv_b | |||
| // #define __msa_maddv_h __builtin_msa_maddv_h | |||
| // #define __msa_maddv_w __builtin_msa_maddv_w | |||
| // #define __msa_maddv_d __builtin_msa_maddv_d | |||
| // #define __msa_msubv_b __builtin_msa_msubv_b | |||
| // #define __msa_msubv_h __builtin_msa_msubv_h | |||
| // #define __msa_msubv_w __builtin_msa_msubv_w | |||
| // #define __msa_msubv_d __builtin_msa_msubv_d | |||
| #define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a) | |||
| #define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a) | |||
| #define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a) | |||
| #define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a) | |||
| #define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a) | |||
| #define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a) | |||
| #define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a) | |||
| #define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a) | |||
| ``` | |||
| Build ncnn with mips msa and simpleocv enabled: | |||
| @@ -253,7 +253,7 @@ int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt | |||
| } | |||
| } | |||
| sum += __msa_fhadd_w(_sum); | |||
| sum += __msa_reduce_fadd_w(_sum); | |||
| sum = activation_ss(sum, activation_type, activation_params); | |||
| @@ -0,0 +1,83 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = w - 2 * outw + w; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const signed char* r0 = bottom_blob.channel(p); | |||
| signed char* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| outptr[2] = r0[4]; | |||
| outptr[3] = r0[6]; | |||
| r0 += 8; | |||
| outptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| r0 += 4; | |||
| outptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| r0 += 2; | |||
| outptr += 1; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -0,0 +1,83 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = w - 2 * outw + w; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const signed char* r0 = bottom_blob.channel(p); | |||
| signed char* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| outptr[2] = r0[4]; | |||
| outptr[3] = r0[6]; | |||
| r0 += 8; | |||
| outptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| r0 += 4; | |||
| outptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| r0 += 2; | |||
| outptr += 1; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -0,0 +1,65 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = w - 2 * outw + w; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const int64_t* r0 = bottom_blob.channel(p); | |||
| int64_t* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j < outw; j++) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| r0 += 2; | |||
| outptr += 1; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -0,0 +1,65 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = w - 2 * outw + w; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const int64_t* r0 = bottom_blob.channel(p); | |||
| int64_t* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j < outw; j++) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| r0 += 2; | |||
| outptr += 1; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -0,0 +1,731 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) | |||
| { | |||
| // winograd42 transform kernel | |||
| Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); | |||
| const short ktm[6][3] = { | |||
| {6, 0, 0}, | |||
| {-4, -4, -4}, | |||
| {-4, 4, -4}, | |||
| {1, 2, 4}, | |||
| {1, -2, 4}, | |||
| {0, 0, 6} | |||
| }; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; | |||
| short* kernel_tm0 = kernel_tm.channel(p).row<short>(q); | |||
| // transform kernel | |||
| const signed char* k0 = kernel0; | |||
| const signed char* k1 = kernel0 + 3; | |||
| const signed char* k2 = kernel0 + 6; | |||
| // h | |||
| short tmp[6][3]; | |||
| for (int i = 0; i < 6; i++) | |||
| { | |||
| tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; | |||
| tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; | |||
| tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; | |||
| } | |||
| // U | |||
| for (int j = 0; j < 6; j++) | |||
| { | |||
| short* tmpp = &tmp[j][0]; | |||
| for (int i = 0; i < 6; i++) | |||
| { | |||
| kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // interleave | |||
| // src = 36-inch-outch | |||
| // dst = 4b-8a-inch/8a-36-outch/4b | |||
| kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); | |||
| int p = 0; | |||
| for (; p + 3 < outch; p += 4) | |||
| { | |||
| const Mat k0 = kernel_tm.channel(p); | |||
| const Mat k1 = kernel_tm.channel(p + 1); | |||
| const Mat k2 = kernel_tm.channel(p + 2); | |||
| const Mat k3 = kernel_tm.channel(p + 3); | |||
| Mat g0 = kernel_tm_pack8to1.channel(p / 4); | |||
| for (int k = 0; k < 36; k++) | |||
| { | |||
| short* g00 = g0.row<short>(k); | |||
| for (int q = 0; q + 7 < inch; q += 8) | |||
| { | |||
| for (int i = 0; i < 8; i++) | |||
| { | |||
| g00[0] = k0.row<const short>(q + i)[k]; | |||
| g00[1] = k1.row<const short>(q + i)[k]; | |||
| g00[2] = k2.row<const short>(q + i)[k]; | |||
| g00[3] = k3.row<const short>(q + i)[k]; | |||
| g00 += 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| for (; p < outch; p++) | |||
| { | |||
| const Mat k0 = kernel_tm.channel(p); | |||
| Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); | |||
| for (int k = 0; k < 36; k++) | |||
| { | |||
| short* g00 = g0.row<short>(k); | |||
| for (int q = 0; q + 7 < inch; q += 8) | |||
| { | |||
| for (int i = 0; i < 8; i++) | |||
| { | |||
| g00[0] = k0.row<const short>(q + i)[k]; | |||
| g00 += 1; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd42_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| // size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| // pad to 4n+2 | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| outw = (outw + 3) / 4 * 4; | |||
| outh = (outh + 3) / 4 * 4; | |||
| w = outw + 2; | |||
| h = outh + 2; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); | |||
| // BEGIN transform input | |||
| Mat bottom_blob_tm; | |||
| { | |||
| int w_tm = outw / 4 * 6; | |||
| int h_tm = outh / 4 * 6; | |||
| const int tiles = w_tm / 6 * h_tm / 6; | |||
| bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); | |||
| // const float itm[4][4] = { | |||
| // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, | |||
| // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, | |||
| // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} | |||
| // }; | |||
| // 0 = 4 * r00 - 5 * r02 + r04 | |||
| // 1 = -4 * (r01 + r02) + r04 + r03 | |||
| // 2 = 4 * (r01 - r02) + r04 - r03 | |||
| // 3 = -2 * (r01 - r03) + r04 - r02 | |||
| // 4 = 2 * (r01 - r03) + r04 - r02 | |||
| // 5 = 4 * r01 - 5 * r03 + r05 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const Mat img0 = bottom_blob_bordered.channel(q); | |||
| Mat img0_tm = bottom_blob_tm.channel(q); | |||
| short tmp[6][6][8]; | |||
| // tile | |||
| for (int i = 0; i < h_tm / 6; i++) | |||
| { | |||
| for (int j = 0; j < w_tm / 6; j++) | |||
| { | |||
| const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8; | |||
| for (int m = 0; m < 6; m++) | |||
| { | |||
| v16i8 _r00_01 = __msa_ld_b(r0, 0); | |||
| v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0); | |||
| v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0); | |||
| v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0); | |||
| v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0); | |||
| v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0); | |||
| v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01); | |||
| v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01); | |||
| v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03); | |||
| v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03); | |||
| v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05); | |||
| v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05); | |||
| v8i16 _v5 = __msa_fill_h(5); | |||
| v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5)); | |||
| v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2)); | |||
| v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2)); | |||
| v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); | |||
| v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); | |||
| v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5)); | |||
| __msa_st_h(_tmp0m, tmp[0][m], 0); | |||
| __msa_st_h(_tmp1m, tmp[1][m], 0); | |||
| __msa_st_h(_tmp2m, tmp[2][m], 0); | |||
| __msa_st_h(_tmp3m, tmp[3][m], 0); | |||
| __msa_st_h(_tmp4m, tmp[4][m], 0); | |||
| __msa_st_h(_tmp5m, tmp[5][m], 0); | |||
| r0 += w * 8; | |||
| } | |||
| short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8; | |||
| short* r0_tm_1 = r0_tm_0 + tiles * 8; | |||
| short* r0_tm_2 = r0_tm_0 + tiles * 16; | |||
| short* r0_tm_3 = r0_tm_0 + tiles * 24; | |||
| short* r0_tm_4 = r0_tm_0 + tiles * 32; | |||
| short* r0_tm_5 = r0_tm_0 + tiles * 40; | |||
| for (int m = 0; m < 6; m++) | |||
| { | |||
| v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0); | |||
| v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0); | |||
| v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0); | |||
| v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0); | |||
| v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0); | |||
| v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0); | |||
| v8i16 _v5 = __msa_fill_h(5); | |||
| v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5)); | |||
| v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2)); | |||
| v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2)); | |||
| v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); | |||
| v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); | |||
| v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5)); | |||
| __msa_st_h(_r0tm0, r0_tm_0, 0); | |||
| __msa_st_h(_r0tm1, r0_tm_1, 0); | |||
| __msa_st_h(_r0tm2, r0_tm_2, 0); | |||
| __msa_st_h(_r0tm3, r0_tm_3, 0); | |||
| __msa_st_h(_r0tm4, r0_tm_4, 0); | |||
| __msa_st_h(_r0tm5, r0_tm_5, 0); | |||
| r0_tm_0 += tiles * 48; | |||
| r0_tm_1 += tiles * 48; | |||
| r0_tm_2 += tiles * 48; | |||
| r0_tm_3 += tiles * 48; | |||
| r0_tm_4 += tiles * 48; | |||
| r0_tm_5 += tiles * 48; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_bordered = Mat(); | |||
| // END transform input | |||
| // BEGIN dot | |||
| Mat top_blob_tm; | |||
| { | |||
| int w_tm = outw / 4 * 6; | |||
| int h_tm = outh / 4 * 6; | |||
| const int tiles = h_tm / 6 * w_tm / 6; | |||
| // permute | |||
| // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); | |||
| Mat bottom_blob_tm2; | |||
| if (tiles >= 2) | |||
| bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); | |||
| else // if (tiles >= 1) | |||
| bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int r = 0; r < 36; r++) | |||
| { | |||
| Mat tm2 = bottom_blob_tm2.channel(r); | |||
| // tile | |||
| int i = 0; | |||
| for (; i + 1 < tiles; i += 2) | |||
| { | |||
| short* tmpptr = tm2.row<short>(i / 2); | |||
| const short* r0 = bottom_blob_tm; | |||
| r0 += (r * tiles + i) * 8; | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| v8i16 _r0 = __msa_ld_h(r0, 0); | |||
| v8i16 _r1 = __msa_ld_h(r0 + 8, 0); | |||
| __msa_st_h(_r0, tmpptr, 0); | |||
| __msa_st_h(_r1, tmpptr + 8, 0); | |||
| r0 += bottom_blob_tm.cstep * 8; | |||
| tmpptr += 16; | |||
| } | |||
| } | |||
| for (; i < tiles; i++) | |||
| { | |||
| short* tmpptr = tm2.row<short>(i / 2 + i % 2); | |||
| const short* r0 = bottom_blob_tm; | |||
| r0 += (r * tiles + i) * 8; | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| v8i16 _r0 = __msa_ld_h(r0, 0); | |||
| __msa_st_h(_r0, tmpptr, 0); | |||
| r0 += bottom_blob_tm.cstep * 8; | |||
| tmpptr += 8; | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_tm = Mat(); | |||
| // permute end | |||
| top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator); | |||
| int nn_outch = 0; | |||
| int remain_outch_start = 0; | |||
| nn_outch = outch >> 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| int* output0_tm = top_blob_tm.channel(p); | |||
| int* output1_tm = top_blob_tm.channel(p + 1); | |||
| int* output2_tm = top_blob_tm.channel(p + 2); | |||
| int* output3_tm = top_blob_tm.channel(p + 3); | |||
| const Mat kernel0_tm = kernel_tm.channel(p / 4); | |||
| for (int r = 0; r < 36; r++) | |||
| { | |||
| const Mat bb2 = bottom_blob_tm2.channel(r); | |||
| int i = 0; | |||
| for (; i + 1 < tiles; i += 2) | |||
| { | |||
| const short* r0 = bb2.row<const short>(i / 2); | |||
| const short* k0 = kernel0_tm.row<const short>(r); | |||
| int nn = inch; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| for (int j = 0; j < nn; j++) | |||
| { | |||
| v8i16 _w0 = __msa_ld_h(k0, 0); | |||
| v8i16 _w1 = __msa_ld_h(k0 + 8, 0); | |||
| v8i16 _w2 = __msa_ld_h(k0 + 16, 0); | |||
| v8i16 _w3 = __msa_ld_h(k0 + 24, 0); | |||
| v8i16 _extw0 = __msa_clti_s_h(_w0, 0); | |||
| v8i16 _extw1 = __msa_clti_s_h(_w1, 0); | |||
| v8i16 _extw2 = __msa_clti_s_h(_w2, 0); | |||
| v8i16 _extw3 = __msa_clti_s_h(_w3, 0); | |||
| v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); | |||
| v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); | |||
| v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); | |||
| v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); | |||
| v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); | |||
| v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); | |||
| v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); | |||
| v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); | |||
| v4i32 _val0_0 = __msa_fill_w(r0[0]); | |||
| v4i32 _val0_1 = __msa_fill_w(r0[1]); | |||
| v4i32 _val0_2 = __msa_fill_w(r0[2]); | |||
| v4i32 _val0_3 = __msa_fill_w(r0[3]); | |||
| v4i32 _val0_4 = __msa_fill_w(r0[4]); | |||
| v4i32 _val0_5 = __msa_fill_w(r0[5]); | |||
| v4i32 _val0_6 = __msa_fill_w(r0[6]); | |||
| v4i32 _val0_7 = __msa_fill_w(r0[7]); | |||
| v4i32 _val1_0 = __msa_fill_w(r0[8]); | |||
| v4i32 _val1_1 = __msa_fill_w(r0[9]); | |||
| v4i32 _val1_2 = __msa_fill_w(r0[10]); | |||
| v4i32 _val1_3 = __msa_fill_w(r0[11]); | |||
| v4i32 _val1_4 = __msa_fill_w(r0[12]); | |||
| v4i32 _val1_5 = __msa_fill_w(r0[13]); | |||
| v4i32 _val1_6 = __msa_fill_w(r0[14]); | |||
| v4i32 _val1_7 = __msa_fill_w(r0[15]); | |||
| _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0); | |||
| _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1); | |||
| _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0); | |||
| _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1); | |||
| _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2); | |||
| _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3); | |||
| _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2); | |||
| _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3); | |||
| _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4); | |||
| _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5); | |||
| _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4); | |||
| _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5); | |||
| _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6); | |||
| _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7); | |||
| _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6); | |||
| _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7); | |||
| r0 += 16; | |||
| k0 += 32; | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| int sum[8]; | |||
| __msa_st_w(_sum0, sum, 0); | |||
| __msa_st_w(_sum2, sum + 4, 0); | |||
| output0_tm[0] = sum[0]; | |||
| output1_tm[0] = sum[1]; | |||
| output2_tm[0] = sum[2]; | |||
| output3_tm[0] = sum[3]; | |||
| output0_tm[1] = sum[4]; | |||
| output1_tm[1] = sum[5]; | |||
| output2_tm[1] = sum[6]; | |||
| output3_tm[1] = sum[7]; | |||
| output0_tm += 2; | |||
| output1_tm += 2; | |||
| output2_tm += 2; | |||
| output3_tm += 2; | |||
| } | |||
| for (; i < tiles; i++) | |||
| { | |||
| const short* r0 = bb2.row<const short>(i / 2 + i % 2); | |||
| const short* k0 = kernel0_tm.row<const short>(r); | |||
| int nn = inch; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| for (int j = 0; j < nn; j++) | |||
| { | |||
| v8i16 _w0 = __msa_ld_h(k0, 0); | |||
| v8i16 _w1 = __msa_ld_h(k0 + 8, 0); | |||
| v8i16 _w2 = __msa_ld_h(k0 + 16, 0); | |||
| v8i16 _w3 = __msa_ld_h(k0 + 24, 0); | |||
| v8i16 _extw0 = __msa_clti_s_h(_w0, 0); | |||
| v8i16 _extw1 = __msa_clti_s_h(_w1, 0); | |||
| v8i16 _extw2 = __msa_clti_s_h(_w2, 0); | |||
| v8i16 _extw3 = __msa_clti_s_h(_w3, 0); | |||
| v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); | |||
| v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); | |||
| v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); | |||
| v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); | |||
| v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); | |||
| v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); | |||
| v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); | |||
| v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); | |||
| v4i32 _val0 = __msa_fill_w(r0[0]); | |||
| v4i32 _val1 = __msa_fill_w(r0[1]); | |||
| v4i32 _val2 = __msa_fill_w(r0[2]); | |||
| v4i32 _val3 = __msa_fill_w(r0[3]); | |||
| v4i32 _val4 = __msa_fill_w(r0[4]); | |||
| v4i32 _val5 = __msa_fill_w(r0[5]); | |||
| v4i32 _val6 = __msa_fill_w(r0[6]); | |||
| v4i32 _val7 = __msa_fill_w(r0[7]); | |||
| _sum0 = __msa_maddv_w(_sum0, _w0l, _val0); | |||
| _sum1 = __msa_maddv_w(_sum1, _w0h, _val1); | |||
| _sum0 = __msa_maddv_w(_sum0, _w1l, _val2); | |||
| _sum1 = __msa_maddv_w(_sum1, _w1h, _val3); | |||
| _sum0 = __msa_maddv_w(_sum0, _w2l, _val4); | |||
| _sum1 = __msa_maddv_w(_sum1, _w2h, _val5); | |||
| _sum0 = __msa_maddv_w(_sum0, _w3l, _val6); | |||
| _sum1 = __msa_maddv_w(_sum1, _w3h, _val7); | |||
| r0 += 8; | |||
| k0 += 32; | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| int sum[4]; | |||
| __msa_st_w(_sum0, sum, 0); | |||
| output0_tm[0] = sum[0]; | |||
| output1_tm[0] = sum[1]; | |||
| output2_tm[0] = sum[2]; | |||
| output3_tm[0] = sum[3]; | |||
| output0_tm += 1; | |||
| output1_tm += 1; | |||
| output2_tm += 1; | |||
| output3_tm += 1; | |||
| } | |||
| } | |||
| } | |||
| remain_outch_start += nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = remain_outch_start; p < outch; p++) | |||
| { | |||
| int* output0_tm = top_blob_tm.channel(p); | |||
| const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); | |||
| for (int r = 0; r < 36; r++) | |||
| { | |||
| const Mat bb2 = bottom_blob_tm2.channel(r); | |||
| int i = 0; | |||
| for (; i + 1 < tiles; i += 2) | |||
| { | |||
| const short* r0 = bb2.row<const short>(i / 2); | |||
| const short* k0 = kernel0_tm.row<const short>(r); | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| v8i16 _val0 = __msa_ld_h(r0, 0); | |||
| v8i16 _val1 = __msa_ld_h(r0 + 8, 0); | |||
| v8i16 _extval0 = __msa_clti_s_h(_val0, 0); | |||
| v8i16 _extval1 = __msa_clti_s_h(_val1, 0); | |||
| v4i32 _val0l = (v4i32)__msa_ilvr_h(_extval0, _val0); | |||
| v4i32 _val0h = (v4i32)__msa_ilvl_h(_extval0, _val0); | |||
| v4i32 _val1l = (v4i32)__msa_ilvr_h(_extval1, _val1); | |||
| v4i32 _val1h = (v4i32)__msa_ilvl_h(_extval1, _val1); | |||
| v8i16 _w0 = __msa_ld_h(k0, 0); | |||
| v8i16 _extw0 = __msa_clti_s_h(_w0, 0); | |||
| v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); | |||
| v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); | |||
| _sum0 = __msa_maddv_w(_sum0, _w0l, _val0l); | |||
| _sum1 = __msa_maddv_w(_sum1, _w0h, _val0h); | |||
| _sum2 = __msa_maddv_w(_sum2, _w0l, _val1l); | |||
| _sum3 = __msa_maddv_w(_sum3, _w0h, _val1h); | |||
| k0 += 8; | |||
| r0 += 16; | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| output0_tm[0] = __msa_reduce_add_w(_sum0); | |||
| output0_tm[1] = __msa_reduce_add_w(_sum2); | |||
| output0_tm += 2; | |||
| } | |||
| for (; i < tiles; i++) | |||
| { | |||
| const short* r0 = bb2.row<const short>(i / 2 + i % 2); | |||
| const short* k0 = kernel0_tm.row<const short>(r); | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| v8i16 _val = __msa_ld_h(r0, 0); | |||
| v8i16 _extval = __msa_clti_s_h(_val, 0); | |||
| v4i32 _vall = (v4i32)__msa_ilvr_h(_extval, _val); | |||
| v4i32 _valh = (v4i32)__msa_ilvl_h(_extval, _val); | |||
| v8i16 _w0 = __msa_ld_h(k0, 0); | |||
| v8i16 _extw0 = __msa_clti_s_h(_w0, 0); | |||
| v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); | |||
| v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); | |||
| _sum0 = __msa_maddv_w(_sum0, _w0l, _vall); | |||
| _sum1 = __msa_maddv_w(_sum1, _w0h, _valh); | |||
| k0 += 8; | |||
| r0 += 8; | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| output0_tm[0] = __msa_reduce_add_w(_sum0); | |||
| output0_tm++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_tm = Mat(); | |||
| // END dot | |||
| // BEGIN transform output | |||
| Mat top_blob_bordered; | |||
| if (outw == top_blob.w && outh == top_blob.h) | |||
| { | |||
| top_blob_bordered = top_blob; | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); | |||
| } | |||
| { | |||
| // const float otm[4][6] = { | |||
| // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, | |||
| // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} | |||
| // }; | |||
| // 0 = r00 + (r01 + r02) + (r03 + r04) | |||
| // 1 = (r01 - r02) + (r03 - r04) * 2 | |||
| // 2 = (r01 + r02) + (r03 + r04) * 4 | |||
| // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 | |||
| int w_tm = outw / 4 * 6; | |||
| int h_tm = outh / 4 * 6; | |||
| const int tiles = w_tm / 6 * h_tm / 6; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| const Mat out0_tm = top_blob_tm.channel(p); | |||
| Mat out0 = top_blob_bordered.channel(p); | |||
| int tmp[4][6]; | |||
| // tile | |||
| for (int i = 0; i < outh / 4; i++) | |||
| { | |||
| for (int j = 0; j < outw / 4; j++) | |||
| { | |||
| // top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator); | |||
| const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 1; | |||
| const int* output0_tm_1 = output0_tm_0 + tiles * 1; | |||
| const int* output0_tm_2 = output0_tm_0 + tiles * 2; | |||
| const int* output0_tm_3 = output0_tm_0 + tiles * 3; | |||
| const int* output0_tm_4 = output0_tm_0 + tiles * 4; | |||
| const int* output0_tm_5 = output0_tm_0 + tiles * 5; | |||
| int* output0 = out0.row<int>(i * 4) + j * 4; | |||
| // 0 = r00 + (r01 + r02) + (r03 + r04) | |||
| // 1 = (r01 - r02) + (r03 - r04) * 2 | |||
| // 2 = (r01 + r02) + (r03 + r04) * 4 | |||
| // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 | |||
| // TODO msa optimize | |||
| for (int m = 0; m < 5; m++) | |||
| { | |||
| int tmp02a = output0_tm_1[0] + output0_tm_2[0]; | |||
| int tmp13a = output0_tm_1[0] - output0_tm_2[0]; | |||
| int tmp02b = output0_tm_3[0] + output0_tm_4[0]; | |||
| int tmp13b = output0_tm_3[0] - output0_tm_4[0]; | |||
| tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; | |||
| tmp[1][m] = tmp13a + tmp13b * 2; | |||
| tmp[2][m] = tmp02a + tmp02b * 4; | |||
| tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; | |||
| output0_tm_0 += tiles * 6; | |||
| output0_tm_1 += tiles * 6; | |||
| output0_tm_2 += tiles * 6; | |||
| output0_tm_3 += tiles * 6; | |||
| output0_tm_4 += tiles * 6; | |||
| output0_tm_5 += tiles * 6; | |||
| } | |||
| for (int m = 5; m < 6; m++) | |||
| { | |||
| int tmp02a = output0_tm_1[0] + output0_tm_2[0]; | |||
| int tmp13a = output0_tm_1[0] - output0_tm_2[0]; | |||
| int tmp02b = output0_tm_3[0] + output0_tm_4[0]; | |||
| int tmp13b = output0_tm_3[0] - output0_tm_4[0]; | |||
| tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; | |||
| tmp[1][m] = (tmp13a + tmp13b * 2) * 4; | |||
| tmp[2][m] = (tmp02a + tmp02b * 4) * 4; | |||
| tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; | |||
| output0_tm_0 += tiles * 6; | |||
| output0_tm_1 += tiles * 6; | |||
| output0_tm_2 += tiles * 6; | |||
| output0_tm_3 += tiles * 6; | |||
| output0_tm_4 += tiles * 6; | |||
| output0_tm_5 += tiles * 6; | |||
| } | |||
| for (int m = 0; m < 4; m++) | |||
| { | |||
| const int* tmp0 = tmp[m]; | |||
| int tmp02a = tmp0[1] + tmp0[2]; | |||
| int tmp13a = tmp0[1] - tmp0[2]; | |||
| int tmp02b = tmp0[3] + tmp0[4]; | |||
| int tmp13b = tmp0[3] - tmp0[4]; | |||
| output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; | |||
| output0[1] = (tmp13a + tmp13b * 2) / 576; | |||
| output0[2] = (tmp02a + tmp02b * 4) / 576; | |||
| output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; | |||
| output0 += outw; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // END transform output | |||
| // cut result pad | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); | |||
| } | |||
| @@ -0,0 +1,629 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) | |||
| { | |||
| // winograd42 transform kernel | |||
| Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); | |||
| const short ktm[6][3] = { | |||
| {6, 0, 0}, | |||
| {-4, -4, -4}, | |||
| {-4, 4, -4}, | |||
| {1, 2, 4}, | |||
| {1, -2, 4}, | |||
| {0, 0, 6} | |||
| }; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; | |||
| short* kernel_tm0 = kernel_tm.channel(p).row<short>(q); | |||
| // transform kernel | |||
| const signed char* k0 = kernel0; | |||
| const signed char* k1 = kernel0 + 3; | |||
| const signed char* k2 = kernel0 + 6; | |||
| // h | |||
| short tmp[6][3]; | |||
| for (int i = 0; i < 6; i++) | |||
| { | |||
| tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; | |||
| tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; | |||
| tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; | |||
| } | |||
| // U | |||
| for (int j = 0; j < 6; j++) | |||
| { | |||
| short* tmpp = &tmp[j][0]; | |||
| for (int i = 0; i < 6; i++) | |||
| { | |||
| kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // interleave | |||
| // src = 36-inch-outch | |||
| // dst = 4b-8a-inch/8a-36-outch/4b | |||
| kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); | |||
| int q = 0; | |||
| for (; q + 3 < outch; q += 4) | |||
| { | |||
| const Mat k0 = kernel_tm.channel(q); | |||
| const Mat k1 = kernel_tm.channel(q + 1); | |||
| const Mat k2 = kernel_tm.channel(q + 2); | |||
| const Mat k3 = kernel_tm.channel(q + 3); | |||
| Mat kernel_tm = kernel_tm_pack8.channel(q / 4); | |||
| for (int k = 0; k < 36; k++) | |||
| { | |||
| short* g00 = kernel_tm.row<short>(k); | |||
| for (int p = 0; p + 7 < inch; p += 8) | |||
| { | |||
| for (int i = 0; i < 8; i++) | |||
| { | |||
| const short* k00 = k0.row<const short>(p + i); | |||
| const short* k10 = k1.row<const short>(p + i); | |||
| const short* k20 = k2.row<const short>(p + i); | |||
| const short* k30 = k3.row<const short>(p + i); | |||
| g00[0] = k00[k]; | |||
| g00[1] = k10[k]; | |||
| g00[2] = k20[k]; | |||
| g00[3] = k30[k]; | |||
| g00 += 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd42_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| // size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| // pad to 4n+2 | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| outw = (outw + 3) / 4 * 4; | |||
| outh = (outh + 3) / 4 * 4; | |||
| w = outw + 2; | |||
| h = outh + 2; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); | |||
| // BEGIN transform input | |||
| Mat bottom_blob_tm; | |||
| { | |||
| int w_tm = outw / 4 * 6; | |||
| int h_tm = outh / 4 * 6; | |||
| const int tiles = w_tm / 6 * h_tm / 6; | |||
| bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); | |||
| // const float itm[4][4] = { | |||
| // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, | |||
| // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, | |||
| // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} | |||
| // }; | |||
| // 0 = 4 * r00 - 5 * r02 + r04 | |||
| // 1 = -4 * (r01 + r02) + r04 + r03 | |||
| // 2 = 4 * (r01 - r02) + r04 - r03 | |||
| // 3 = -2 * (r01 - r03) + r04 - r02 | |||
| // 4 = 2 * (r01 - r03) + r04 - r02 | |||
| // 5 = 4 * r01 - 5 * r03 + r05 | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const Mat img0 = bottom_blob_bordered.channel(q); | |||
| Mat img0_tm = bottom_blob_tm.channel(q); | |||
| short tmp[6][6][8]; | |||
| // tile | |||
| for (int i = 0; i < h_tm / 6; i++) | |||
| { | |||
| for (int j = 0; j < w_tm / 6; j++) | |||
| { | |||
| const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8; | |||
| for (int m = 0; m < 6; m++) | |||
| { | |||
| v16i8 _r00_01 = __msa_ld_b(r0, 0); | |||
| v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0); | |||
| v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0); | |||
| v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0); | |||
| v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0); | |||
| v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0); | |||
| v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01); | |||
| v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01); | |||
| v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03); | |||
| v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03); | |||
| v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05); | |||
| v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05); | |||
| v8i16 _v5 = __msa_fill_h(5); | |||
| v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5)); | |||
| v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2)); | |||
| v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2)); | |||
| v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); | |||
| v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1)); | |||
| v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5)); | |||
| __msa_st_h(_tmp0m, tmp[0][m], 0); | |||
| __msa_st_h(_tmp1m, tmp[1][m], 0); | |||
| __msa_st_h(_tmp2m, tmp[2][m], 0); | |||
| __msa_st_h(_tmp3m, tmp[3][m], 0); | |||
| __msa_st_h(_tmp4m, tmp[4][m], 0); | |||
| __msa_st_h(_tmp5m, tmp[5][m], 0); | |||
| r0 += w * 8; | |||
| } | |||
| short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8; | |||
| short* r0_tm_1 = r0_tm_0 + tiles * 8; | |||
| short* r0_tm_2 = r0_tm_0 + tiles * 16; | |||
| short* r0_tm_3 = r0_tm_0 + tiles * 24; | |||
| short* r0_tm_4 = r0_tm_0 + tiles * 32; | |||
| short* r0_tm_5 = r0_tm_0 + tiles * 40; | |||
| for (int m = 0; m < 6; m++) | |||
| { | |||
| v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0); | |||
| v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0); | |||
| v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0); | |||
| v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0); | |||
| v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0); | |||
| v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0); | |||
| v8i16 _v5 = __msa_fill_h(5); | |||
| v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5)); | |||
| v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2)); | |||
| v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2)); | |||
| v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); | |||
| v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1)); | |||
| v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5)); | |||
| __msa_st_h(_r0tm0, r0_tm_0, 0); | |||
| __msa_st_h(_r0tm1, r0_tm_1, 0); | |||
| __msa_st_h(_r0tm2, r0_tm_2, 0); | |||
| __msa_st_h(_r0tm3, r0_tm_3, 0); | |||
| __msa_st_h(_r0tm4, r0_tm_4, 0); | |||
| __msa_st_h(_r0tm5, r0_tm_5, 0); | |||
| r0_tm_0 += tiles * 48; | |||
| r0_tm_1 += tiles * 48; | |||
| r0_tm_2 += tiles * 48; | |||
| r0_tm_3 += tiles * 48; | |||
| r0_tm_4 += tiles * 48; | |||
| r0_tm_5 += tiles * 48; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_bordered = Mat(); | |||
| // END transform input | |||
| // BEGIN dot | |||
| Mat top_blob_tm; | |||
| { | |||
| int w_tm = outw / 4 * 6; | |||
| int h_tm = outh / 4 * 6; | |||
| const int tiles = h_tm / 6 * w_tm / 6; | |||
| // permute | |||
| // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); | |||
| Mat bottom_blob_tm2; | |||
| if (tiles >= 2) | |||
| bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); | |||
| else // if (tiles >= 1) | |||
| bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int r = 0; r < 36; r++) | |||
| { | |||
| Mat tm2 = bottom_blob_tm2.channel(r); | |||
| // tile | |||
| int i = 0; | |||
| for (; i + 1 < tiles; i += 2) | |||
| { | |||
| short* tmpptr = tm2.row<short>(i / 2); | |||
| const short* r0 = bottom_blob_tm; | |||
| r0 += (r * tiles + i) * 8; | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| v8i16 _r0 = __msa_ld_h(r0, 0); | |||
| v8i16 _r1 = __msa_ld_h(r0 + 8, 0); | |||
| __msa_st_h(_r0, tmpptr, 0); | |||
| __msa_st_h(_r1, tmpptr + 8, 0); | |||
| r0 += bottom_blob_tm.cstep * 8; | |||
| tmpptr += 16; | |||
| } | |||
| } | |||
| for (; i < tiles; i++) | |||
| { | |||
| short* tmpptr = tm2.row<short>(i / 2 + i % 2); | |||
| const short* r0 = bottom_blob_tm; | |||
| r0 += (r * tiles + i) * 8; | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| v8i16 _r0 = __msa_ld_h(r0, 0); | |||
| __msa_st_h(_r0, tmpptr, 0); | |||
| r0 += bottom_blob_tm.cstep * 8; | |||
| tmpptr += 8; | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_tm = Mat(); | |||
| // permute end | |||
| top_blob_tm.create(tiles, 36, outch, 4u * 4, 4, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* output0_tm = top_blob_tm.channel(p); | |||
| const Mat kernel0_tm = kernel_tm.channel(p); | |||
| for (int r = 0; r < 36; r++) | |||
| { | |||
| const Mat bb2 = bottom_blob_tm2.channel(r); | |||
| int i = 0; | |||
| for (; i + 1 < tiles; i += 2) | |||
| { | |||
| const short* r0 = bb2.row<const short>(i / 2); | |||
| const short* k0 = kernel0_tm.row<const short>(r); | |||
| int nn = inch; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| for (int j = 0; j < nn; j++) | |||
| { | |||
| v8i16 _w0 = __msa_ld_h(k0, 0); | |||
| v8i16 _w1 = __msa_ld_h(k0 + 8, 0); | |||
| v8i16 _w2 = __msa_ld_h(k0 + 16, 0); | |||
| v8i16 _w3 = __msa_ld_h(k0 + 24, 0); | |||
| v8i16 _extw0 = __msa_clti_s_h(_w0, 0); | |||
| v8i16 _extw1 = __msa_clti_s_h(_w1, 0); | |||
| v8i16 _extw2 = __msa_clti_s_h(_w2, 0); | |||
| v8i16 _extw3 = __msa_clti_s_h(_w3, 0); | |||
| v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); | |||
| v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); | |||
| v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); | |||
| v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); | |||
| v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); | |||
| v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); | |||
| v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); | |||
| v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); | |||
| v4i32 _val0_0 = __msa_fill_w(r0[0]); | |||
| v4i32 _val0_1 = __msa_fill_w(r0[1]); | |||
| v4i32 _val0_2 = __msa_fill_w(r0[2]); | |||
| v4i32 _val0_3 = __msa_fill_w(r0[3]); | |||
| v4i32 _val0_4 = __msa_fill_w(r0[4]); | |||
| v4i32 _val0_5 = __msa_fill_w(r0[5]); | |||
| v4i32 _val0_6 = __msa_fill_w(r0[6]); | |||
| v4i32 _val0_7 = __msa_fill_w(r0[7]); | |||
| v4i32 _val1_0 = __msa_fill_w(r0[8]); | |||
| v4i32 _val1_1 = __msa_fill_w(r0[9]); | |||
| v4i32 _val1_2 = __msa_fill_w(r0[10]); | |||
| v4i32 _val1_3 = __msa_fill_w(r0[11]); | |||
| v4i32 _val1_4 = __msa_fill_w(r0[12]); | |||
| v4i32 _val1_5 = __msa_fill_w(r0[13]); | |||
| v4i32 _val1_6 = __msa_fill_w(r0[14]); | |||
| v4i32 _val1_7 = __msa_fill_w(r0[15]); | |||
| _sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0); | |||
| _sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1); | |||
| _sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0); | |||
| _sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1); | |||
| _sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2); | |||
| _sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3); | |||
| _sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2); | |||
| _sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3); | |||
| _sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4); | |||
| _sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5); | |||
| _sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4); | |||
| _sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5); | |||
| _sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6); | |||
| _sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7); | |||
| _sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6); | |||
| _sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7); | |||
| r0 += 16; | |||
| k0 += 32; | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| __msa_st_w(_sum0, output0_tm, 0); | |||
| __msa_st_w(_sum2, output0_tm + 4, 0); | |||
| output0_tm += 8; | |||
| } | |||
| for (; i < tiles; i++) | |||
| { | |||
| const short* r0 = bb2.row<const short>(i / 2 + i % 2); | |||
| const short* k0 = kernel0_tm.row<const short>(r); | |||
| int nn = inch; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| for (int j = 0; j < nn; j++) | |||
| { | |||
| v8i16 _w0 = __msa_ld_h(k0, 0); | |||
| v8i16 _w1 = __msa_ld_h(k0 + 8, 0); | |||
| v8i16 _w2 = __msa_ld_h(k0 + 16, 0); | |||
| v8i16 _w3 = __msa_ld_h(k0 + 24, 0); | |||
| v8i16 _extw0 = __msa_clti_s_h(_w0, 0); | |||
| v8i16 _extw1 = __msa_clti_s_h(_w1, 0); | |||
| v8i16 _extw2 = __msa_clti_s_h(_w2, 0); | |||
| v8i16 _extw3 = __msa_clti_s_h(_w3, 0); | |||
| v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0); | |||
| v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0); | |||
| v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1); | |||
| v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1); | |||
| v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2); | |||
| v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2); | |||
| v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3); | |||
| v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3); | |||
| v4i32 _val0 = __msa_fill_w(r0[0]); | |||
| v4i32 _val1 = __msa_fill_w(r0[1]); | |||
| v4i32 _val2 = __msa_fill_w(r0[2]); | |||
| v4i32 _val3 = __msa_fill_w(r0[3]); | |||
| v4i32 _val4 = __msa_fill_w(r0[4]); | |||
| v4i32 _val5 = __msa_fill_w(r0[5]); | |||
| v4i32 _val6 = __msa_fill_w(r0[6]); | |||
| v4i32 _val7 = __msa_fill_w(r0[7]); | |||
| _sum0 = __msa_maddv_w(_sum0, _w0l, _val0); | |||
| _sum1 = __msa_maddv_w(_sum1, _w0h, _val1); | |||
| _sum0 = __msa_maddv_w(_sum0, _w1l, _val2); | |||
| _sum1 = __msa_maddv_w(_sum1, _w1h, _val3); | |||
| _sum0 = __msa_maddv_w(_sum0, _w2l, _val4); | |||
| _sum1 = __msa_maddv_w(_sum1, _w2h, _val5); | |||
| _sum0 = __msa_maddv_w(_sum0, _w3l, _val6); | |||
| _sum1 = __msa_maddv_w(_sum1, _w3h, _val7); | |||
| r0 += 8; | |||
| k0 += 32; | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| __msa_st_w(_sum0, output0_tm, 0); | |||
| output0_tm += 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_tm = Mat(); | |||
| // END dot | |||
| // BEGIN transform output | |||
| Mat top_blob_bordered; | |||
| if (outw == top_blob.w && outh == top_blob.h) | |||
| { | |||
| top_blob_bordered = top_blob; | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); | |||
| } | |||
| { | |||
| // const float otm[4][6] = { | |||
| // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, | |||
| // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} | |||
| // }; | |||
| // 0 = r00 + (r01 + r02) + (r03 + r04) | |||
| // 1 = (r01 - r02) + (r03 - r04) * 2 | |||
| // 2 = (r01 + r02) + (r03 + r04) * 4 | |||
| // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 | |||
| int w_tm = outw / 4 * 6; | |||
| int h_tm = outh / 4 * 6; | |||
| const int tiles = w_tm / 6 * h_tm / 6; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| const Mat out0_tm = top_blob_tm.channel(p); | |||
| Mat out0 = top_blob_bordered.channel(p); | |||
| int tmp[4][6][4]; | |||
| // tile | |||
| for (int i = 0; i < outh / 4; i++) | |||
| { | |||
| for (int j = 0; j < outw / 4; j++) | |||
| { | |||
| // top_blob_tm.create(tiles, 36, outch, elemsize, elempack); | |||
| const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 4; | |||
| const int* output0_tm_1 = output0_tm_0 + tiles * 4; | |||
| const int* output0_tm_2 = output0_tm_0 + tiles * 8; | |||
| const int* output0_tm_3 = output0_tm_0 + tiles * 12; | |||
| const int* output0_tm_4 = output0_tm_0 + tiles * 16; | |||
| const int* output0_tm_5 = output0_tm_0 + tiles * 20; | |||
| int* output0 = out0.row<int>(i * 4) + (j * 4) * 4; | |||
| for (int m = 0; m < 5; m++) | |||
| { | |||
| v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0); | |||
| v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0); | |||
| v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0); | |||
| v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0); | |||
| v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0); | |||
| v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0); | |||
| v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2); | |||
| v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2); | |||
| v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4); | |||
| v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4); | |||
| v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b); | |||
| v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1)); | |||
| v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2)); | |||
| v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3)); | |||
| __msa_st_w(_tmp0m, tmp[0][m], 0); | |||
| __msa_st_w(_tmp1m, tmp[1][m], 0); | |||
| __msa_st_w(_tmp2m, tmp[2][m], 0); | |||
| __msa_st_w(_tmp3m, tmp[3][m], 0); | |||
| output0_tm_0 += tiles * 24; | |||
| output0_tm_1 += tiles * 24; | |||
| output0_tm_2 += tiles * 24; | |||
| output0_tm_3 += tiles * 24; | |||
| output0_tm_4 += tiles * 24; | |||
| output0_tm_5 += tiles * 24; | |||
| } | |||
| for (int m = 5; m < 6; m++) | |||
| { | |||
| v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0); | |||
| v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0); | |||
| v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0); | |||
| v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0); | |||
| v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0); | |||
| v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0); | |||
| v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2); | |||
| v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2); | |||
| v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4); | |||
| v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4); | |||
| v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b); | |||
| v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1)); | |||
| v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2)); | |||
| v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3)); | |||
| _tmp0m = __msa_slli_w(_tmp0m, 2); | |||
| _tmp1m = __msa_slli_w(_tmp1m, 2); | |||
| _tmp2m = __msa_slli_w(_tmp2m, 2); | |||
| _tmp3m = __msa_slli_w(_tmp3m, 2); | |||
| __msa_st_w(_tmp0m, tmp[0][m], 0); | |||
| __msa_st_w(_tmp1m, tmp[1][m], 0); | |||
| __msa_st_w(_tmp2m, tmp[2][m], 0); | |||
| __msa_st_w(_tmp3m, tmp[3][m], 0); | |||
| output0_tm_0 += tiles * 24; | |||
| output0_tm_1 += tiles * 24; | |||
| output0_tm_2 += tiles * 24; | |||
| output0_tm_3 += tiles * 24; | |||
| output0_tm_4 += tiles * 24; | |||
| output0_tm_5 += tiles * 24; | |||
| } | |||
| for (int m = 0; m < 4; m++) | |||
| { | |||
| v4i32 _tmp00 = __msa_ld_w(tmp[m][0], 0); | |||
| v4i32 _tmp01 = __msa_ld_w(tmp[m][1], 0); | |||
| v4i32 _tmp02 = __msa_ld_w(tmp[m][2], 0); | |||
| v4i32 _tmp03 = __msa_ld_w(tmp[m][3], 0); | |||
| v4i32 _tmp04 = __msa_ld_w(tmp[m][4], 0); | |||
| v4i32 _tmp05 = __msa_ld_w(tmp[m][5], 0); | |||
| v4i32 _tmp02a = __msa_addv_w(_tmp01, _tmp02); | |||
| v4i32 _tmp13a = __msa_subv_w(_tmp01, _tmp02); | |||
| v4i32 _tmp02b = __msa_addv_w(_tmp03, _tmp04); | |||
| v4i32 _tmp13b = __msa_subv_w(_tmp03, _tmp04); | |||
| v4i32 _out00 = __msa_addv_w(__msa_addv_w(_tmp00, _tmp02a), _tmp02b); | |||
| v4i32 _out01 = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1)); | |||
| v4i32 _out02 = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2)); | |||
| v4i32 _out03 = __msa_addv_w(__msa_addv_w(_tmp05, _tmp13a), __msa_slli_w(_tmp13b, 3)); | |||
| // TODO use integer trick for division by 576 | |||
| v4f32 _v576 = __msa_fill_w_f32(1.0 / 576); | |||
| _out00 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out00), _v576)); | |||
| _out01 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out01), _v576)); | |||
| _out02 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out02), _v576)); | |||
| _out03 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out03), _v576)); | |||
| __msa_st_w(_out00, output0, 0); | |||
| __msa_st_w(_out01, output0 + 4, 0); | |||
| __msa_st_w(_out02, output0 + 8, 0); | |||
| __msa_st_w(_out03, output0 + 12, 0); | |||
| output0 += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // END transform output | |||
| // cut result pad | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); | |||
| } | |||
| @@ -0,0 +1,82 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // kernel offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w * dilation_h - kernel_w * dilation_w; | |||
| for (int i = 0; i < kernel_h; i++) | |||
| { | |||
| for (int j = 0; j < kernel_w; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2 += dilation_w; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* outptr = top_blob.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| int sum = 0; | |||
| // const signed char* kptr = weight_data_int8.channel(p); | |||
| const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p; | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat m = bottom_blob.channel(q); | |||
| const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| signed char val = sptr[space_ofs[k]]; | |||
| signed char w = kptr[k]; | |||
| sum += val * w; | |||
| } | |||
| kptr += maxk; | |||
| } | |||
| outptr[j] = sum; | |||
| } | |||
| outptr += outw; | |||
| } | |||
| } | |||
| } | |||
| @@ -32,6 +32,12 @@ namespace ncnn { | |||
| #include "convolution_sgemm.h" | |||
| #include "convolution_1x1.h" | |||
| #if NCNN_INT8 | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_1x1_int8.h" | |||
| #include "convolution_int8.h" | |||
| #endif // NCNN_INT8 | |||
| #if __mips_msa | |||
| #include "convolution_pack4.h" | |||
| #include "convolution_pack1to4.h" | |||
| @@ -44,6 +50,20 @@ namespace ncnn { | |||
| #include "convolution_3x3_pack4.h" | |||
| #include "convolution_3x3_pack1to4.h" | |||
| #include "convolution_7x7_pack1to4.h" | |||
| #if NCNN_INT8 | |||
| #include "convolution_pack8to4_int8.h" | |||
| #include "convolution_pack1to4_int8.h" | |||
| #include "convolution_pack8to1_int8.h" | |||
| #include "convolution_sgemm_pack8to4_int8.h" | |||
| #include "convolution_sgemm_pack1to4_int8.h" | |||
| #include "convolution_sgemm_pack8to1_int8.h" | |||
| #include "convolution_1x1_pack8to4_int8.h" | |||
| #include "convolution_1x1_pack1to4_int8.h" | |||
| #include "convolution_1x1_pack8to1_int8.h" | |||
| #include "convolution_3x3_pack8to4_int8.h" | |||
| #include "convolution_3x3_pack8to1_int8.h" | |||
| #endif // NCNN_INT8 | |||
| #endif // __mips_msa | |||
| Convolution_mips::Convolution_mips() | |||
| @@ -98,6 +118,13 @@ int Convolution_mips::create_pipeline(const Option& opt) | |||
| activation = create_activation_layer(activation_type, activation_params, opt); | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| return create_pipeline_int8_mips(opt); | |||
| } | |||
| #endif | |||
| const int maxk = kernel_w * kernel_h; | |||
| const int num_input = weight_data_size / maxk / num_output; | |||
| @@ -117,8 +144,8 @@ int Convolution_mips::create_pipeline(const Option& opt) | |||
| { | |||
| if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16) | |||
| { | |||
| conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_data_packed, num_input, num_output, opt); | |||
| conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data_packed, num_input, num_output, opt); | |||
| conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd64_data, num_input, num_output, opt); | |||
| conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); | |||
| } | |||
| else | |||
| { | |||
| @@ -187,27 +214,7 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| #if NCNN_INT8 | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| Mat bottom_blob_unpacked = bottom_blob; | |||
| if (bottom_blob.elempack != 1) | |||
| { | |||
| Option opt_pack1 = opt; | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); | |||
| } | |||
| Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; | |||
| if (bottom_blob_unpacked.elembits() == 16) | |||
| { | |||
| Option opt_pack1 = opt; | |||
| opt_pack1.blob_allocator = opt.workspace_allocator; | |||
| cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); | |||
| } | |||
| Option opt_unpacked = opt; | |||
| opt_unpacked.use_packing_layout = false; | |||
| return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); | |||
| return forward_int8_mips(bottom_blob, top_blob, opt); | |||
| } | |||
| #endif | |||
| @@ -278,11 +285,11 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| // we need more proper conditions | |||
| if ((w <= 10 || (w >= 15 && w <= 18) || w == 21 || w == 22) && (h <= 10 || (h >= 15 && h <= 18) || h == 21 || h == 22)) | |||
| { | |||
| conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data_packed, bias_data, opt); | |||
| conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data, bias_data, opt); | |||
| } | |||
| else | |||
| { | |||
| conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt); | |||
| conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt); | |||
| } | |||
| if (activation) | |||
| @@ -542,4 +549,408 @@ int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector< | |||
| return 0; | |||
| } | |||
| #if NCNN_INT8 | |||
| static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_int8, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| // src = kw-kh-inch-outch | |||
| // dst = pa-pb-kw-kh-inch/pa-outch/pb | |||
| { | |||
| Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); | |||
| weight_data_int8.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack); | |||
| for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) | |||
| { | |||
| signed char* g00 = weight_data_int8.channel(q / out_elempack); | |||
| for (int p = 0; p + (elempack - 1) < num_input; p += elempack) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < out_elempack; i++) | |||
| { | |||
| for (int j = 0; j < elempack; j++) | |||
| { | |||
| const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| int Convolution_mips::create_pipeline_int8_mips(const Option& opt) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| const int num_input = weight_data_size / maxk / num_output; | |||
| int elempack = 1; | |||
| int out_elempack = 1; | |||
| #if __mips_msa | |||
| if (opt.use_packing_layout) | |||
| { | |||
| elempack = num_input % 8 == 0 ? 8 : 1; | |||
| out_elempack = num_output % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __mips_msa | |||
| #if __mips_msa | |||
| if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| if (elempack == 8 && out_elempack == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| } | |||
| #endif // __mips_msa | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int elembits = bottom_blob.elembits(); | |||
| Mat bottom_blob_int8 = bottom_blob; | |||
| if (elembits != 8) | |||
| { | |||
| Option opt_q = opt; | |||
| opt_q.blob_allocator = opt.workspace_allocator; | |||
| quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); | |||
| } | |||
| Mat bottom_blob_bordered; | |||
| make_padding(bottom_blob_int8, bottom_blob_bordered, opt); | |||
| if (bottom_blob_bordered.empty()) | |||
| return -100; | |||
| int w = bottom_blob_bordered.w; | |||
| int h = bottom_blob_bordered.h; | |||
| int channels = bottom_blob_bordered.c; | |||
| int elempack = bottom_blob_bordered.elempack; | |||
| const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| bool use_int8_requantize = int8_scale_term > 100; | |||
| int out_elempack = 1; | |||
| #if __mips_msa | |||
| if (opt.use_packing_layout) | |||
| { | |||
| if (use_int8_requantize) | |||
| out_elempack = num_output % 8 == 0 ? 8 : 1; | |||
| else | |||
| out_elempack = num_output % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __mips_msa | |||
| size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| const int num_input = channels * elempack; | |||
| int out_elempack_int32 = 1; | |||
| #if __mips_msa | |||
| if (opt.use_packing_layout) | |||
| { | |||
| out_elempack_int32 = num_output % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __mips_msa | |||
| Mat top_blob_int32; | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| #if __mips_msa | |||
| if (elempack == 8 && out_elempack_int32 == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd42_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| Mat scale_in_data(num_output); | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| // requantize and relu | |||
| float scale_in; | |||
| if (weight_data_int8_scales[p] == 0) | |||
| scale_in = 0; | |||
| else | |||
| scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); | |||
| scale_in_data[p] = scale_in; | |||
| } | |||
| if (use_int8_requantize) | |||
| { | |||
| requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); | |||
| } | |||
| else | |||
| { | |||
| dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| activation->forward_inplace(top_blob, opt); | |||
| } | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack_int32 == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| Mat scale_in_data(num_output); | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| // requantize and relu | |||
| float scale_in; | |||
| if (weight_data_int8_scales[p] == 0) | |||
| scale_in = 0; | |||
| else | |||
| scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); | |||
| scale_in_data[p] = scale_in; | |||
| } | |||
| if (use_int8_requantize) | |||
| { | |||
| requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); | |||
| } | |||
| else | |||
| { | |||
| dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| activation->forward_inplace(top_blob, opt); | |||
| } | |||
| } | |||
| } | |||
| if (elempack == 8 && out_elempack_int32 == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd42_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| Mat scale_in_data(num_output); | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| // requantize and relu | |||
| float scale_in; | |||
| if (weight_data_int8_scales[p] == 0) | |||
| scale_in = 0; | |||
| else | |||
| scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); | |||
| scale_in_data[p] = scale_in; | |||
| } | |||
| if (use_int8_requantize) | |||
| { | |||
| requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); | |||
| } | |||
| else | |||
| { | |||
| dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| activation->forward_inplace(top_blob, opt); | |||
| } | |||
| } | |||
| } | |||
| #endif // __mips_msa | |||
| if (elempack == 1 && out_elempack_int32 == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| // convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| Mat scale_in_data(num_output); | |||
| for (int p = 0; p < num_output; p++) | |||
| { | |||
| // requantize and relu | |||
| float scale_in; | |||
| if (weight_data_int8_scales[p] == 0) | |||
| scale_in = 0; | |||
| else | |||
| scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); | |||
| scale_in_data[p] = scale_in; | |||
| } | |||
| if (use_int8_requantize) | |||
| { | |||
| requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); | |||
| } | |||
| else | |||
| { | |||
| dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); | |||
| if (activation) | |||
| { | |||
| activation->forward_inplace(top_blob, opt); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| #endif // NCNN_INT8 | |||
| } // namespace ncnn | |||
| @@ -31,12 +31,27 @@ public: | |||
| virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; | |||
| protected: | |||
| #if NCNN_INT8 | |||
| int create_pipeline_int8_mips(const Option& opt); | |||
| int forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #endif | |||
| public: | |||
| Layer* activation; | |||
| // packn | |||
| Mat weight_sgemm_data; | |||
| Mat weight_3x3_winograd42_data; | |||
| Mat weight_3x3_winograd64_data; | |||
| // pack4 | |||
| Mat weight_data_packed; | |||
| Mat weight_3x3_winograd42_data_packed; | |||
| #if NCNN_INT8 | |||
| // int8 | |||
| Mat weight_data_int8; | |||
| #endif | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,87 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convolution_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // kernel offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w * dilation_h - kernel_w * dilation_w; | |||
| for (int i = 0; i < kernel_h; i++) | |||
| { | |||
| for (int j = 0; j < kernel_w; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2 += dilation_w; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* outptr = top_blob.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| v4i32 _sum = __msa_fill_w(0); | |||
| const signed char* kptr = weight_data_int8.channel(p); | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat m = bottom_blob.channel(q); | |||
| const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| v8i16 _val = __msa_fill_h((short)sptr[space_ofs[k]]); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val, _w16); | |||
| v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); | |||
| _sum = __msa_addv_w(_sum, _s032); | |||
| kptr += 4; | |||
| } | |||
| } | |||
| __msa_st_w(_sum, outptr + j * 4, 0); | |||
| } | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| @@ -81,7 +81,7 @@ static void convolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, cons | |||
| } | |||
| } | |||
| sum += __msa_fhadd_w(_sum); | |||
| sum += __msa_reduce_fadd_w(_sum); | |||
| sum = activation_ss(sum, activation_type, activation_params); | |||
| @@ -0,0 +1,87 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convolution_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // kernel offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w * dilation_h - kernel_w * dilation_w; | |||
| for (int i = 0; i < kernel_h; i++) | |||
| { | |||
| for (int j = 0; j < kernel_w; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2 += dilation_w; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* outptr = top_blob.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| v4i32 _sum = __msa_fill_w(0); | |||
| const signed char* kptr = weight_data_int8.channel(p); | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat m = bottom_blob.channel(q); | |||
| const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w16); | |||
| _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0)); | |||
| kptr += 8; | |||
| } | |||
| } | |||
| outptr[j] = __msa_reduce_add_w(_sum); | |||
| } | |||
| outptr += outw; | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convolution_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // kernel offsets | |||
| std::vector<int> _space_ofs(maxk); | |||
| int* space_ofs = &_space_ofs[0]; | |||
| { | |||
| int p1 = 0; | |||
| int p2 = 0; | |||
| int gap = w * dilation_h - kernel_w * dilation_w; | |||
| for (int i = 0; i < kernel_h; i++) | |||
| { | |||
| for (int j = 0; j < kernel_w; j++) | |||
| { | |||
| space_ofs[p1] = p2; | |||
| p1++; | |||
| p2 += dilation_w; | |||
| } | |||
| p2 += gap; | |||
| } | |||
| } | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* outptr = top_blob.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| const signed char* kptr = weight_data_int8.channel(p); | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const Mat m = bottom_blob.channel(q); | |||
| const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _w23 = __msa_ld_b(kptr + 16, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v16i8 _extw23 = __msa_clti_s_b(_w23, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); | |||
| v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w0); | |||
| v8i16 _s1 = __msa_mulv_h(_val16, _w1); | |||
| v8i16 _s2 = __msa_mulv_h(_val16, _w2); | |||
| v8i16 _s3 = __msa_mulv_h(_val16, _w3); | |||
| _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); | |||
| _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); | |||
| _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2)); | |||
| _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3)); | |||
| kptr += 32; | |||
| } | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum1, _sum0); | |||
| _tmp1 = __msa_ilvr_w(_sum3, _sum2); | |||
| _tmp2 = __msa_ilvl_w(_sum1, _sum0); | |||
| _tmp3 = __msa_ilvl_w(_sum3, _sum2); | |||
| _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| _sum0 = __msa_addv_w(_sum0, _sum2); | |||
| __msa_st_w(_sum0, outptr + j * 4, 0); | |||
| } | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,731 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void im2col_sgemm_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| const int size = bottom_im2col.w; | |||
| const int maxk = bottom_im2col.h; | |||
| const int inch = bottom_im2col.c; | |||
| const int outch = top_blob.c; | |||
| // permute | |||
| Mat tmp; | |||
| #if __mips_msa | |||
| if (inch >= 4) | |||
| { | |||
| if (size >= 2) | |||
| tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); | |||
| else | |||
| tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); | |||
| } | |||
| else | |||
| { | |||
| if (size >= 2) | |||
| tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); | |||
| else | |||
| tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); | |||
| } | |||
| { | |||
| int remain_size_start = 0; | |||
| int nn_size = (size - remain_size_start) >> 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii = 0; ii < nn_size; ii++) | |||
| { | |||
| int i = remain_size_start + ii * 2; | |||
| signed char* tmpptr = tmp.channel(i / 2); | |||
| int q = 0; | |||
| for (; q + 3 < inch; q += 4) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; | |||
| const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; | |||
| const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img1[0]; | |||
| tmpptr[2] = img2[0]; | |||
| tmpptr[3] = img3[0]; | |||
| tmpptr[4] = img0[1]; | |||
| tmpptr[5] = img1[1]; | |||
| tmpptr[6] = img2[1]; | |||
| tmpptr[7] = img3[1]; | |||
| tmpptr += 8; | |||
| img0 += size; | |||
| img1 += size; | |||
| img2 += size; | |||
| img3 += size; | |||
| } | |||
| } | |||
| for (; q < inch; q++) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img0[1]; | |||
| tmpptr += 2; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| remain_size_start += nn_size << 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = remain_size_start; i < size; i++) | |||
| { | |||
| signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| int q = 0; | |||
| for (; q + 3 < inch; q += 4) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; | |||
| const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; | |||
| const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img1[0]; | |||
| tmpptr[2] = img2[0]; | |||
| tmpptr[3] = img3[0]; | |||
| tmpptr += 4; | |||
| img0 += size; | |||
| img1 += size; | |||
| img2 += size; | |||
| img3 += size; | |||
| } | |||
| } | |||
| for (; q < inch; q++) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr += 1; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #else // __mips_msa | |||
| tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| signed char* tmpptr = tmp.channel(i); | |||
| int q = 0; | |||
| for (; q < inch; q++) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr += 1; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif // __mips_msa | |||
| int nn_outch = 0; | |||
| int remain_outch_start = 0; | |||
| #if __mips_msa | |||
| nn_outch = outch >> 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| int* outptr0 = top_blob.channel(p); | |||
| int* outptr1 = top_blob.channel(p + 1); | |||
| int* outptr2 = top_blob.channel(p + 2); | |||
| int* outptr3 = top_blob.channel(p + 3); | |||
| int i = 0; | |||
| for (; i + 1 < size; i += 2) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2); | |||
| const signed char* kptr = kernel.channel(p / 4); | |||
| int nn4 = (inch / 4) * maxk; | |||
| int nn1 = (inch % 4) * maxk; | |||
| v4i32 _sum00 = __msa_fill_w(0); | |||
| v4i32 _sum10 = __msa_fill_w(0); | |||
| if (nn4 > 0) | |||
| { | |||
| v4i32 _sum01 = __msa_fill_w(0); | |||
| v4i32 _sum02 = __msa_fill_w(0); | |||
| v4i32 _sum03 = __msa_fill_w(0); | |||
| v4i32 _sum11 = __msa_fill_w(0); | |||
| v4i32 _sum12 = __msa_fill_w(0); | |||
| v4i32 _sum13 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn4; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01); | |||
| v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _s00 = __msa_mulv_h(_val0, _w0); | |||
| v8i16 _s01 = __msa_mulv_h(_val0, _w1); | |||
| v8i16 _s10 = __msa_mulv_h(_val1, _w0); | |||
| v8i16 _s11 = __msa_mulv_h(_val1, _w1); | |||
| v8i16 _exts00 = __msa_clti_s_h(_s00, 0); | |||
| v8i16 _exts01 = __msa_clti_s_h(_s01, 0); | |||
| v8i16 _exts10 = __msa_clti_s_h(_s10, 0); | |||
| v8i16 _exts11 = __msa_clti_s_h(_s11, 0); | |||
| v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00); | |||
| v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00); | |||
| v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01); | |||
| v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01); | |||
| v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10); | |||
| v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10); | |||
| v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11); | |||
| v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11); | |||
| _sum00 = __msa_addv_w(_sum00, _s00l); | |||
| _sum01 = __msa_addv_w(_sum01, _s00h); | |||
| _sum02 = __msa_addv_w(_sum02, _s01l); | |||
| _sum03 = __msa_addv_w(_sum03, _s01h); | |||
| _sum10 = __msa_addv_w(_sum10, _s10l); | |||
| _sum11 = __msa_addv_w(_sum11, _s10h); | |||
| _sum12 = __msa_addv_w(_sum12, _s11l); | |||
| _sum13 = __msa_addv_w(_sum13, _s11h); | |||
| tmpptr += 8; | |||
| kptr += 16; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum01, _sum00); | |||
| _tmp1 = __msa_ilvr_w(_sum03, _sum02); | |||
| _tmp2 = __msa_ilvl_w(_sum01, _sum00); | |||
| _tmp3 = __msa_ilvl_w(_sum03, _sum02); | |||
| _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum11, _sum10); | |||
| _tmp1 = __msa_ilvr_w(_sum13, _sum12); | |||
| _tmp2 = __msa_ilvl_w(_sum11, _sum10); | |||
| _tmp3 = __msa_ilvl_w(_sum13, _sum12); | |||
| _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum00 = __msa_addv_w(_sum00, _sum01); | |||
| _sum02 = __msa_addv_w(_sum02, _sum03); | |||
| _sum10 = __msa_addv_w(_sum10, _sum11); | |||
| _sum12 = __msa_addv_w(_sum12, _sum13); | |||
| _sum00 = __msa_addv_w(_sum00, _sum02); | |||
| _sum10 = __msa_addv_w(_sum10, _sum12); | |||
| } | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| v8i16 _val0 = __msa_fill_h(tmpptr[0]); | |||
| v8i16 _val1 = __msa_fill_h(tmpptr[1]); | |||
| v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16); | |||
| v8i16 _s0 = __msa_mulv_h(_val, _w16); | |||
| v8i16 _exts0 = __msa_clti_s_h(_s0, 0); | |||
| v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); | |||
| v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); | |||
| _sum00 = __msa_addv_w(_sum00, _s0l); | |||
| _sum10 = __msa_addv_w(_sum10, _s0h); | |||
| tmpptr += 2; | |||
| kptr += 4; | |||
| } | |||
| int sum[8]; | |||
| __msa_st_w(_sum00, sum, 0); | |||
| __msa_st_w(_sum10, sum + 4, 0); | |||
| outptr0[0] = sum[0]; | |||
| outptr1[0] = sum[1]; | |||
| outptr2[0] = sum[2]; | |||
| outptr3[0] = sum[3]; | |||
| outptr0[1] = sum[4]; | |||
| outptr1[1] = sum[5]; | |||
| outptr2[1] = sum[6]; | |||
| outptr3[1] = sum[7]; | |||
| outptr0 += 2; | |||
| outptr1 += 2; | |||
| outptr2 += 2; | |||
| outptr3 += 2; | |||
| } | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| const signed char* kptr = kernel.channel(p / 4); | |||
| int nn4 = (inch / 4) * maxk; | |||
| int nn1 = (inch % 4) * maxk; | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| if (nn4 > 0) | |||
| { | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn4; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w0); | |||
| v8i16 _s1 = __msa_mulv_h(_val16, _w1); | |||
| v8i16 _exts0 = __msa_clti_s_h(_s0, 0); | |||
| v8i16 _exts1 = __msa_clti_s_h(_s1, 0); | |||
| v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); | |||
| v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); | |||
| v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1); | |||
| v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1); | |||
| _sum0 = __msa_addv_w(_sum0, _s0l); | |||
| _sum1 = __msa_addv_w(_sum1, _s0h); | |||
| _sum2 = __msa_addv_w(_sum2, _s1l); | |||
| _sum3 = __msa_addv_w(_sum3, _s1h); | |||
| tmpptr += 4; | |||
| kptr += 16; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum1, _sum0); | |||
| _tmp1 = __msa_ilvr_w(_sum3, _sum2); | |||
| _tmp2 = __msa_ilvl_w(_sum1, _sum0); | |||
| _tmp3 = __msa_ilvl_w(_sum3, _sum2); | |||
| _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| _sum0 = __msa_addv_w(_sum0, _sum2); | |||
| } | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| v8i16 _val = __msa_fill_h(tmpptr[0]); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val, _w16); | |||
| v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); | |||
| _sum0 = __msa_addv_w(_sum0, _s032); | |||
| tmpptr += 1; | |||
| kptr += 4; | |||
| } | |||
| int sum[4]; | |||
| __msa_st_w(_sum0, sum, 0); | |||
| outptr0[0] = sum[0]; | |||
| outptr1[0] = sum[1]; | |||
| outptr2[0] = sum[2]; | |||
| outptr3[0] = sum[3]; | |||
| outptr0 += 1; | |||
| outptr1 += 1; | |||
| outptr2 += 1; | |||
| outptr3 += 1; | |||
| } | |||
| } | |||
| remain_outch_start += nn_outch << 2; | |||
| #endif // __mips_msa | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = remain_outch_start; p < outch; p++) | |||
| { | |||
| int* outptr0 = top_blob.channel(p); | |||
| int i = 0; | |||
| #if __mips_msa | |||
| for (; i + 1 < size; i += 2) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2); | |||
| const signed char* kptr = kernel.channel(p / 4 + p % 4); | |||
| int nn4 = (inch / 4) * maxk; | |||
| int nn1 = (inch % 4) * maxk; | |||
| int sum0 = 0; | |||
| int sum1 = 0; | |||
| if (nn4 > 0) | |||
| { | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn4; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w16); | |||
| v8i16 _exts0 = __msa_clti_s_h(_s0, 0); | |||
| v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); | |||
| v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); | |||
| _sum0 = __msa_addv_w(_sum0, _s0l); | |||
| _sum1 = __msa_addv_w(_sum1, _s0h); | |||
| tmpptr += 8; | |||
| kptr += 4; | |||
| } | |||
| sum0 = _sum0[0] + _sum0[1] + _sum0[2] + _sum0[3]; | |||
| sum1 = _sum1[0] + _sum1[1] + _sum1[2] + _sum1[3]; | |||
| } | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| signed char val0 = tmpptr[0]; | |||
| signed char val1 = tmpptr[1]; | |||
| signed char w = kptr[0]; | |||
| sum0 += val0 * w; | |||
| sum1 += val1 * w; | |||
| tmpptr += 2; | |||
| kptr += 1; | |||
| } | |||
| outptr0[0] = sum0; | |||
| outptr0[1] = sum1; | |||
| outptr0 += 2; | |||
| } | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| const signed char* kptr = kernel.channel(p / 4 + p % 4); | |||
| int nn4 = (inch / 4) * maxk; | |||
| int nn1 = (inch % 4) * maxk; | |||
| int sum = 0; | |||
| if (nn4 > 0) | |||
| { | |||
| v4i32 _sum = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn4; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w16); | |||
| v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); | |||
| _sum = __msa_addv_w(_sum, _s032); | |||
| tmpptr += 4; | |||
| kptr += 4; | |||
| } | |||
| sum = _sum[0] + _sum[1] + _sum[2] + _sum[3]; | |||
| } | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| signed char val = tmpptr[0]; | |||
| signed char w = kptr[0]; | |||
| sum += val * w; | |||
| tmpptr += 1; | |||
| kptr += 1; | |||
| } | |||
| outptr0[0] = sum; | |||
| outptr0 += 1; | |||
| } | |||
| #else // __mips_msa | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i); | |||
| const signed char* kptr = kernel.channel(p); | |||
| int nn1 = inch * maxk; | |||
| int sum = 0; | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| signed char val = tmpptr[0]; | |||
| signed char w = kptr[0]; | |||
| sum += val * w; | |||
| tmpptr += 1; | |||
| kptr += 1; | |||
| } | |||
| outptr0[0] = sum; | |||
| outptr0 += 1; | |||
| } | |||
| #endif // __mips_msa | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_transform_kernel_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| #if __mips_msa | |||
| // interleave | |||
| // src = maxk-inch-outch | |||
| // dst = 4a-4b-maxk-inch/4a-outch/4b | |||
| Mat kernel = _kernel.reshape(maxk, inch, outch); | |||
| if (outch >= 4) | |||
| { | |||
| if (inch >= 4) | |||
| kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u); | |||
| else | |||
| kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u); | |||
| } | |||
| else | |||
| { | |||
| if (inch >= 4) | |||
| kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u); | |||
| else | |||
| kernel_tm.create(1 * maxk, inch, outch, (size_t)1u); | |||
| } | |||
| int q = 0; | |||
| for (; q + 3 < outch; q += 4) | |||
| { | |||
| signed char* g00 = kernel_tm.channel(q / 4); | |||
| int p = 0; | |||
| for (; p + 3 < inch; p += 4) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < 4; i++) | |||
| { | |||
| for (int j = 0; j < 4; j++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| for (; p < inch; p++) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < 4; i++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // TODO unroll 2 | |||
| for (; q < outch; q++) | |||
| { | |||
| signed char* g00 = kernel_tm.channel(q / 4 + q % 4); | |||
| int p = 0; | |||
| for (; p + 3 < inch; p += 4) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int j = 0; j < 4; j++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| for (; p < inch; p++) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q).row<const signed char>(p); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| #else // __mips_msa | |||
| kernel_tm = _kernel.reshape(maxk, inch, outch); | |||
| #endif // __mips_msa | |||
| } | |||
| static void convolution_im2col_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); | |||
| { | |||
| const int gap = w * stride_h - outw * stride_w; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| signed char* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < kernel_h; u++) | |||
| { | |||
| for (int v = 0; v < kernel_w; v++) | |||
| { | |||
| const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[stride_w]; | |||
| ptr[2] = sptr[stride_w * 2]; | |||
| ptr[3] = sptr[stride_w * 3]; | |||
| sptr += stride_w * 4; | |||
| ptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[stride_w]; | |||
| sptr += stride_w * 2; | |||
| ptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += stride_w; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -0,0 +1,477 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| const int size = bottom_im2col.w; | |||
| const int maxk = bottom_im2col.h; | |||
| const int inch = bottom_im2col.c; | |||
| const int outch = top_blob.c; | |||
| // permute | |||
| Mat tmp; | |||
| if (inch >= 4) | |||
| { | |||
| if (size >= 2) | |||
| tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); | |||
| else | |||
| tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); | |||
| } | |||
| else | |||
| { | |||
| if (size >= 2) | |||
| tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); | |||
| else | |||
| tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); | |||
| } | |||
| { | |||
| int remain_size_start = 0; | |||
| int nn_size = (size - remain_size_start) >> 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii = 0; ii < nn_size; ii++) | |||
| { | |||
| int i = remain_size_start + ii * 2; | |||
| signed char* tmpptr = tmp.channel(i / 2); | |||
| int q = 0; | |||
| for (; q + 3 < inch; q += 4) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; | |||
| const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; | |||
| const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img1[0]; | |||
| tmpptr[2] = img2[0]; | |||
| tmpptr[3] = img3[0]; | |||
| tmpptr[4] = img0[1]; | |||
| tmpptr[5] = img1[1]; | |||
| tmpptr[6] = img2[1]; | |||
| tmpptr[7] = img3[1]; | |||
| tmpptr += 8; | |||
| img0 += size; | |||
| img1 += size; | |||
| img2 += size; | |||
| img3 += size; | |||
| } | |||
| } | |||
| for (; q < inch; q++) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img0[1]; | |||
| tmpptr += 2; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| remain_size_start += nn_size << 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = remain_size_start; i < size; i++) | |||
| { | |||
| signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| int q = 0; | |||
| for (; q + 3 < inch; q += 4) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; | |||
| const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; | |||
| const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img1[0]; | |||
| tmpptr[2] = img2[0]; | |||
| tmpptr[3] = img3[0]; | |||
| tmpptr += 4; | |||
| img0 += size; | |||
| img1 += size; | |||
| img2 += size; | |||
| img3 += size; | |||
| } | |||
| } | |||
| for (; q < inch; q++) | |||
| { | |||
| const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr += 1; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* outptr0 = top_blob.channel(p); | |||
| int i = 0; | |||
| for (; i + 1 < size; i += 2) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2); | |||
| const signed char* kptr = kernel.channel(p); | |||
| int nn4 = (inch / 4) * maxk; | |||
| int nn1 = (inch % 4) * maxk; | |||
| v4i32 _sum00 = __msa_fill_w(0); | |||
| v4i32 _sum10 = __msa_fill_w(0); | |||
| if (nn4 > 0) | |||
| { | |||
| v4i32 _sum01 = __msa_fill_w(0); | |||
| v4i32 _sum02 = __msa_fill_w(0); | |||
| v4i32 _sum03 = __msa_fill_w(0); | |||
| v4i32 _sum11 = __msa_fill_w(0); | |||
| v4i32 _sum12 = __msa_fill_w(0); | |||
| v4i32 _sum13 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn4; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01); | |||
| v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _s00 = __msa_mulv_h(_val0, _w0); | |||
| v8i16 _s01 = __msa_mulv_h(_val0, _w1); | |||
| v8i16 _s10 = __msa_mulv_h(_val1, _w0); | |||
| v8i16 _s11 = __msa_mulv_h(_val1, _w1); | |||
| v8i16 _exts00 = __msa_clti_s_h(_s00, 0); | |||
| v8i16 _exts01 = __msa_clti_s_h(_s01, 0); | |||
| v8i16 _exts10 = __msa_clti_s_h(_s10, 0); | |||
| v8i16 _exts11 = __msa_clti_s_h(_s11, 0); | |||
| v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00); | |||
| v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00); | |||
| v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01); | |||
| v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01); | |||
| v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10); | |||
| v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10); | |||
| v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11); | |||
| v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11); | |||
| _sum00 = __msa_addv_w(_sum00, _s00l); | |||
| _sum01 = __msa_addv_w(_sum01, _s00h); | |||
| _sum02 = __msa_addv_w(_sum02, _s01l); | |||
| _sum03 = __msa_addv_w(_sum03, _s01h); | |||
| _sum10 = __msa_addv_w(_sum10, _s10l); | |||
| _sum11 = __msa_addv_w(_sum11, _s10h); | |||
| _sum12 = __msa_addv_w(_sum12, _s11l); | |||
| _sum13 = __msa_addv_w(_sum13, _s11h); | |||
| tmpptr += 8; | |||
| kptr += 16; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum01, _sum00); | |||
| _tmp1 = __msa_ilvr_w(_sum03, _sum02); | |||
| _tmp2 = __msa_ilvl_w(_sum01, _sum00); | |||
| _tmp3 = __msa_ilvl_w(_sum03, _sum02); | |||
| _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum11, _sum10); | |||
| _tmp1 = __msa_ilvr_w(_sum13, _sum12); | |||
| _tmp2 = __msa_ilvl_w(_sum11, _sum10); | |||
| _tmp3 = __msa_ilvl_w(_sum13, _sum12); | |||
| _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum00 = __msa_addv_w(_sum00, _sum01); | |||
| _sum02 = __msa_addv_w(_sum02, _sum03); | |||
| _sum10 = __msa_addv_w(_sum10, _sum11); | |||
| _sum12 = __msa_addv_w(_sum12, _sum13); | |||
| _sum00 = __msa_addv_w(_sum00, _sum02); | |||
| _sum10 = __msa_addv_w(_sum10, _sum12); | |||
| } | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| v8i16 _val0 = __msa_fill_h(tmpptr[0]); | |||
| v8i16 _val1 = __msa_fill_h(tmpptr[1]); | |||
| v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| _w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16); | |||
| v8i16 _s0 = __msa_mulv_h(_val, _w16); | |||
| v8i16 _exts0 = __msa_clti_s_h(_s0, 0); | |||
| v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); | |||
| v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); | |||
| _sum00 = __msa_addv_w(_sum00, _s0l); | |||
| _sum10 = __msa_addv_w(_sum10, _s0h); | |||
| tmpptr += 2; | |||
| kptr += 4; | |||
| } | |||
| __msa_st_w(_sum00, outptr0, 0); | |||
| __msa_st_w(_sum10, outptr0 + 4, 0); | |||
| outptr0 += 8; | |||
| } | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| const signed char* kptr = kernel.channel(p); | |||
| int nn4 = (inch / 4) * maxk; | |||
| int nn1 = (inch % 4) * maxk; | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| if (nn4 > 0) | |||
| { | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn4; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| _val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w0); | |||
| v8i16 _s1 = __msa_mulv_h(_val16, _w1); | |||
| v8i16 _exts0 = __msa_clti_s_h(_s0, 0); | |||
| v8i16 _exts1 = __msa_clti_s_h(_s1, 0); | |||
| v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0); | |||
| v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0); | |||
| v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1); | |||
| v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1); | |||
| _sum0 = __msa_addv_w(_sum0, _s0l); | |||
| _sum1 = __msa_addv_w(_sum1, _s0h); | |||
| _sum2 = __msa_addv_w(_sum2, _s1l); | |||
| _sum3 = __msa_addv_w(_sum3, _s1h); | |||
| tmpptr += 4; | |||
| kptr += 16; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum1, _sum0); | |||
| _tmp1 = __msa_ilvr_w(_sum3, _sum2); | |||
| _tmp2 = __msa_ilvl_w(_sum1, _sum0); | |||
| _tmp3 = __msa_ilvl_w(_sum3, _sum2); | |||
| _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| _sum0 = __msa_addv_w(_sum0, _sum2); | |||
| } | |||
| int j = 0; | |||
| for (; j < nn1; j++) | |||
| { | |||
| v8i16 _val = __msa_fill_h(tmpptr[0]); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val, _w16); | |||
| v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0); | |||
| _sum0 = __msa_addv_w(_sum0, _s032); | |||
| tmpptr += 1; | |||
| kptr += 4; | |||
| } | |||
| __msa_st_w(_sum0, outptr0, 0); | |||
| outptr0 += 4; | |||
| } | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| // interleave | |||
| // src = maxk-inch-outch | |||
| // dst = 4a-4b-maxk-inch/4a-outch/4b | |||
| Mat kernel = _kernel.reshape(maxk, inch, outch); | |||
| if (inch >= 4) | |||
| kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u); | |||
| else | |||
| kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u); | |||
| for (int q = 0; q + 3 < outch; q += 4) | |||
| { | |||
| signed char* g00 = kernel_tm.channel(q / 4); | |||
| int p = 0; | |||
| for (; p + 3 < inch; p += 4) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < 4; i++) | |||
| { | |||
| for (int j = 0; j < 4; j++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| for (; p < inch; p++) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < 4; i++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); | |||
| { | |||
| const int gap = w * stride_h - outw * stride_w; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| signed char* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < kernel_h; u++) | |||
| { | |||
| for (int v = 0; v < kernel_w; v++) | |||
| { | |||
| const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[stride_w]; | |||
| ptr[2] = sptr[stride_w * 2]; | |||
| ptr[3] = sptr[stride_w * 3]; | |||
| sptr += stride_w * 4; | |||
| ptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[stride_w]; | |||
| sptr += stride_w * 2; | |||
| ptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += stride_w; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -550,7 +550,7 @@ static void im2col_sgemm_pack4to1_msa(const Mat& bottom_im2col, Mat& top_blob, c | |||
| kptr0 += 4; | |||
| } | |||
| sum0 += __msa_fhadd_w(_sum0); | |||
| sum0 += __msa_reduce_fadd_w(_sum0); | |||
| outptr0[0] = sum0; | |||
| @@ -0,0 +1,450 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| const int size = bottom_im2col.w; | |||
| const int maxk = bottom_im2col.h; | |||
| const int inch = bottom_im2col.c; | |||
| const int outch = top_blob.c; | |||
| // permute | |||
| Mat tmp; | |||
| if (size >= 2) | |||
| tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); | |||
| else | |||
| tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); | |||
| { | |||
| int remain_size_start = 0; | |||
| int nn_size = (size - remain_size_start) >> 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii = 0; ii < nn_size; ii++) | |||
| { | |||
| int i = remain_size_start + ii * 2; | |||
| int64_t* tmpptr = tmp.channel(i / 2); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| v16i8 _v = __msa_ld_b(img0, 0); | |||
| __msa_st_b(_v, tmpptr, 0); | |||
| tmpptr += 2; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| remain_size_start += nn_size << 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = remain_size_start; i < size; i++) | |||
| { | |||
| int64_t* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr += 1; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| int nn_outch = 0; | |||
| int remain_outch_start = 0; | |||
| nn_outch = outch >> 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp = 0; pp < nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| int* outptr0 = top_blob.channel(p); | |||
| int* outptr1 = top_blob.channel(p + 1); | |||
| int* outptr2 = top_blob.channel(p + 2); | |||
| int* outptr3 = top_blob.channel(p + 3); | |||
| int i = 0; | |||
| for (; i + 1 < size; i += 2) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2); | |||
| const signed char* kptr = kernel.channel(p / 4); | |||
| int nn = inch * maxk; // inch always > 0 | |||
| v4i32 _sum00 = __msa_fill_w(0); | |||
| v4i32 _sum01 = __msa_fill_w(0); | |||
| v4i32 _sum02 = __msa_fill_w(0); | |||
| v4i32 _sum03 = __msa_fill_w(0); | |||
| v4i32 _sum10 = __msa_fill_w(0); | |||
| v4i32 _sum11 = __msa_fill_w(0); | |||
| v4i32 _sum12 = __msa_fill_w(0); | |||
| v4i32 _sum13 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn; j++) | |||
| { | |||
| v16i8 _val01 = __msa_ld_b(tmpptr, 0); | |||
| v16i8 _extval01 = __msa_clti_s_b(_val01, 0); | |||
| v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01); | |||
| v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _w23 = __msa_ld_b(kptr + 16, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v16i8 _extw23 = __msa_clti_s_b(_w23, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); | |||
| v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); | |||
| v8i16 _s00 = __msa_mulv_h(_val0, _w0); | |||
| v8i16 _s01 = __msa_mulv_h(_val0, _w1); | |||
| v8i16 _s02 = __msa_mulv_h(_val0, _w2); | |||
| v8i16 _s03 = __msa_mulv_h(_val0, _w3); | |||
| v8i16 _s10 = __msa_mulv_h(_val1, _w0); | |||
| v8i16 _s11 = __msa_mulv_h(_val1, _w1); | |||
| v8i16 _s12 = __msa_mulv_h(_val1, _w2); | |||
| v8i16 _s13 = __msa_mulv_h(_val1, _w3); | |||
| _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00)); | |||
| _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01)); | |||
| _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02)); | |||
| _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03)); | |||
| _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10)); | |||
| _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11)); | |||
| _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12)); | |||
| _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13)); | |||
| tmpptr += 16; | |||
| kptr += 32; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum01, _sum00); | |||
| _tmp1 = __msa_ilvr_w(_sum03, _sum02); | |||
| _tmp2 = __msa_ilvl_w(_sum01, _sum00); | |||
| _tmp3 = __msa_ilvl_w(_sum03, _sum02); | |||
| _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum11, _sum10); | |||
| _tmp1 = __msa_ilvr_w(_sum13, _sum12); | |||
| _tmp2 = __msa_ilvl_w(_sum11, _sum10); | |||
| _tmp3 = __msa_ilvl_w(_sum13, _sum12); | |||
| _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum00 = __msa_addv_w(_sum00, _sum01); | |||
| _sum02 = __msa_addv_w(_sum02, _sum03); | |||
| _sum10 = __msa_addv_w(_sum10, _sum11); | |||
| _sum12 = __msa_addv_w(_sum12, _sum13); | |||
| _sum00 = __msa_addv_w(_sum00, _sum02); | |||
| _sum10 = __msa_addv_w(_sum10, _sum12); | |||
| int sum[8]; | |||
| __msa_st_w(_sum00, sum, 0); | |||
| __msa_st_w(_sum10, sum + 4, 0); | |||
| outptr0[0] = sum[0]; | |||
| outptr1[0] = sum[1]; | |||
| outptr2[0] = sum[2]; | |||
| outptr3[0] = sum[3]; | |||
| outptr0[1] = sum[4]; | |||
| outptr1[1] = sum[5]; | |||
| outptr2[1] = sum[6]; | |||
| outptr3[1] = sum[7]; | |||
| outptr0 += 2; | |||
| outptr1 += 2; | |||
| outptr2 += 2; | |||
| outptr3 += 2; | |||
| } | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| const signed char* kptr = kernel.channel(p / 4); | |||
| int nn = inch * maxk; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _w23 = __msa_ld_b(kptr + 16, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v16i8 _extw23 = __msa_clti_s_b(_w23, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); | |||
| v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w0); | |||
| v8i16 _s1 = __msa_mulv_h(_val16, _w1); | |||
| v8i16 _s2 = __msa_mulv_h(_val16, _w2); | |||
| v8i16 _s3 = __msa_mulv_h(_val16, _w3); | |||
| _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); | |||
| _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); | |||
| _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2)); | |||
| _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3)); | |||
| tmpptr += 8; | |||
| kptr += 32; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum1, _sum0); | |||
| _tmp1 = __msa_ilvr_w(_sum3, _sum2); | |||
| _tmp2 = __msa_ilvl_w(_sum1, _sum0); | |||
| _tmp3 = __msa_ilvl_w(_sum3, _sum2); | |||
| _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| _sum0 = __msa_addv_w(_sum0, _sum2); | |||
| int sum[4]; | |||
| __msa_st_w(_sum0, sum, 0); | |||
| outptr0[0] = sum[0]; | |||
| outptr1[0] = sum[1]; | |||
| outptr2[0] = sum[2]; | |||
| outptr3[0] = sum[3]; | |||
| outptr0 += 1; | |||
| outptr1 += 1; | |||
| outptr2 += 1; | |||
| outptr3 += 1; | |||
| } | |||
| } | |||
| remain_outch_start += nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = remain_outch_start; p < outch; p++) | |||
| { | |||
| int* outptr0 = top_blob.channel(p); | |||
| int i = 0; | |||
| for (; i + 1 < size; i += 2) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2); | |||
| const signed char* kptr = kernel.channel(p / 4 + p % 4); | |||
| int nn = inch * maxk; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn; j++) | |||
| { | |||
| v16i8 _val01 = __msa_ld_b(tmpptr, 0); | |||
| v16i8 _extval01 = __msa_clti_s_b(_val01, 0); | |||
| v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01); | |||
| v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val0, _w16); | |||
| v8i16 _s1 = __msa_mulv_h(_val1, _w16); | |||
| _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); | |||
| _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); | |||
| tmpptr += 16; | |||
| kptr += 8; | |||
| } | |||
| outptr0[0] = __msa_reduce_add_w(_sum0); | |||
| outptr0[1] = __msa_reduce_add_w(_sum1); | |||
| outptr0 += 2; | |||
| } | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| const signed char* kptr = kernel.channel(p / 4 + p % 4); | |||
| int nn = inch * maxk; // inch always > 0 | |||
| v4i32 _sum = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w = __msa_ld_b(kptr, 0); | |||
| v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w16); | |||
| _sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0)); | |||
| tmpptr += 8; | |||
| kptr += 8; | |||
| } | |||
| outptr0[0] = __msa_reduce_add_w(_sum); | |||
| outptr0 += 1; | |||
| } | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| // interleave | |||
| // src = maxk-inch-outch | |||
| // dst = 8a-4b-maxk-inch/8a-outch/4b | |||
| Mat kernel = _kernel.reshape(maxk, inch, outch); | |||
| if (outch >= 4) | |||
| kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u); | |||
| else | |||
| kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u); | |||
| int q = 0; | |||
| for (; q + 3 < outch; q += 4) | |||
| { | |||
| signed char* g00 = kernel_tm.channel(q / 4); | |||
| for (int p = 0; p + 7 < inch; p += 8) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < 4; i++) | |||
| { | |||
| for (int j = 0; j < 8; j++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // TODO unroll 2 | |||
| for (; q < outch; q++) | |||
| { | |||
| signed char* g00 = kernel_tm.channel(q / 4 + q % 4); | |||
| for (int p = 0; p + 7 < inch; p += 8) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int j = 0; j < 8; j++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| { | |||
| const int gap = w * stride_h - outw * stride_w; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| int64_t* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < kernel_h; u++) | |||
| { | |||
| for (int v = 0; v < kernel_w; v++) | |||
| { | |||
| const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += stride_w; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -0,0 +1,320 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| const int size = bottom_im2col.w; | |||
| const int maxk = bottom_im2col.h; | |||
| const int inch = bottom_im2col.c; | |||
| const int outch = top_blob.c; | |||
| // permute | |||
| Mat tmp; | |||
| if (size >= 2) | |||
| tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); | |||
| else | |||
| tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); | |||
| { | |||
| int remain_size_start = 0; | |||
| int nn_size = size >> 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii = 0; ii < nn_size; ii++) | |||
| { | |||
| int i = remain_size_start + ii * 2; | |||
| int64_t* tmpptr = tmp.channel(i / 2); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| v16i8 _v = __msa_ld_b(img0, 0); | |||
| __msa_st_b(_v, tmpptr, 0); | |||
| tmpptr += 2; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| remain_size_start += nn_size << 1; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i = remain_size_start; i < size; i++) | |||
| { | |||
| int64_t* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr += 1; | |||
| img0 += size; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| int* outptr0 = top_blob.channel(p); | |||
| int i = 0; | |||
| for (; i + 1 < size; i += 2) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2); | |||
| const signed char* kptr = kernel.channel(p); | |||
| int nn = inch * maxk; // inch always > 0 | |||
| v4i32 _sum00 = __msa_fill_w(0); | |||
| v4i32 _sum01 = __msa_fill_w(0); | |||
| v4i32 _sum02 = __msa_fill_w(0); | |||
| v4i32 _sum03 = __msa_fill_w(0); | |||
| v4i32 _sum10 = __msa_fill_w(0); | |||
| v4i32 _sum11 = __msa_fill_w(0); | |||
| v4i32 _sum12 = __msa_fill_w(0); | |||
| v4i32 _sum13 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn; j++) | |||
| { | |||
| v16i8 _val01 = __msa_ld_b(tmpptr, 0); | |||
| v16i8 _extval01 = __msa_clti_s_b(_val01, 0); | |||
| v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01); | |||
| v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _w23 = __msa_ld_b(kptr + 16, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v16i8 _extw23 = __msa_clti_s_b(_w23, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); | |||
| v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); | |||
| v8i16 _s00 = __msa_mulv_h(_val0, _w0); | |||
| v8i16 _s01 = __msa_mulv_h(_val0, _w1); | |||
| v8i16 _s02 = __msa_mulv_h(_val0, _w2); | |||
| v8i16 _s03 = __msa_mulv_h(_val0, _w3); | |||
| v8i16 _s10 = __msa_mulv_h(_val1, _w0); | |||
| v8i16 _s11 = __msa_mulv_h(_val1, _w1); | |||
| v8i16 _s12 = __msa_mulv_h(_val1, _w2); | |||
| v8i16 _s13 = __msa_mulv_h(_val1, _w3); | |||
| _sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00)); | |||
| _sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01)); | |||
| _sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02)); | |||
| _sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03)); | |||
| _sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10)); | |||
| _sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11)); | |||
| _sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12)); | |||
| _sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13)); | |||
| tmpptr += 16; | |||
| kptr += 32; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum01, _sum00); | |||
| _tmp1 = __msa_ilvr_w(_sum03, _sum02); | |||
| _tmp2 = __msa_ilvl_w(_sum01, _sum00); | |||
| _tmp3 = __msa_ilvl_w(_sum03, _sum02); | |||
| _sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum11, _sum10); | |||
| _tmp1 = __msa_ilvr_w(_sum13, _sum12); | |||
| _tmp2 = __msa_ilvl_w(_sum11, _sum10); | |||
| _tmp3 = __msa_ilvl_w(_sum13, _sum12); | |||
| _sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum00 = __msa_addv_w(_sum00, _sum01); | |||
| _sum02 = __msa_addv_w(_sum02, _sum03); | |||
| _sum10 = __msa_addv_w(_sum10, _sum11); | |||
| _sum12 = __msa_addv_w(_sum12, _sum13); | |||
| _sum00 = __msa_addv_w(_sum00, _sum02); | |||
| _sum10 = __msa_addv_w(_sum10, _sum12); | |||
| __msa_st_w(_sum00, outptr0, 0); | |||
| __msa_st_w(_sum10, outptr0 + 4, 0); | |||
| outptr0 += 8; | |||
| } | |||
| for (; i < size; i++) | |||
| { | |||
| const signed char* tmpptr = tmp.channel(i / 2 + i % 2); | |||
| const signed char* kptr = kernel.channel(p); | |||
| int nn = inch * maxk; // inch always > 0 | |||
| v4i32 _sum0 = __msa_fill_w(0); | |||
| v4i32 _sum1 = __msa_fill_w(0); | |||
| v4i32 _sum2 = __msa_fill_w(0); | |||
| v4i32 _sum3 = __msa_fill_w(0); | |||
| int j = 0; | |||
| for (; j < nn; j++) | |||
| { | |||
| v16i8 _val = __msa_ld_b(tmpptr, 0); | |||
| v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val); | |||
| v16i8 _w01 = __msa_ld_b(kptr, 0); | |||
| v16i8 _w23 = __msa_ld_b(kptr + 16, 0); | |||
| v16i8 _extw01 = __msa_clti_s_b(_w01, 0); | |||
| v16i8 _extw23 = __msa_clti_s_b(_w23, 0); | |||
| v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01); | |||
| v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01); | |||
| v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23); | |||
| v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23); | |||
| v8i16 _s0 = __msa_mulv_h(_val16, _w0); | |||
| v8i16 _s1 = __msa_mulv_h(_val16, _w1); | |||
| v8i16 _s2 = __msa_mulv_h(_val16, _w2); | |||
| v8i16 _s3 = __msa_mulv_h(_val16, _w3); | |||
| _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0)); | |||
| _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1)); | |||
| _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2)); | |||
| _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3)); | |||
| tmpptr += 8; | |||
| kptr += 32; | |||
| } | |||
| // transpose 4x4 | |||
| { | |||
| v4i32 _tmp0, _tmp1, _tmp2, _tmp3; | |||
| _tmp0 = __msa_ilvr_w(_sum1, _sum0); | |||
| _tmp1 = __msa_ilvr_w(_sum3, _sum2); | |||
| _tmp2 = __msa_ilvl_w(_sum1, _sum0); | |||
| _tmp3 = __msa_ilvl_w(_sum3, _sum2); | |||
| _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0); | |||
| _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2); | |||
| } | |||
| _sum0 = __msa_addv_w(_sum0, _sum1); | |||
| _sum2 = __msa_addv_w(_sum2, _sum3); | |||
| _sum0 = __msa_addv_w(_sum0, _sum2); | |||
| __msa_st_w(_sum0, outptr0, 0); | |||
| outptr0 += 4; | |||
| } | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| // interleave | |||
| // src = maxk-inch-outch | |||
| // dst = 8a-4b-maxk-inch/8a-outch/4b | |||
| Mat kernel = _kernel.reshape(maxk, inch, outch); | |||
| kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u); | |||
| for (int q = 0; q + 3 < outch; q += 4) | |||
| { | |||
| signed char* g00 = kernel_tm.channel(q / 4); | |||
| for (int p = 0; p + 7 < inch; p += 8) | |||
| { | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| for (int i = 0; i < 4; i++) | |||
| { | |||
| for (int j = 0; j < 8; j++) | |||
| { | |||
| const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); | |||
| g00[0] = k00[k]; | |||
| g00++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = kernel_w * kernel_h; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| { | |||
| const int gap = w * stride_h - outw * stride_w; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| int64_t* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < kernel_h; u++) | |||
| { | |||
| for (int v = 0; v < kernel_w; v++) | |||
| { | |||
| const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += stride_w; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -88,7 +88,7 @@ static void deconvolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, co | |||
| kptr += maxk * 4; | |||
| } | |||
| sum += __msa_fhadd_w(_sum); | |||
| sum += __msa_reduce_fadd_w(_sum); | |||
| sum = activation_ss(sum, activation_type, activation_params); | |||
| @@ -300,10 +300,10 @@ int InnerProduct_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| } | |||
| #if __mips_msa | |||
| sum0 += __msa_fhadd_w(_sum0); | |||
| sum1 += __msa_fhadd_w(_sum1); | |||
| sum2 += __msa_fhadd_w(_sum2); | |||
| sum3 += __msa_fhadd_w(_sum3); | |||
| sum0 += __msa_reduce_fadd_w(_sum0); | |||
| sum1 += __msa_reduce_fadd_w(_sum1); | |||
| sum2 += __msa_reduce_fadd_w(_sum2); | |||
| sum3 += __msa_reduce_fadd_w(_sum3); | |||
| #endif // __mips_msa | |||
| sum0 = activation_ss(sum0, activation_type, activation_params); | |||
| @@ -38,19 +38,25 @@ typedef union | |||
| static const ncnn::FloatInt Name = {.f = Val} | |||
| /* float type data load instructions */ | |||
| static inline v4f32 __msa_fill_w_f32(float val) | |||
| static NCNN_FORCEINLINE v4f32 __msa_fill_w_f32(float val) | |||
| { | |||
| ncnn::FloatInt fi_tmpval = {.f = val}; | |||
| return (v4f32)__msa_fill_w(fi_tmpval.i); | |||
| } | |||
| static inline float __msa_fhadd_w(v4f32 _v) | |||
| static NCNN_FORCEINLINE float __msa_reduce_fadd_w(v4f32 _v) | |||
| { | |||
| // TODO find a more efficient way | |||
| return _v[0] + _v[1] + _v[2] + _v[3]; | |||
| } | |||
| static inline int __msa_cfcmsa_msacsr() | |||
| static NCNN_FORCEINLINE int __msa_reduce_add_w(v4i32 _v) | |||
| { | |||
| // TODO find a more efficient way | |||
| return _v[0] + _v[1] + _v[2] + _v[3]; | |||
| } | |||
| static NCNN_FORCEINLINE int __msa_cfcmsa_msacsr() | |||
| { | |||
| int v; | |||
| asm volatile("cfcmsa %0, $1 \n" | |||
| @@ -60,7 +66,7 @@ static inline int __msa_cfcmsa_msacsr() | |||
| return v; | |||
| } | |||
| static inline void __msa_ctcmsa_msacsr(int v) | |||
| static NCNN_FORCEINLINE void __msa_ctcmsa_msacsr(int v) | |||
| { | |||
| asm volatile("ctcmsa $1, %0 \n" | |||
| : | |||
| @@ -69,7 +75,7 @@ static inline void __msa_ctcmsa_msacsr(int v) | |||
| } | |||
| #endif // __mips_msa | |||
| static inline signed char float2int8(float v) | |||
| static NCNN_FORCEINLINE signed char float2int8(float v) | |||
| { | |||
| int int32 = round(v); | |||
| if (int32 > 127) return 127; | |||
| @@ -78,7 +84,7 @@ static inline signed char float2int8(float v) | |||
| } | |||
| #if __mips_msa | |||
| static inline v16i8 float2int8(v4f32 _v) | |||
| static NCNN_FORCEINLINE v16i8 float2int8(v4f32 _v) | |||
| { | |||
| // simulate round to nearest via +/-0.5 | |||
| v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); | |||
| @@ -98,7 +104,7 @@ static inline v16i8 float2int8(v4f32 _v) | |||
| return _v8; | |||
| } | |||
| static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh) | |||
| static NCNN_FORCEINLINE int64_t float2int8(v4f32 _vlow, v4f32 _vhigh) | |||
| { | |||
| // simulate round to nearest via +/-0.5 | |||
| v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); | |||
| @@ -123,7 +129,7 @@ static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh) | |||
| return _v8[0]; | |||
| } | |||
| static inline v16i8 float2int8relu(v4f32 _v) | |||
| static NCNN_FORCEINLINE v16i8 float2int8relu(v4f32 _v) | |||
| { | |||
| // simulate round to nearest via +/-0.5 | |||
| v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); | |||
| @@ -143,7 +149,7 @@ static inline v16i8 float2int8relu(v4f32 _v) | |||
| return _v8; | |||
| } | |||
| static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh) | |||
| static NCNN_FORCEINLINE int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh) | |||
| { | |||
| // simulate round to nearest via +/-0.5 | |||
| v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f); | |||
| @@ -168,7 +174,7 @@ static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh) | |||
| return _v8[0]; | |||
| } | |||
| static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope) | |||
| static NCNN_FORCEINLINE v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope) | |||
| { | |||
| v4f32 _v_leaky = __msa_fmul_w(_v, _slope); | |||
| @@ -199,7 +205,7 @@ static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope) | |||
| return _v8; | |||
| } | |||
| static inline int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope) | |||
| static NCNN_FORCEINLINE int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope) | |||
| { | |||
| v4f32 _vlow_leaky = __msa_fmul_w(_vlow, _slope); | |||
| v4f32 _vhigh_leaky = __msa_fmul_w(_vhigh, _slope); | |||