Browse Source

mips msa optimization for convolution int8 (#3675)

* basic mips msa optimization for convolution int8

* mips msa optimization for convolution int8 gemm

* mips msa optimization for convolution int8 winograd pack8to4/pack8to1

* mention msa maddv/msubv intrinsics bug
tags/20220420
nihui GitHub 4 years ago
parent
commit
c09d7b3591
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 4516 additions and 48 deletions
  1. +28
    -2
      docs/how-to-build/how-to-build.md
  2. +1
    -1
      src/layer/mips/convolution1d_mips.cpp
  3. +83
    -0
      src/layer/mips/convolution_1x1_int8.h
  4. +83
    -0
      src/layer/mips/convolution_1x1_pack1to4_int8.h
  5. +65
    -0
      src/layer/mips/convolution_1x1_pack8to1_int8.h
  6. +65
    -0
      src/layer/mips/convolution_1x1_pack8to4_int8.h
  7. +731
    -0
      src/layer/mips/convolution_3x3_pack8to1_int8.h
  8. +629
    -0
      src/layer/mips/convolution_3x3_pack8to4_int8.h
  9. +82
    -0
      src/layer/mips/convolution_int8.h
  10. +436
    -25
      src/layer/mips/convolution_mips.cpp
  11. +17
    -2
      src/layer/mips/convolution_mips.h
  12. +87
    -0
      src/layer/mips/convolution_pack1to4_int8.h
  13. +1
    -1
      src/layer/mips/convolution_pack4to1.h
  14. +87
    -0
      src/layer/mips/convolution_pack8to1_int8.h
  15. +120
    -0
      src/layer/mips/convolution_pack8to4_int8.h
  16. +731
    -0
      src/layer/mips/convolution_sgemm_int8.h
  17. +477
    -0
      src/layer/mips/convolution_sgemm_pack1to4_int8.h
  18. +1
    -1
      src/layer/mips/convolution_sgemm_pack4to1.h
  19. +450
    -0
      src/layer/mips/convolution_sgemm_pack8to1_int8.h
  20. +320
    -0
      src/layer/mips/convolution_sgemm_pack8to4_int8.h
  21. +1
    -1
      src/layer/mips/deconvolution_pack4to1.h
  22. +4
    -4
      src/layer/mips/innerproduct_mips.cpp
  23. +17
    -11
      src/layer/mips/mips_usability.h

+ 28
- 2
docs/how-to-build/how-to-build.md View File

@@ -578,12 +578,38 @@ You can upload binary inside `build-c906/examples` folder and run on D1 board fo

### Build for Loongson 2K1000

For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd bug.
For gcc version < 8.5, you need to fix msa.h header for workaround msa fmadd/fmsub/maddv/msubv bug.

Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd_w``` and apply changes as the following
Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub``` and apply changes as the following
```c
// #define __msa_fmadd_w __builtin_msa_fmadd_w
// #define __msa_fmadd_d __builtin_msa_fmadd_d
// #define __msa_fmsub_w __builtin_msa_fmsub_w
// #define __msa_fmsub_d __builtin_msa_fmsub_d
#define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a)
#define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a)
#define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a)
#define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a)
```

find ```__msa_maddv``` and ```__msa_msubv``` and apply changes as the following
```c
// #define __msa_maddv_b __builtin_msa_maddv_b
// #define __msa_maddv_h __builtin_msa_maddv_h
// #define __msa_maddv_w __builtin_msa_maddv_w
// #define __msa_maddv_d __builtin_msa_maddv_d
// #define __msa_msubv_b __builtin_msa_msubv_b
// #define __msa_msubv_h __builtin_msa_msubv_h
// #define __msa_msubv_w __builtin_msa_msubv_w
// #define __msa_msubv_d __builtin_msa_msubv_d
#define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a)
#define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a)
#define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a)
#define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a)
#define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a)
#define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a)
#define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a)
#define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a)
```

Build ncnn with mips msa and simpleocv enabled:


+ 1
- 1
src/layer/mips/convolution1d_mips.cpp View File

@@ -253,7 +253,7 @@ int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
}
}

sum += __msa_fhadd_w(_sum);
sum += __msa_reduce_fadd_w(_sum);

sum = activation_ss(sum, activation_type, activation_params);



+ 83
- 0
src/layer/mips/convolution_1x1_int8.h View File

@@ -0,0 +1,83 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
outptr[0] = r0[0];
outptr[1] = r0[2];
outptr[2] = r0[4];
outptr[3] = r0[6];

r0 += 8;
outptr += 4;
}
for (; j + 1 < outw; j += 2)
{
outptr[0] = r0[0];
outptr[1] = r0[2];

r0 += 4;
outptr += 2;
}
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 83
- 0
src/layer/mips/convolution_1x1_pack1to4_int8.h View File

@@ -0,0 +1,83 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
outptr[0] = r0[0];
outptr[1] = r0[2];
outptr[2] = r0[4];
outptr[3] = r0[6];

r0 += 8;
outptr += 4;
}
for (; j + 1 < outw; j += 2)
{
outptr[0] = r0[0];
outptr[1] = r0[2];

r0 += 4;
outptr += 2;
}
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 65
- 0
src/layer/mips/convolution_1x1_pack8to1_int8.h View File

@@ -0,0 +1,65 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const int64_t* r0 = bottom_blob.channel(p);
int64_t* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 65
- 0
src/layer/mips/convolution_1x1_pack8to4_int8.h View File

@@ -0,0 +1,65 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const int64_t* r0 = bottom_blob.channel(p);
int64_t* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 731
- 0
src/layer/mips/convolution_3x3_pack8to1_int8.h View File

@@ -0,0 +1,731 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt)
{
// winograd42 transform kernel
Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

const short ktm[6][3] = {
{6, 0, 0},
{-4, -4, -4},
{-4, 4, -4},
{1, 2, 4},
{1, -2, 4},
{0, 0, 6}
};

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

// transform kernel
const signed char* k0 = kernel0;
const signed char* k1 = kernel0 + 3;
const signed char* k2 = kernel0 + 6;

// h
short tmp[6][3];
for (int i = 0; i < 6; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}

// U
for (int j = 0; j < 6; j++)
{
short* tmpp = &tmp[j][0];

for (int i = 0; i < 6; i++)
{
kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}

// interleave
// src = 36-inch-outch
// dst = 4b-8a-inch/8a-36-outch/4b
kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);

int p = 0;
for (; p + 3 < outch; p += 4)
{
const Mat k0 = kernel_tm.channel(p);
const Mat k1 = kernel_tm.channel(p + 1);
const Mat k2 = kernel_tm.channel(p + 2);
const Mat k3 = kernel_tm.channel(p + 3);

Mat g0 = kernel_tm_pack8to1.channel(p / 4);

for (int k = 0; k < 36; k++)
{
short* g00 = g0.row<short>(k);

for (int q = 0; q + 7 < inch; q += 8)
{
for (int i = 0; i < 8; i++)
{
g00[0] = k0.row<const short>(q + i)[k];
g00[1] = k1.row<const short>(q + i)[k];
g00[2] = k2.row<const short>(q + i)[k];
g00[3] = k3.row<const short>(q + i)[k];

g00 += 4;
}
}
}
}
for (; p < outch; p++)
{
const Mat k0 = kernel_tm.channel(p);

Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4);

for (int k = 0; k < 36; k++)
{
short* g00 = g0.row<short>(k);

for (int q = 0; q + 7 < inch; q += 8)
{
for (int i = 0; i < 8; i++)
{
g00[0] = k0.row<const short>(q + i)[k];

g00 += 1;
}
}
}
}
}

static void conv3x3s1_winograd42_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
// size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

// pad to 4n+2
Mat bottom_blob_bordered = bottom_blob;

outw = (outw + 3) / 4 * 4;
outh = (outh + 3) / 4 * 4;

w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tm = outw / 4 * 6;
int h_tm = outh / 4 * 6;

const int tiles = w_tm / 6 * h_tm / 6;

bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);

// const float itm[4][4] = {
// {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
// {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
// {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
// {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
// {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
// {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f}
// };

// 0 = 4 * r00 - 5 * r02 + r04
// 1 = -4 * (r01 + r02) + r04 + r03
// 2 = 4 * (r01 - r02) + r04 - r03
// 3 = -2 * (r01 - r03) + r04 - r02
// 4 = 2 * (r01 - r03) + r04 - r02
// 5 = 4 * r01 - 5 * r03 + r05

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < inch; q++)
{
const Mat img0 = bottom_blob_bordered.channel(q);
Mat img0_tm = bottom_blob_tm.channel(q);

short tmp[6][6][8];

// tile
for (int i = 0; i < h_tm / 6; i++)
{
for (int j = 0; j < w_tm / 6; j++)
{
const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;

for (int m = 0; m < 6; m++)
{
v16i8 _r00_01 = __msa_ld_b(r0, 0);
v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0);
v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0);
v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0);
v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0);
v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0);
v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01);
v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01);
v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03);
v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03);
v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05);
v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05);

v8i16 _v5 = __msa_fill_h(5);

v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5));
v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2));
v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2));
v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5));

__msa_st_h(_tmp0m, tmp[0][m], 0);
__msa_st_h(_tmp1m, tmp[1][m], 0);
__msa_st_h(_tmp2m, tmp[2][m], 0);
__msa_st_h(_tmp3m, tmp[3][m], 0);
__msa_st_h(_tmp4m, tmp[4][m], 0);
__msa_st_h(_tmp5m, tmp[5][m], 0);

r0 += w * 8;
}

short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8;
short* r0_tm_1 = r0_tm_0 + tiles * 8;
short* r0_tm_2 = r0_tm_0 + tiles * 16;
short* r0_tm_3 = r0_tm_0 + tiles * 24;
short* r0_tm_4 = r0_tm_0 + tiles * 32;
short* r0_tm_5 = r0_tm_0 + tiles * 40;

for (int m = 0; m < 6; m++)
{
v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0);
v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0);
v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0);
v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0);
v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0);
v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0);

v8i16 _v5 = __msa_fill_h(5);

v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5));
v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2));
v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2));
v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5));

__msa_st_h(_r0tm0, r0_tm_0, 0);
__msa_st_h(_r0tm1, r0_tm_1, 0);
__msa_st_h(_r0tm2, r0_tm_2, 0);
__msa_st_h(_r0tm3, r0_tm_3, 0);
__msa_st_h(_r0tm4, r0_tm_4, 0);
__msa_st_h(_r0tm5, r0_tm_5, 0);

r0_tm_0 += tiles * 48;
r0_tm_1 += tiles * 48;
r0_tm_2 += tiles * 48;
r0_tm_3 += tiles * 48;
r0_tm_4 += tiles * 48;
r0_tm_5 += tiles * 48;
}
}
}
}
}
bottom_blob_bordered = Mat();
// END transform input

// BEGIN dot
Mat top_blob_tm;
{
int w_tm = outw / 4 * 6;
int h_tm = outh / 4 * 6;

const int tiles = h_tm / 6 * w_tm / 6;

// permute
// bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
Mat bottom_blob_tm2;
if (tiles >= 2)
bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator);
else // if (tiles >= 1)
bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int r = 0; r < 36; r++)
{
Mat tm2 = bottom_blob_tm2.channel(r);

// tile
int i = 0;
for (; i + 1 < tiles; i += 2)
{
short* tmpptr = tm2.row<short>(i / 2);

const short* r0 = bottom_blob_tm;

r0 += (r * tiles + i) * 8;

for (int q = 0; q < inch; q++)
{
v8i16 _r0 = __msa_ld_h(r0, 0);
v8i16 _r1 = __msa_ld_h(r0 + 8, 0);
__msa_st_h(_r0, tmpptr, 0);
__msa_st_h(_r1, tmpptr + 8, 0);
r0 += bottom_blob_tm.cstep * 8;
tmpptr += 16;
}
}
for (; i < tiles; i++)
{
short* tmpptr = tm2.row<short>(i / 2 + i % 2);

const short* r0 = bottom_blob_tm;

r0 += (r * tiles + i) * 8;

for (int q = 0; q < inch; q++)
{
v8i16 _r0 = __msa_ld_h(r0, 0);
__msa_st_h(_r0, tmpptr, 0);
r0 += bottom_blob_tm.cstep * 8;
tmpptr += 8;
}
}
}

bottom_blob_tm = Mat();
// permute end

top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator);

int nn_outch = 0;
int remain_outch_start = 0;

nn_outch = outch >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_outch; pp++)
{
int p = pp * 4;

int* output0_tm = top_blob_tm.channel(p);
int* output1_tm = top_blob_tm.channel(p + 1);
int* output2_tm = top_blob_tm.channel(p + 2);
int* output3_tm = top_blob_tm.channel(p + 3);

const Mat kernel0_tm = kernel_tm.channel(p / 4);

for (int r = 0; r < 36; r++)
{
const Mat bb2 = bottom_blob_tm2.channel(r);

int i = 0;
for (; i + 1 < tiles; i += 2)
{
const short* r0 = bb2.row<const short>(i / 2);
const short* k0 = kernel0_tm.row<const short>(r);

int nn = inch; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

for (int j = 0; j < nn; j++)
{
v8i16 _w0 = __msa_ld_h(k0, 0);
v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

v4i32 _val0_0 = __msa_fill_w(r0[0]);
v4i32 _val0_1 = __msa_fill_w(r0[1]);
v4i32 _val0_2 = __msa_fill_w(r0[2]);
v4i32 _val0_3 = __msa_fill_w(r0[3]);
v4i32 _val0_4 = __msa_fill_w(r0[4]);
v4i32 _val0_5 = __msa_fill_w(r0[5]);
v4i32 _val0_6 = __msa_fill_w(r0[6]);
v4i32 _val0_7 = __msa_fill_w(r0[7]);
v4i32 _val1_0 = __msa_fill_w(r0[8]);
v4i32 _val1_1 = __msa_fill_w(r0[9]);
v4i32 _val1_2 = __msa_fill_w(r0[10]);
v4i32 _val1_3 = __msa_fill_w(r0[11]);
v4i32 _val1_4 = __msa_fill_w(r0[12]);
v4i32 _val1_5 = __msa_fill_w(r0[13]);
v4i32 _val1_6 = __msa_fill_w(r0[14]);
v4i32 _val1_7 = __msa_fill_w(r0[15]);

_sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0);
_sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1);
_sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0);
_sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1);
_sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2);
_sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3);
_sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2);
_sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3);
_sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4);
_sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5);
_sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4);
_sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5);
_sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6);
_sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7);
_sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6);
_sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7);

r0 += 16;
k0 += 32;
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);

int sum[8];
__msa_st_w(_sum0, sum, 0);
__msa_st_w(_sum2, sum + 4, 0);

output0_tm[0] = sum[0];
output1_tm[0] = sum[1];
output2_tm[0] = sum[2];
output3_tm[0] = sum[3];
output0_tm[1] = sum[4];
output1_tm[1] = sum[5];
output2_tm[1] = sum[6];
output3_tm[1] = sum[7];
output0_tm += 2;
output1_tm += 2;
output2_tm += 2;
output3_tm += 2;
}
for (; i < tiles; i++)
{
const short* r0 = bb2.row<const short>(i / 2 + i % 2);
const short* k0 = kernel0_tm.row<const short>(r);

int nn = inch; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);

for (int j = 0; j < nn; j++)
{
v8i16 _w0 = __msa_ld_h(k0, 0);
v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

v4i32 _val0 = __msa_fill_w(r0[0]);
v4i32 _val1 = __msa_fill_w(r0[1]);
v4i32 _val2 = __msa_fill_w(r0[2]);
v4i32 _val3 = __msa_fill_w(r0[3]);
v4i32 _val4 = __msa_fill_w(r0[4]);
v4i32 _val5 = __msa_fill_w(r0[5]);
v4i32 _val6 = __msa_fill_w(r0[6]);
v4i32 _val7 = __msa_fill_w(r0[7]);

_sum0 = __msa_maddv_w(_sum0, _w0l, _val0);
_sum1 = __msa_maddv_w(_sum1, _w0h, _val1);
_sum0 = __msa_maddv_w(_sum0, _w1l, _val2);
_sum1 = __msa_maddv_w(_sum1, _w1h, _val3);
_sum0 = __msa_maddv_w(_sum0, _w2l, _val4);
_sum1 = __msa_maddv_w(_sum1, _w2h, _val5);
_sum0 = __msa_maddv_w(_sum0, _w3l, _val6);
_sum1 = __msa_maddv_w(_sum1, _w3h, _val7);

r0 += 8;
k0 += 32;
}

_sum0 = __msa_addv_w(_sum0, _sum1);

int sum[4];
__msa_st_w(_sum0, sum, 0);

output0_tm[0] = sum[0];
output1_tm[0] = sum[1];
output2_tm[0] = sum[2];
output3_tm[0] = sum[3];
output0_tm += 1;
output1_tm += 1;
output2_tm += 1;
output3_tm += 1;
}
}
}

remain_outch_start += nn_outch << 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_outch_start; p < outch; p++)
{
int* output0_tm = top_blob_tm.channel(p);

const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);

for (int r = 0; r < 36; r++)
{
const Mat bb2 = bottom_blob_tm2.channel(r);

int i = 0;
for (; i + 1 < tiles; i += 2)
{
const short* r0 = bb2.row<const short>(i / 2);
const short* k0 = kernel0_tm.row<const short>(r);

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

for (int q = 0; q < inch; q++)
{
v8i16 _val0 = __msa_ld_h(r0, 0);
v8i16 _val1 = __msa_ld_h(r0 + 8, 0);

v8i16 _extval0 = __msa_clti_s_h(_val0, 0);
v8i16 _extval1 = __msa_clti_s_h(_val1, 0);
v4i32 _val0l = (v4i32)__msa_ilvr_h(_extval0, _val0);
v4i32 _val0h = (v4i32)__msa_ilvl_h(_extval0, _val0);
v4i32 _val1l = (v4i32)__msa_ilvr_h(_extval1, _val1);
v4i32 _val1h = (v4i32)__msa_ilvl_h(_extval1, _val1);

v8i16 _w0 = __msa_ld_h(k0, 0);

v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);

_sum0 = __msa_maddv_w(_sum0, _w0l, _val0l);
_sum1 = __msa_maddv_w(_sum1, _w0h, _val0h);
_sum2 = __msa_maddv_w(_sum2, _w0l, _val1l);
_sum3 = __msa_maddv_w(_sum3, _w0h, _val1h);

k0 += 8;
r0 += 16;
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);

output0_tm[0] = __msa_reduce_add_w(_sum0);
output0_tm[1] = __msa_reduce_add_w(_sum2);
output0_tm += 2;
}
for (; i < tiles; i++)
{
const short* r0 = bb2.row<const short>(i / 2 + i % 2);
const short* k0 = kernel0_tm.row<const short>(r);

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);

for (int q = 0; q < inch; q++)
{
v8i16 _val = __msa_ld_h(r0, 0);

v8i16 _extval = __msa_clti_s_h(_val, 0);
v4i32 _vall = (v4i32)__msa_ilvr_h(_extval, _val);
v4i32 _valh = (v4i32)__msa_ilvl_h(_extval, _val);

v8i16 _w0 = __msa_ld_h(k0, 0);

v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);

_sum0 = __msa_maddv_w(_sum0, _w0l, _vall);
_sum1 = __msa_maddv_w(_sum1, _w0h, _valh);

k0 += 8;
r0 += 8;
}

_sum0 = __msa_addv_w(_sum0, _sum1);

output0_tm[0] = __msa_reduce_add_w(_sum0);
output0_tm++;
}
}
}
}
bottom_blob_tm = Mat();
// END dot

// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
}
{
// const float otm[4][6] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
// };

// 0 = r00 + (r01 + r02) + (r03 + r04)
// 1 = (r01 - r02) + (r03 - r04) * 2
// 2 = (r01 + r02) + (r03 + r04) * 4
// 3 = r05 + (r01 - r02) + (r03 - r04) * 8

int w_tm = outw / 4 * 6;
int h_tm = outh / 4 * 6;
const int tiles = w_tm / 6 * h_tm / 6;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
const Mat out0_tm = top_blob_tm.channel(p);
Mat out0 = top_blob_bordered.channel(p);

int tmp[4][6];

// tile
for (int i = 0; i < outh / 4; i++)
{
for (int j = 0; j < outw / 4; j++)
{
// top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator);

const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 1;
const int* output0_tm_1 = output0_tm_0 + tiles * 1;
const int* output0_tm_2 = output0_tm_0 + tiles * 2;
const int* output0_tm_3 = output0_tm_0 + tiles * 3;
const int* output0_tm_4 = output0_tm_0 + tiles * 4;
const int* output0_tm_5 = output0_tm_0 + tiles * 5;

int* output0 = out0.row<int>(i * 4) + j * 4;

// 0 = r00 + (r01 + r02) + (r03 + r04)
// 1 = (r01 - r02) + (r03 - r04) * 2
// 2 = (r01 + r02) + (r03 + r04) * 4
// 3 = r05 + (r01 - r02) + (r03 - r04) * 8

// TODO msa optimize
for (int m = 0; m < 5; m++)
{
int tmp02a = output0_tm_1[0] + output0_tm_2[0];
int tmp13a = output0_tm_1[0] - output0_tm_2[0];

int tmp02b = output0_tm_3[0] + output0_tm_4[0];
int tmp13b = output0_tm_3[0] - output0_tm_4[0];

tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b;
tmp[1][m] = tmp13a + tmp13b * 2;
tmp[2][m] = tmp02a + tmp02b * 4;
tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8;

output0_tm_0 += tiles * 6;
output0_tm_1 += tiles * 6;
output0_tm_2 += tiles * 6;
output0_tm_3 += tiles * 6;
output0_tm_4 += tiles * 6;
output0_tm_5 += tiles * 6;
}
for (int m = 5; m < 6; m++)
{
int tmp02a = output0_tm_1[0] + output0_tm_2[0];
int tmp13a = output0_tm_1[0] - output0_tm_2[0];

int tmp02b = output0_tm_3[0] + output0_tm_4[0];
int tmp13b = output0_tm_3[0] - output0_tm_4[0];

tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4;
tmp[1][m] = (tmp13a + tmp13b * 2) * 4;
tmp[2][m] = (tmp02a + tmp02b * 4) * 4;
tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4;

output0_tm_0 += tiles * 6;
output0_tm_1 += tiles * 6;
output0_tm_2 += tiles * 6;
output0_tm_3 += tiles * 6;
output0_tm_4 += tiles * 6;
output0_tm_5 += tiles * 6;
}

for (int m = 0; m < 4; m++)
{
const int* tmp0 = tmp[m];

int tmp02a = tmp0[1] + tmp0[2];
int tmp13a = tmp0[1] - tmp0[2];

int tmp02b = tmp0[3] + tmp0[4];
int tmp13b = tmp0[3] - tmp0[4];

output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576;
output0[1] = (tmp13a + tmp13b * 2) / 576;
output0[2] = (tmp02a + tmp02b * 4) / 576;
output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576;

output0 += outw;
}
}
}
}
}
// END transform output

// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

+ 629
- 0
src/layer/mips/convolution_3x3_pack8to4_int8.h View File

@@ -0,0 +1,629 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
{
// winograd42 transform kernel
Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

const short ktm[6][3] = {
{6, 0, 0},
{-4, -4, -4},
{-4, 4, -4},
{1, 2, 4},
{1, -2, 4},
{0, 0, 6}
};

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
for (int q = 0; q < inch; q++)
{
const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

// transform kernel
const signed char* k0 = kernel0;
const signed char* k1 = kernel0 + 3;
const signed char* k2 = kernel0 + 6;

// h
short tmp[6][3];
for (int i = 0; i < 6; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}

// U
for (int j = 0; j < 6; j++)
{
short* tmpp = &tmp[j][0];

for (int i = 0; i < 6; i++)
{
kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}

// interleave
// src = 36-inch-outch
// dst = 4b-8a-inch/8a-36-outch/4b
kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32);

int q = 0;
for (; q + 3 < outch; q += 4)
{
const Mat k0 = kernel_tm.channel(q);
const Mat k1 = kernel_tm.channel(q + 1);
const Mat k2 = kernel_tm.channel(q + 2);
const Mat k3 = kernel_tm.channel(q + 3);

Mat kernel_tm = kernel_tm_pack8.channel(q / 4);

for (int k = 0; k < 36; k++)
{
short* g00 = kernel_tm.row<short>(k);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int i = 0; i < 8; i++)
{
const short* k00 = k0.row<const short>(p + i);
const short* k10 = k1.row<const short>(p + i);
const short* k20 = k2.row<const short>(p + i);
const short* k30 = k3.row<const short>(p + i);

g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];

g00 += 4;
}
}
}
}
}

static void conv3x3s1_winograd42_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int inch = bottom_blob.c;
// size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

// pad to 4n+2
Mat bottom_blob_bordered = bottom_blob;

outw = (outw + 3) / 4 * 4;
outh = (outh + 3) / 4 * 4;

w = outw + 2;
h = outh + 2;
copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

// BEGIN transform input
Mat bottom_blob_tm;
{
int w_tm = outw / 4 * 6;
int h_tm = outh / 4 * 6;

const int tiles = w_tm / 6 * h_tm / 6;

bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);

// const float itm[4][4] = {
// {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
// {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
// {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
// {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
// {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
// {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f}
// };

// 0 = 4 * r00 - 5 * r02 + r04
// 1 = -4 * (r01 + r02) + r04 + r03
// 2 = 4 * (r01 - r02) + r04 - r03
// 3 = -2 * (r01 - r03) + r04 - r02
// 4 = 2 * (r01 - r03) + r04 - r02
// 5 = 4 * r01 - 5 * r03 + r05

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < inch; q++)
{
const Mat img0 = bottom_blob_bordered.channel(q);
Mat img0_tm = bottom_blob_tm.channel(q);

short tmp[6][6][8];

// tile
for (int i = 0; i < h_tm / 6; i++)
{
for (int j = 0; j < w_tm / 6; j++)
{
const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;

for (int m = 0; m < 6; m++)
{
v16i8 _r00_01 = __msa_ld_b(r0, 0);
v16i8 _r02_03 = __msa_ld_b(r0 + 16, 0);
v16i8 _r04_05 = __msa_ld_b(r0 + 32, 0);
v16i8 _extr0001 = __msa_clti_s_b(_r00_01, 0);
v16i8 _extr0203 = __msa_clti_s_b(_r02_03, 0);
v16i8 _extr0405 = __msa_clti_s_b(_r04_05, 0);
v8i16 _r00 = (v8i16)__msa_ilvr_b(_extr0001, _r00_01);
v8i16 _r01 = (v8i16)__msa_ilvl_b(_extr0001, _r00_01);
v8i16 _r02 = (v8i16)__msa_ilvr_b(_extr0203, _r02_03);
v8i16 _r03 = (v8i16)__msa_ilvl_b(_extr0203, _r02_03);
v8i16 _r04 = (v8i16)__msa_ilvr_b(_extr0405, _r04_05);
v8i16 _r05 = (v8i16)__msa_ilvl_b(_extr0405, _r04_05);

v8i16 _v5 = __msa_fill_h(5);

v8i16 _tmp0m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r00, 2), _r04), __msa_mulv_h(_r02, _v5));
v8i16 _tmp1m = __msa_subv_h(__msa_addv_h(_r04, _r03), __msa_slli_h(__msa_addv_h(_r01, _r02), 2));
v8i16 _tmp2m = __msa_addv_h(__msa_subv_h(_r04, _r03), __msa_slli_h(__msa_subv_h(_r01, _r02), 2));
v8i16 _tmp3m = __msa_subv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
v8i16 _tmp4m = __msa_addv_h(__msa_subv_h(_r04, _r02), __msa_slli_h(__msa_subv_h(_r01, _r03), 1));
v8i16 _tmp5m = __msa_subv_h(__msa_addv_h(__msa_slli_h(_r01, 2), _r05), __msa_mulv_h(_r03, _v5));

__msa_st_h(_tmp0m, tmp[0][m], 0);
__msa_st_h(_tmp1m, tmp[1][m], 0);
__msa_st_h(_tmp2m, tmp[2][m], 0);
__msa_st_h(_tmp3m, tmp[3][m], 0);
__msa_st_h(_tmp4m, tmp[4][m], 0);
__msa_st_h(_tmp5m, tmp[5][m], 0);

r0 += w * 8;
}

short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8;
short* r0_tm_1 = r0_tm_0 + tiles * 8;
short* r0_tm_2 = r0_tm_0 + tiles * 16;
short* r0_tm_3 = r0_tm_0 + tiles * 24;
short* r0_tm_4 = r0_tm_0 + tiles * 32;
short* r0_tm_5 = r0_tm_0 + tiles * 40;

for (int m = 0; m < 6; m++)
{
v8i16 _tmp00 = __msa_ld_h(tmp[m][0], 0);
v8i16 _tmp01 = __msa_ld_h(tmp[m][1], 0);
v8i16 _tmp02 = __msa_ld_h(tmp[m][2], 0);
v8i16 _tmp03 = __msa_ld_h(tmp[m][3], 0);
v8i16 _tmp04 = __msa_ld_h(tmp[m][4], 0);
v8i16 _tmp05 = __msa_ld_h(tmp[m][5], 0);

v8i16 _v5 = __msa_fill_h(5);

v8i16 _r0tm0 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp00, 2), _tmp04), __msa_mulv_h(_tmp02, _v5));
v8i16 _r0tm1 = __msa_subv_h(__msa_addv_h(_tmp04, _tmp03), __msa_slli_h(__msa_addv_h(_tmp01, _tmp02), 2));
v8i16 _r0tm2 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp03), __msa_slli_h(__msa_subv_h(_tmp01, _tmp02), 2));
v8i16 _r0tm3 = __msa_subv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
v8i16 _r0tm4 = __msa_addv_h(__msa_subv_h(_tmp04, _tmp02), __msa_slli_h(__msa_subv_h(_tmp01, _tmp03), 1));
v8i16 _r0tm5 = __msa_subv_h(__msa_addv_h(__msa_slli_h(_tmp01, 2), _tmp05), __msa_mulv_h(_tmp03, _v5));

__msa_st_h(_r0tm0, r0_tm_0, 0);
__msa_st_h(_r0tm1, r0_tm_1, 0);
__msa_st_h(_r0tm2, r0_tm_2, 0);
__msa_st_h(_r0tm3, r0_tm_3, 0);
__msa_st_h(_r0tm4, r0_tm_4, 0);
__msa_st_h(_r0tm5, r0_tm_5, 0);

r0_tm_0 += tiles * 48;
r0_tm_1 += tiles * 48;
r0_tm_2 += tiles * 48;
r0_tm_3 += tiles * 48;
r0_tm_4 += tiles * 48;
r0_tm_5 += tiles * 48;
}
}
}
}
}
bottom_blob_bordered = Mat();
// END transform input

// BEGIN dot
Mat top_blob_tm;
{
int w_tm = outw / 4 * 6;
int h_tm = outh / 4 * 6;

const int tiles = h_tm / 6 * w_tm / 6;

// permute
// bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
Mat bottom_blob_tm2;
if (tiles >= 2)
bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator);
else // if (tiles >= 1)
bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int r = 0; r < 36; r++)
{
Mat tm2 = bottom_blob_tm2.channel(r);

// tile
int i = 0;
for (; i + 1 < tiles; i += 2)
{
short* tmpptr = tm2.row<short>(i / 2);

const short* r0 = bottom_blob_tm;

r0 += (r * tiles + i) * 8;

for (int q = 0; q < inch; q++)
{
v8i16 _r0 = __msa_ld_h(r0, 0);
v8i16 _r1 = __msa_ld_h(r0 + 8, 0);
__msa_st_h(_r0, tmpptr, 0);
__msa_st_h(_r1, tmpptr + 8, 0);
r0 += bottom_blob_tm.cstep * 8;
tmpptr += 16;
}
}
for (; i < tiles; i++)
{
short* tmpptr = tm2.row<short>(i / 2 + i % 2);

const short* r0 = bottom_blob_tm;

r0 += (r * tiles + i) * 8;

for (int q = 0; q < inch; q++)
{
v8i16 _r0 = __msa_ld_h(r0, 0);
__msa_st_h(_r0, tmpptr, 0);
r0 += bottom_blob_tm.cstep * 8;
tmpptr += 8;
}
}
}

bottom_blob_tm = Mat();
// permute end

top_blob_tm.create(tiles, 36, outch, 4u * 4, 4, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* output0_tm = top_blob_tm.channel(p);

const Mat kernel0_tm = kernel_tm.channel(p);

for (int r = 0; r < 36; r++)
{
const Mat bb2 = bottom_blob_tm2.channel(r);

int i = 0;
for (; i + 1 < tiles; i += 2)
{
const short* r0 = bb2.row<const short>(i / 2);
const short* k0 = kernel0_tm.row<const short>(r);

int nn = inch; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

for (int j = 0; j < nn; j++)
{
v8i16 _w0 = __msa_ld_h(k0, 0);
v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

v4i32 _val0_0 = __msa_fill_w(r0[0]);
v4i32 _val0_1 = __msa_fill_w(r0[1]);
v4i32 _val0_2 = __msa_fill_w(r0[2]);
v4i32 _val0_3 = __msa_fill_w(r0[3]);
v4i32 _val0_4 = __msa_fill_w(r0[4]);
v4i32 _val0_5 = __msa_fill_w(r0[5]);
v4i32 _val0_6 = __msa_fill_w(r0[6]);
v4i32 _val0_7 = __msa_fill_w(r0[7]);
v4i32 _val1_0 = __msa_fill_w(r0[8]);
v4i32 _val1_1 = __msa_fill_w(r0[9]);
v4i32 _val1_2 = __msa_fill_w(r0[10]);
v4i32 _val1_3 = __msa_fill_w(r0[11]);
v4i32 _val1_4 = __msa_fill_w(r0[12]);
v4i32 _val1_5 = __msa_fill_w(r0[13]);
v4i32 _val1_6 = __msa_fill_w(r0[14]);
v4i32 _val1_7 = __msa_fill_w(r0[15]);

_sum0 = __msa_maddv_w(_sum0, _w0l, _val0_0);
_sum1 = __msa_maddv_w(_sum1, _w0h, _val0_1);
_sum2 = __msa_maddv_w(_sum2, _w0l, _val1_0);
_sum3 = __msa_maddv_w(_sum3, _w0h, _val1_1);
_sum0 = __msa_maddv_w(_sum0, _w1l, _val0_2);
_sum1 = __msa_maddv_w(_sum1, _w1h, _val0_3);
_sum2 = __msa_maddv_w(_sum2, _w1l, _val1_2);
_sum3 = __msa_maddv_w(_sum3, _w1h, _val1_3);
_sum0 = __msa_maddv_w(_sum0, _w2l, _val0_4);
_sum1 = __msa_maddv_w(_sum1, _w2h, _val0_5);
_sum2 = __msa_maddv_w(_sum2, _w2l, _val1_4);
_sum3 = __msa_maddv_w(_sum3, _w2h, _val1_5);
_sum0 = __msa_maddv_w(_sum0, _w3l, _val0_6);
_sum1 = __msa_maddv_w(_sum1, _w3h, _val0_7);
_sum2 = __msa_maddv_w(_sum2, _w3l, _val1_6);
_sum3 = __msa_maddv_w(_sum3, _w3h, _val1_7);

r0 += 16;
k0 += 32;
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);

__msa_st_w(_sum0, output0_tm, 0);
__msa_st_w(_sum2, output0_tm + 4, 0);

output0_tm += 8;
}
for (; i < tiles; i++)
{
const short* r0 = bb2.row<const short>(i / 2 + i % 2);
const short* k0 = kernel0_tm.row<const short>(r);

int nn = inch; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);

for (int j = 0; j < nn; j++)
{
v8i16 _w0 = __msa_ld_h(k0, 0);
v8i16 _w1 = __msa_ld_h(k0 + 8, 0);
v8i16 _w2 = __msa_ld_h(k0 + 16, 0);
v8i16 _w3 = __msa_ld_h(k0 + 24, 0);

v8i16 _extw0 = __msa_clti_s_h(_w0, 0);
v8i16 _extw1 = __msa_clti_s_h(_w1, 0);
v8i16 _extw2 = __msa_clti_s_h(_w2, 0);
v8i16 _extw3 = __msa_clti_s_h(_w3, 0);

v4i32 _w0l = (v4i32)__msa_ilvr_h(_extw0, _w0);
v4i32 _w0h = (v4i32)__msa_ilvl_h(_extw0, _w0);
v4i32 _w1l = (v4i32)__msa_ilvr_h(_extw1, _w1);
v4i32 _w1h = (v4i32)__msa_ilvl_h(_extw1, _w1);
v4i32 _w2l = (v4i32)__msa_ilvr_h(_extw2, _w2);
v4i32 _w2h = (v4i32)__msa_ilvl_h(_extw2, _w2);
v4i32 _w3l = (v4i32)__msa_ilvr_h(_extw3, _w3);
v4i32 _w3h = (v4i32)__msa_ilvl_h(_extw3, _w3);

v4i32 _val0 = __msa_fill_w(r0[0]);
v4i32 _val1 = __msa_fill_w(r0[1]);
v4i32 _val2 = __msa_fill_w(r0[2]);
v4i32 _val3 = __msa_fill_w(r0[3]);
v4i32 _val4 = __msa_fill_w(r0[4]);
v4i32 _val5 = __msa_fill_w(r0[5]);
v4i32 _val6 = __msa_fill_w(r0[6]);
v4i32 _val7 = __msa_fill_w(r0[7]);

_sum0 = __msa_maddv_w(_sum0, _w0l, _val0);
_sum1 = __msa_maddv_w(_sum1, _w0h, _val1);
_sum0 = __msa_maddv_w(_sum0, _w1l, _val2);
_sum1 = __msa_maddv_w(_sum1, _w1h, _val3);
_sum0 = __msa_maddv_w(_sum0, _w2l, _val4);
_sum1 = __msa_maddv_w(_sum1, _w2h, _val5);
_sum0 = __msa_maddv_w(_sum0, _w3l, _val6);
_sum1 = __msa_maddv_w(_sum1, _w3h, _val7);

r0 += 8;
k0 += 32;
}

_sum0 = __msa_addv_w(_sum0, _sum1);

__msa_st_w(_sum0, output0_tm, 0);
output0_tm += 4;
}
}
}
}
bottom_blob_tm = Mat();
// END dot

// BEGIN transform output
Mat top_blob_bordered;
if (outw == top_blob.w && outh == top_blob.h)
{
top_blob_bordered = top_blob;
}
else
{
top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator);
}
{
// const float otm[4][6] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
// };

// 0 = r00 + (r01 + r02) + (r03 + r04)
// 1 = (r01 - r02) + (r03 - r04) * 2
// 2 = (r01 + r02) + (r03 + r04) * 4
// 3 = r05 + (r01 - r02) + (r03 - r04) * 8

int w_tm = outw / 4 * 6;
int h_tm = outh / 4 * 6;
const int tiles = w_tm / 6 * h_tm / 6;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
const Mat out0_tm = top_blob_tm.channel(p);
Mat out0 = top_blob_bordered.channel(p);

int tmp[4][6][4];

// tile
for (int i = 0; i < outh / 4; i++)
{
for (int j = 0; j < outw / 4; j++)
{
// top_blob_tm.create(tiles, 36, outch, elemsize, elempack);

const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 4;
const int* output0_tm_1 = output0_tm_0 + tiles * 4;
const int* output0_tm_2 = output0_tm_0 + tiles * 8;
const int* output0_tm_3 = output0_tm_0 + tiles * 12;
const int* output0_tm_4 = output0_tm_0 + tiles * 16;
const int* output0_tm_5 = output0_tm_0 + tiles * 20;

int* output0 = out0.row<int>(i * 4) + (j * 4) * 4;

for (int m = 0; m < 5; m++)
{
v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0);
v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0);
v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0);
v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0);
v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0);
v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0);

v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2);
v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2);

v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4);
v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4);

v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b);
v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3));

__msa_st_w(_tmp0m, tmp[0][m], 0);
__msa_st_w(_tmp1m, tmp[1][m], 0);
__msa_st_w(_tmp2m, tmp[2][m], 0);
__msa_st_w(_tmp3m, tmp[3][m], 0);

output0_tm_0 += tiles * 24;
output0_tm_1 += tiles * 24;
output0_tm_2 += tiles * 24;
output0_tm_3 += tiles * 24;
output0_tm_4 += tiles * 24;
output0_tm_5 += tiles * 24;
}
for (int m = 5; m < 6; m++)
{
v4i32 _out0tm0 = __msa_ld_w(output0_tm_0, 0);
v4i32 _out0tm1 = __msa_ld_w(output0_tm_1, 0);
v4i32 _out0tm2 = __msa_ld_w(output0_tm_2, 0);
v4i32 _out0tm3 = __msa_ld_w(output0_tm_3, 0);
v4i32 _out0tm4 = __msa_ld_w(output0_tm_4, 0);
v4i32 _out0tm5 = __msa_ld_w(output0_tm_5, 0);

v4i32 _tmp02a = __msa_addv_w(_out0tm1, _out0tm2);
v4i32 _tmp13a = __msa_subv_w(_out0tm1, _out0tm2);

v4i32 _tmp02b = __msa_addv_w(_out0tm3, _out0tm4);
v4i32 _tmp13b = __msa_subv_w(_out0tm3, _out0tm4);

v4i32 _tmp0m = __msa_addv_w(__msa_addv_w(_out0tm0, _tmp02a), _tmp02b);
v4i32 _tmp1m = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
v4i32 _tmp2m = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
v4i32 _tmp3m = __msa_addv_w(__msa_addv_w(_tmp13a, __msa_slli_w(_out0tm5, 2)), __msa_slli_w(_tmp13b, 3));

_tmp0m = __msa_slli_w(_tmp0m, 2);
_tmp1m = __msa_slli_w(_tmp1m, 2);
_tmp2m = __msa_slli_w(_tmp2m, 2);
_tmp3m = __msa_slli_w(_tmp3m, 2);

__msa_st_w(_tmp0m, tmp[0][m], 0);
__msa_st_w(_tmp1m, tmp[1][m], 0);
__msa_st_w(_tmp2m, tmp[2][m], 0);
__msa_st_w(_tmp3m, tmp[3][m], 0);

output0_tm_0 += tiles * 24;
output0_tm_1 += tiles * 24;
output0_tm_2 += tiles * 24;
output0_tm_3 += tiles * 24;
output0_tm_4 += tiles * 24;
output0_tm_5 += tiles * 24;
}

for (int m = 0; m < 4; m++)
{
v4i32 _tmp00 = __msa_ld_w(tmp[m][0], 0);
v4i32 _tmp01 = __msa_ld_w(tmp[m][1], 0);
v4i32 _tmp02 = __msa_ld_w(tmp[m][2], 0);
v4i32 _tmp03 = __msa_ld_w(tmp[m][3], 0);
v4i32 _tmp04 = __msa_ld_w(tmp[m][4], 0);
v4i32 _tmp05 = __msa_ld_w(tmp[m][5], 0);

v4i32 _tmp02a = __msa_addv_w(_tmp01, _tmp02);
v4i32 _tmp13a = __msa_subv_w(_tmp01, _tmp02);

v4i32 _tmp02b = __msa_addv_w(_tmp03, _tmp04);
v4i32 _tmp13b = __msa_subv_w(_tmp03, _tmp04);

v4i32 _out00 = __msa_addv_w(__msa_addv_w(_tmp00, _tmp02a), _tmp02b);
v4i32 _out01 = __msa_addv_w(_tmp13a, __msa_slli_w(_tmp13b, 1));
v4i32 _out02 = __msa_addv_w(_tmp02a, __msa_slli_w(_tmp02b, 2));
v4i32 _out03 = __msa_addv_w(__msa_addv_w(_tmp05, _tmp13a), __msa_slli_w(_tmp13b, 3));

// TODO use integer trick for division by 576
v4f32 _v576 = __msa_fill_w_f32(1.0 / 576);
_out00 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out00), _v576));
_out01 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out01), _v576));
_out02 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out02), _v576));
_out03 = __msa_ftint_s_w(__msa_fmul_w(__msa_ffint_s_w(_out03), _v576));

__msa_st_w(_out00, output0, 0);
__msa_st_w(_out01, output0 + 4, 0);
__msa_st_w(_out02, output0 + 8, 0);
__msa_st_w(_out03, output0 + 12, 0);

output0 += outw * 4;
}
}
}
}
}
// END transform output

// cut result pad
copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

+ 82
- 0
src/layer/mips/convolution_int8.h View File

@@ -0,0 +1,82 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int sum = 0;

// const signed char* kptr = weight_data_int8.channel(p);
const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

for (int k = 0; k < maxk; k++)
{
signed char val = sptr[space_ofs[k]];
signed char w = kptr[k];
sum += val * w;
}

kptr += maxk;
}

outptr[j] = sum;
}

outptr += outw;
}
}
}

+ 436
- 25
src/layer/mips/convolution_mips.cpp View File

@@ -32,6 +32,12 @@ namespace ncnn {
#include "convolution_sgemm.h"
#include "convolution_1x1.h"

#if NCNN_INT8
#include "convolution_sgemm_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_int8.h"
#endif // NCNN_INT8

#if __mips_msa
#include "convolution_pack4.h"
#include "convolution_pack1to4.h"
@@ -44,6 +50,20 @@ namespace ncnn {
#include "convolution_3x3_pack4.h"
#include "convolution_3x3_pack1to4.h"
#include "convolution_7x7_pack1to4.h"

#if NCNN_INT8
#include "convolution_pack8to4_int8.h"
#include "convolution_pack1to4_int8.h"
#include "convolution_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_1x1_pack8to4_int8.h"
#include "convolution_1x1_pack1to4_int8.h"
#include "convolution_1x1_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#endif // NCNN_INT8
#endif // __mips_msa

Convolution_mips::Convolution_mips()
@@ -98,6 +118,13 @@ int Convolution_mips::create_pipeline(const Option& opt)

activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
return create_pipeline_int8_mips(opt);
}
#endif

const int maxk = kernel_w * kernel_h;
const int num_input = weight_data_size / maxk / num_output;

@@ -117,8 +144,8 @@ int Convolution_mips::create_pipeline(const Option& opt)
{
if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
{
conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_data_packed, num_input, num_output, opt);
conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data_packed, num_input, num_output, opt);
conv3x3s1_winograd64_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd64_data, num_input, num_output, opt);
conv3x3s1_winograd42_transform_kernel_pack4_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
}
else
{
@@ -187,27 +214,7 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
Mat bottom_blob_unpacked = bottom_blob;
if (bottom_blob.elempack != 1)
{
Option opt_pack1 = opt;
opt_pack1.blob_allocator = opt.workspace_allocator;

convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
}

Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
if (bottom_blob_unpacked.elembits() == 16)
{
Option opt_pack1 = opt;
opt_pack1.blob_allocator = opt.workspace_allocator;

cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
}

Option opt_unpacked = opt;
opt_unpacked.use_packing_layout = false;
return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
return forward_int8_mips(bottom_blob, top_blob, opt);
}
#endif

@@ -278,11 +285,11 @@ int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
// we need more proper conditions
if ((w <= 10 || (w >= 15 && w <= 18) || w == 21 || w == 22) && (h <= 10 || (h >= 15 && h <= 18) || h == 21 || h == 22))
{
conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data_packed, bias_data, opt);
conv3x3s1_winograd42_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd42_data, bias_data, opt);
}
else
{
conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
conv3x3s1_winograd64_pack4_msa(bottom_blob_bordered, top_blob, weight_3x3_winograd64_data, bias_data, opt);
}

if (activation)
@@ -542,4 +549,408 @@ int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<
return 0;
}

#if NCNN_INT8
static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_int8, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
const int maxk = kernel_w * kernel_h;

// src = kw-kh-inch-outch
// dst = pa-pb-kw-kh-inch/pa-outch/pb
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_int8.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
signed char* g00 = weight_data_int8.channel(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < out_elempack; i++)
{
for (int j = 0; j < elempack; j++)
{
const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
}
}

int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
const int num_input = weight_data_size / maxk / num_output;

int elempack = 1;
int out_elempack = 1;
#if __mips_msa
if (opt.use_packing_layout)
{
elempack = num_input % 8 == 0 ? 8 : 1;
out_elempack = num_output % 4 == 0 ? 4 : 1;
}
#endif // __mips_msa

#if __mips_msa
if (elempack == 8 && out_elempack == 4)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd42_transform_kernel_pack8to4_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}

if (elempack == 1 && out_elempack == 4)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}

if (elempack == 8 && out_elempack == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd42_transform_kernel_pack8to1_int8_msa(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
}
#endif // __mips_msa

if (elempack == 1 && out_elempack == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
}

return 0;
}

int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
int elembits = bottom_blob.elembits();

Mat bottom_blob_int8 = bottom_blob;
if (elembits != 8)
{
Option opt_q = opt;
opt_q.blob_allocator = opt.workspace_allocator;
quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
}

Mat bottom_blob_bordered;
make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
if (bottom_blob_bordered.empty())
return -100;

int w = bottom_blob_bordered.w;
int h = bottom_blob_bordered.h;
int channels = bottom_blob_bordered.c;
int elempack = bottom_blob_bordered.elempack;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;

bool use_int8_requantize = int8_scale_term > 100;
int out_elempack = 1;
#if __mips_msa
if (opt.use_packing_layout)
{
if (use_int8_requantize)
out_elempack = num_output % 8 == 0 ? 8 : 1;
else
out_elempack = num_output % 4 == 0 ? 4 : 1;
}
#endif // __mips_msa
size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
if (top_blob.empty())
return -100;

const int num_input = channels * elempack;

int out_elempack_int32 = 1;
#if __mips_msa
if (opt.use_packing_layout)
{
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
}
#endif // __mips_msa

Mat top_blob_int32;
top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
if (top_blob_int32.empty())
return -100;

#if __mips_msa
if (elempack == 8 && out_elempack_int32 == 4)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd42_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

Mat scale_in_data(num_output);
for (int p = 0; p < num_output; p++)
{
// requantize and relu
float scale_in;
if (weight_data_int8_scales[p] == 0)
scale_in = 0;
else
scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

scale_in_data[p] = scale_in;
}

if (use_int8_requantize)
{
requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
}
else
{
dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
}

if (elempack == 1 && out_elempack_int32 == 4)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

Mat scale_in_data(num_output);
for (int p = 0; p < num_output; p++)
{
// requantize and relu
float scale_in;
if (weight_data_int8_scales[p] == 0)
scale_in = 0;
else
scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

scale_in_data[p] = scale_in;
}

if (use_int8_requantize)
{
requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
}
else
{
dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
}

if (elempack == 8 && out_elempack_int32 == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd42_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

Mat scale_in_data(num_output);
for (int p = 0; p < num_output; p++)
{
// requantize and relu
float scale_in;
if (weight_data_int8_scales[p] == 0)
scale_in = 0;
else
scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

scale_in_data[p] = scale_in;
}

if (use_int8_requantize)
{
requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
}
else
{
dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
}
#endif // __mips_msa

if (elempack == 1 && out_elempack_int32 == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
// convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

Mat scale_in_data(num_output);
for (int p = 0; p < num_output; p++)
{
// requantize and relu
float scale_in;
if (weight_data_int8_scales[p] == 0)
scale_in = 0;
else
scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

scale_in_data[p] = scale_in;
}

if (use_int8_requantize)
{
requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
}
else
{
dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

if (activation)
{
activation->forward_inplace(top_blob, opt);
}
}
}

return 0;
}
#endif // NCNN_INT8

} // namespace ncnn

+ 17
- 2
src/layer/mips/convolution_mips.h View File

@@ -31,12 +31,27 @@ public:

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_INT8
int create_pipeline_int8_mips(const Option& opt);
int forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
Layer* activation;

// packn
Mat weight_sgemm_data;

Mat weight_3x3_winograd42_data;
Mat weight_3x3_winograd64_data;

// pack4
Mat weight_data_packed;
Mat weight_3x3_winograd42_data_packed;

#if NCNN_INT8
// int8
Mat weight_data_int8;
#endif
};

} // namespace ncnn


+ 87
- 0
src/layer/mips/convolution_pack1to4_int8.h View File

@@ -0,0 +1,87 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
v4i32 _sum = __msa_fill_w(0);

const signed char* kptr = weight_data_int8.channel(p);

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

for (int k = 0; k < maxk; k++)
{
v8i16 _val = __msa_fill_h((short)sptr[space_ofs[k]]);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val, _w16);
v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

_sum = __msa_addv_w(_sum, _s032);

kptr += 4;
}
}

__msa_st_w(_sum, outptr + j * 4, 0);
}

outptr += outw * 4;
}
}
}

+ 1
- 1
src/layer/mips/convolution_pack4to1.h View File

@@ -81,7 +81,7 @@ static void convolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, cons
}
}

sum += __msa_fhadd_w(_sum);
sum += __msa_reduce_fadd_w(_sum);

sum = activation_ss(sum, activation_type, activation_params);



+ 87
- 0
src/layer/mips/convolution_pack8to1_int8.h View File

@@ -0,0 +1,87 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
v4i32 _sum = __msa_fill_w(0);

const signed char* kptr = weight_data_int8.channel(p);

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

for (int k = 0; k < maxk; k++)
{
v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val16, _w16);

_sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0));

kptr += 8;
}
}

outptr[j] = __msa_reduce_add_w(_sum);
}

outptr += outw;
}
}
}

+ 120
- 0
src/layer/mips/convolution_pack8to4_int8.h View File

@@ -0,0 +1,120 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

const signed char* kptr = weight_data_int8.channel(p);

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

for (int k = 0; k < maxk; k++)
{
v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

v8i16 _s0 = __msa_mulv_h(_val16, _w0);
v8i16 _s1 = __msa_mulv_h(_val16, _w1);
v8i16 _s2 = __msa_mulv_h(_val16, _w2);
v8i16 _s3 = __msa_mulv_h(_val16, _w3);

_sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
_sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
_sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
_sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

kptr += 32;
}
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum1, _sum0);
_tmp1 = __msa_ilvr_w(_sum3, _sum2);
_tmp2 = __msa_ilvl_w(_sum1, _sum0);
_tmp3 = __msa_ilvl_w(_sum3, _sum2);
_sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);

_sum0 = __msa_addv_w(_sum0, _sum2);

__msa_st_w(_sum0, outptr + j * 4, 0);
}

outptr += outw * 4;
}
}
}

+ 731
- 0
src/layer/mips/convolution_sgemm_int8.h View File

@@ -0,0 +1,731 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void im2col_sgemm_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
#if __mips_msa
if (inch >= 4)
{
if (size >= 2)
tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
else
tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
}
else
{
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
}
{
int remain_size_start = 0;
int nn_size = (size - remain_size_start) >> 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

signed char* tmpptr = tmp.channel(i / 2);

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr[4] = img0[1];
tmpptr[5] = img1[1];
tmpptr[6] = img2[1];
tmpptr[7] = img3[1];
tmpptr += 8;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img0[1];

tmpptr += 2;

img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
signed char* tmpptr = tmp.channel(i / 2 + i % 2);

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr += 4;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];

tmpptr += 1;

img0 += size;
}
}
}
}
#else // __mips_msa
tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < size; i++)
{
signed char* tmpptr = tmp.channel(i);

int q = 0;
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];

tmpptr += 1;

img0 += size;
}
}
}
}
#endif // __mips_msa

int nn_outch = 0;
int remain_outch_start = 0;

#if __mips_msa
nn_outch = outch >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_outch; pp++)
{
int p = pp * 4;

int* outptr0 = top_blob.channel(p);
int* outptr1 = top_blob.channel(p + 1);
int* outptr2 = top_blob.channel(p + 2);
int* outptr3 = top_blob.channel(p + 3);

int i = 0;
for (; i + 1 < size; i += 2)
{
const signed char* tmpptr = tmp.channel(i / 2);
const signed char* kptr = kernel.channel(p / 4);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

v4i32 _sum00 = __msa_fill_w(0);
v4i32 _sum10 = __msa_fill_w(0);

if (nn4 > 0)
{
v4i32 _sum01 = __msa_fill_w(0);
v4i32 _sum02 = __msa_fill_w(0);
v4i32 _sum03 = __msa_fill_w(0);
v4i32 _sum11 = __msa_fill_w(0);
v4i32 _sum12 = __msa_fill_w(0);
v4i32 _sum13 = __msa_fill_w(0);

int j = 0;
for (; j < nn4; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01);
v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

v8i16 _s00 = __msa_mulv_h(_val0, _w0);
v8i16 _s01 = __msa_mulv_h(_val0, _w1);
v8i16 _s10 = __msa_mulv_h(_val1, _w0);
v8i16 _s11 = __msa_mulv_h(_val1, _w1);

v8i16 _exts00 = __msa_clti_s_h(_s00, 0);
v8i16 _exts01 = __msa_clti_s_h(_s01, 0);
v8i16 _exts10 = __msa_clti_s_h(_s10, 0);
v8i16 _exts11 = __msa_clti_s_h(_s11, 0);
v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00);
v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00);
v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01);
v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01);
v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10);
v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10);
v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11);
v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11);

_sum00 = __msa_addv_w(_sum00, _s00l);
_sum01 = __msa_addv_w(_sum01, _s00h);
_sum02 = __msa_addv_w(_sum02, _s01l);
_sum03 = __msa_addv_w(_sum03, _s01h);
_sum10 = __msa_addv_w(_sum10, _s10l);
_sum11 = __msa_addv_w(_sum11, _s10h);
_sum12 = __msa_addv_w(_sum12, _s11l);
_sum13 = __msa_addv_w(_sum13, _s11h);

tmpptr += 8;
kptr += 16;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum01, _sum00);
_tmp1 = __msa_ilvr_w(_sum03, _sum02);
_tmp2 = __msa_ilvl_w(_sum01, _sum00);
_tmp3 = __msa_ilvl_w(_sum03, _sum02);
_sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum11, _sum10);
_tmp1 = __msa_ilvr_w(_sum13, _sum12);
_tmp2 = __msa_ilvl_w(_sum11, _sum10);
_tmp3 = __msa_ilvl_w(_sum13, _sum12);
_sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum00 = __msa_addv_w(_sum00, _sum01);
_sum02 = __msa_addv_w(_sum02, _sum03);
_sum10 = __msa_addv_w(_sum10, _sum11);
_sum12 = __msa_addv_w(_sum12, _sum13);

_sum00 = __msa_addv_w(_sum00, _sum02);
_sum10 = __msa_addv_w(_sum10, _sum12);
}

int j = 0;
for (; j < nn1; j++)
{
v8i16 _val0 = __msa_fill_h(tmpptr[0]);
v8i16 _val1 = __msa_fill_h(tmpptr[1]);
v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

_w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);

v8i16 _s0 = __msa_mulv_h(_val, _w16);
v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);

_sum00 = __msa_addv_w(_sum00, _s0l);
_sum10 = __msa_addv_w(_sum10, _s0h);

tmpptr += 2;
kptr += 4;
}

int sum[8];
__msa_st_w(_sum00, sum, 0);
__msa_st_w(_sum10, sum + 4, 0);

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0[1] = sum[4];
outptr1[1] = sum[5];
outptr2[1] = sum[6];
outptr3[1] = sum[7];
outptr0 += 2;
outptr1 += 2;
outptr2 += 2;
outptr3 += 2;
}
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
const signed char* kptr = kernel.channel(p / 4);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

v4i32 _sum0 = __msa_fill_w(0);

if (nn4 > 0)
{
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

int j = 0;
for (; j < nn4; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

_val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

v8i16 _s0 = __msa_mulv_h(_val16, _w0);
v8i16 _s1 = __msa_mulv_h(_val16, _w1);

v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
v8i16 _exts1 = __msa_clti_s_h(_s1, 0);
v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1);
v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1);

_sum0 = __msa_addv_w(_sum0, _s0l);
_sum1 = __msa_addv_w(_sum1, _s0h);
_sum2 = __msa_addv_w(_sum2, _s1l);
_sum3 = __msa_addv_w(_sum3, _s1h);

tmpptr += 4;
kptr += 16;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum1, _sum0);
_tmp1 = __msa_ilvr_w(_sum3, _sum2);
_tmp2 = __msa_ilvl_w(_sum1, _sum0);
_tmp3 = __msa_ilvl_w(_sum3, _sum2);
_sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);
_sum0 = __msa_addv_w(_sum0, _sum2);
}

int j = 0;
for (; j < nn1; j++)
{
v8i16 _val = __msa_fill_h(tmpptr[0]);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val, _w16);
v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

_sum0 = __msa_addv_w(_sum0, _s032);

tmpptr += 1;
kptr += 4;
}

int sum[4];
__msa_st_w(_sum0, sum, 0);

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0 += 1;
outptr1 += 1;
outptr2 += 1;
outptr3 += 1;
}
}

remain_outch_start += nn_outch << 2;
#endif // __mips_msa

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_outch_start; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
#if __mips_msa
for (; i + 1 < size; i += 2)
{
const signed char* tmpptr = tmp.channel(i / 2);
const signed char* kptr = kernel.channel(p / 4 + p % 4);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

int sum0 = 0;
int sum1 = 0;

if (nn4 > 0)
{
v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);

int j = 0;
for (; j < nn4; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

_w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);

v8i16 _s0 = __msa_mulv_h(_val16, _w16);
v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);

_sum0 = __msa_addv_w(_sum0, _s0l);
_sum1 = __msa_addv_w(_sum1, _s0h);

tmpptr += 8;
kptr += 4;
}

sum0 = _sum0[0] + _sum0[1] + _sum0[2] + _sum0[3];
sum1 = _sum1[0] + _sum1[1] + _sum1[2] + _sum1[3];
}

int j = 0;
for (; j < nn1; j++)
{
signed char val0 = tmpptr[0];
signed char val1 = tmpptr[1];
signed char w = kptr[0];

sum0 += val0 * w;
sum1 += val1 * w;

tmpptr += 2;
kptr += 1;
}

outptr0[0] = sum0;
outptr0[1] = sum1;
outptr0 += 2;
}
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
const signed char* kptr = kernel.channel(p / 4 + p % 4);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

int sum = 0;

if (nn4 > 0)
{
v4i32 _sum = __msa_fill_w(0);

int j = 0;
for (; j < nn4; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val16, _w16);
v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

_sum = __msa_addv_w(_sum, _s032);

tmpptr += 4;
kptr += 4;
}

sum = _sum[0] + _sum[1] + _sum[2] + _sum[3];
}

int j = 0;
for (; j < nn1; j++)
{
signed char val = tmpptr[0];
signed char w = kptr[0];

sum += val * w;

tmpptr += 1;
kptr += 1;
}

outptr0[0] = sum;
outptr0 += 1;
}
#else // __mips_msa
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i);
const signed char* kptr = kernel.channel(p);

int nn1 = inch * maxk;

int sum = 0;
int j = 0;
for (; j < nn1; j++)
{
signed char val = tmpptr[0];
signed char w = kptr[0];

sum += val * w;

tmpptr += 1;
kptr += 1;
}

outptr0[0] = sum;
outptr0 += 1;
}
#endif // __mips_msa
}
}

static void convolution_im2col_sgemm_transform_kernel_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

#if __mips_msa
// interleave
// src = maxk-inch-outch
// dst = 4a-4b-maxk-inch/4a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
if (outch >= 4)
{
if (inch >= 4)
kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u);
else
kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u);
}
else
{
if (inch >= 4)
kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u);
else
kernel_tm.create(1 * maxk, inch, outch, (size_t)1u);
}

int q = 0;
for (; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

int p = 0;
for (; p + 3 < inch; p += 4)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
for (; p < inch; p++)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);

g00[0] = k00[k];

g00++;
}
}
}
}
// TODO unroll 2
for (; q < outch; q++)
{
signed char* g00 = kernel_tm.channel(q / 4 + q % 4);

int p = 0;
for (; p + 3 < inch; p += 4)
{
for (int k = 0; k < maxk; k++)
{
for (int j = 0; j < 4; j++)
{
const signed char* k00 = kernel.channel(q).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
for (; p < inch; p++)
{
for (int k = 0; k < maxk; k++)
{
const signed char* k00 = kernel.channel(q).row<const signed char>(p);

g00[0] = k00[k];

g00++;
}
}
}
#else // __mips_msa
kernel_tm = _kernel.reshape(maxk, inch, outch);
#endif // __mips_msa
}

static void convolution_im2col_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[stride_w];
ptr[2] = sptr[stride_w * 2];
ptr[3] = sptr[stride_w * 3];

sptr += stride_w * 4;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[stride_w];

sptr += stride_w * 2;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

+ 477
- 0
src/layer/mips/convolution_sgemm_pack1to4_int8.h View File

@@ -0,0 +1,477 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
if (inch >= 4)
{
if (size >= 2)
tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
else
tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
}
else
{
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
}
{
int remain_size_start = 0;
int nn_size = (size - remain_size_start) >> 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

signed char* tmpptr = tmp.channel(i / 2);

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr[4] = img0[1];
tmpptr[5] = img1[1];
tmpptr[6] = img2[1];
tmpptr[7] = img3[1];
tmpptr += 8;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img0[1];

tmpptr += 2;

img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
signed char* tmpptr = tmp.channel(i / 2 + i % 2);

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr += 4;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];

tmpptr += 1;

img0 += size;
}
}
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
for (; i + 1 < size; i += 2)
{
const signed char* tmpptr = tmp.channel(i / 2);
const signed char* kptr = kernel.channel(p);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

v4i32 _sum00 = __msa_fill_w(0);
v4i32 _sum10 = __msa_fill_w(0);

if (nn4 > 0)
{
v4i32 _sum01 = __msa_fill_w(0);
v4i32 _sum02 = __msa_fill_w(0);
v4i32 _sum03 = __msa_fill_w(0);
v4i32 _sum11 = __msa_fill_w(0);
v4i32 _sum12 = __msa_fill_w(0);
v4i32 _sum13 = __msa_fill_w(0);

int j = 0;
for (; j < nn4; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val01 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v8i16 _val0 = (v8i16)__msa_ilvr_d((v2i64)_val01, (v2i64)_val01);
v8i16 _val1 = (v8i16)__msa_ilvl_d((v2i64)_val01, (v2i64)_val01);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

v8i16 _s00 = __msa_mulv_h(_val0, _w0);
v8i16 _s01 = __msa_mulv_h(_val0, _w1);
v8i16 _s10 = __msa_mulv_h(_val1, _w0);
v8i16 _s11 = __msa_mulv_h(_val1, _w1);

v8i16 _exts00 = __msa_clti_s_h(_s00, 0);
v8i16 _exts01 = __msa_clti_s_h(_s01, 0);
v8i16 _exts10 = __msa_clti_s_h(_s10, 0);
v8i16 _exts11 = __msa_clti_s_h(_s11, 0);
v4i32 _s00l = (v4i32)__msa_ilvr_h(_exts00, _s00);
v4i32 _s00h = (v4i32)__msa_ilvl_h(_exts00, _s00);
v4i32 _s01l = (v4i32)__msa_ilvr_h(_exts01, _s01);
v4i32 _s01h = (v4i32)__msa_ilvl_h(_exts01, _s01);
v4i32 _s10l = (v4i32)__msa_ilvr_h(_exts10, _s10);
v4i32 _s10h = (v4i32)__msa_ilvl_h(_exts10, _s10);
v4i32 _s11l = (v4i32)__msa_ilvr_h(_exts11, _s11);
v4i32 _s11h = (v4i32)__msa_ilvl_h(_exts11, _s11);

_sum00 = __msa_addv_w(_sum00, _s00l);
_sum01 = __msa_addv_w(_sum01, _s00h);
_sum02 = __msa_addv_w(_sum02, _s01l);
_sum03 = __msa_addv_w(_sum03, _s01h);
_sum10 = __msa_addv_w(_sum10, _s10l);
_sum11 = __msa_addv_w(_sum11, _s10h);
_sum12 = __msa_addv_w(_sum12, _s11l);
_sum13 = __msa_addv_w(_sum13, _s11h);

tmpptr += 8;
kptr += 16;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum01, _sum00);
_tmp1 = __msa_ilvr_w(_sum03, _sum02);
_tmp2 = __msa_ilvl_w(_sum01, _sum00);
_tmp3 = __msa_ilvl_w(_sum03, _sum02);
_sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum11, _sum10);
_tmp1 = __msa_ilvr_w(_sum13, _sum12);
_tmp2 = __msa_ilvl_w(_sum11, _sum10);
_tmp3 = __msa_ilvl_w(_sum13, _sum12);
_sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum00 = __msa_addv_w(_sum00, _sum01);
_sum02 = __msa_addv_w(_sum02, _sum03);
_sum10 = __msa_addv_w(_sum10, _sum11);
_sum12 = __msa_addv_w(_sum12, _sum13);

_sum00 = __msa_addv_w(_sum00, _sum02);
_sum10 = __msa_addv_w(_sum10, _sum12);
}

int j = 0;
for (; j < nn1; j++)
{
v8i16 _val0 = __msa_fill_h(tmpptr[0]);
v8i16 _val1 = __msa_fill_h(tmpptr[1]);
v8i16 _val = (v8i16)__msa_ilvr_d((v2i64)_val1, (v2i64)_val0);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

_w16 = (v8i16)__msa_ilvr_d((v2i64)_w16, (v2i64)_w16);

v8i16 _s0 = __msa_mulv_h(_val, _w16);
v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);

_sum00 = __msa_addv_w(_sum00, _s0l);
_sum10 = __msa_addv_w(_sum10, _s0h);

tmpptr += 2;
kptr += 4;
}

__msa_st_w(_sum00, outptr0, 0);
__msa_st_w(_sum10, outptr0 + 4, 0);
outptr0 += 8;
}
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
const signed char* kptr = kernel.channel(p);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

v4i32 _sum0 = __msa_fill_w(0);

if (nn4 > 0)
{
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

int j = 0;
for (; j < nn4; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

_val16 = (v8i16)__msa_ilvr_d((v2i64)_val16, (v2i64)_val16);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);

v8i16 _s0 = __msa_mulv_h(_val16, _w0);
v8i16 _s1 = __msa_mulv_h(_val16, _w1);

v8i16 _exts0 = __msa_clti_s_h(_s0, 0);
v8i16 _exts1 = __msa_clti_s_h(_s1, 0);
v4i32 _s0l = (v4i32)__msa_ilvr_h(_exts0, _s0);
v4i32 _s0h = (v4i32)__msa_ilvl_h(_exts0, _s0);
v4i32 _s1l = (v4i32)__msa_ilvr_h(_exts1, _s1);
v4i32 _s1h = (v4i32)__msa_ilvl_h(_exts1, _s1);

_sum0 = __msa_addv_w(_sum0, _s0l);
_sum1 = __msa_addv_w(_sum1, _s0h);
_sum2 = __msa_addv_w(_sum2, _s1l);
_sum3 = __msa_addv_w(_sum3, _s1h);

tmpptr += 4;
kptr += 16;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum1, _sum0);
_tmp1 = __msa_ilvr_w(_sum3, _sum2);
_tmp2 = __msa_ilvl_w(_sum1, _sum0);
_tmp3 = __msa_ilvl_w(_sum3, _sum2);
_sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);
_sum0 = __msa_addv_w(_sum0, _sum2);
}

int j = 0;
for (; j < nn1; j++)
{
v8i16 _val = __msa_fill_h(tmpptr[0]);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val, _w16);
v4i32 _s032 = (v4i32)__msa_ilvr_h(__msa_clti_s_h(_s0, 0), _s0);

_sum0 = __msa_addv_w(_sum0, _s032);

tmpptr += 1;
kptr += 4;
}

__msa_st_w(_sum0, outptr0, 0);
outptr0 += 4;
}
}
}

static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

// interleave
// src = maxk-inch-outch
// dst = 4a-4b-maxk-inch/4a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
if (inch >= 4)
kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u);
else
kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u);

for (int q = 0; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

int p = 0;
for (; p + 3 < inch; p += 4)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
for (; p < inch; p++)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);

g00[0] = k00[k];

g00++;
}
}
}
}
}

static void convolution_im2col_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[stride_w];
ptr[2] = sptr[stride_w * 2];
ptr[3] = sptr[stride_w * 3];

sptr += stride_w * 4;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[stride_w];

sptr += stride_w * 2;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

+ 1
- 1
src/layer/mips/convolution_sgemm_pack4to1.h View File

@@ -550,7 +550,7 @@ static void im2col_sgemm_pack4to1_msa(const Mat& bottom_im2col, Mat& top_blob, c
kptr0 += 4;
}

sum0 += __msa_fhadd_w(_sum0);
sum0 += __msa_reduce_fadd_w(_sum0);

outptr0[0] = sum0;



+ 450
- 0
src/layer/mips/convolution_sgemm_pack8to1_int8.h View File

@@ -0,0 +1,450 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
{
int remain_size_start = 0;
int nn_size = (size - remain_size_start) >> 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

int64_t* tmpptr = tmp.channel(i / 2);

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
v16i8 _v = __msa_ld_b(img0, 0);
__msa_st_b(_v, tmpptr, 0);
tmpptr += 2;
img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
int64_t* tmpptr = tmp.channel(i / 2 + i % 2);

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr += 1;
img0 += size;
}
}
}
}

int nn_outch = 0;
int remain_outch_start = 0;

nn_outch = outch >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_outch; pp++)
{
int p = pp * 4;

int* outptr0 = top_blob.channel(p);
int* outptr1 = top_blob.channel(p + 1);
int* outptr2 = top_blob.channel(p + 2);
int* outptr3 = top_blob.channel(p + 3);

int i = 0;
for (; i + 1 < size; i += 2)
{
const signed char* tmpptr = tmp.channel(i / 2);
const signed char* kptr = kernel.channel(p / 4);

int nn = inch * maxk; // inch always > 0

v4i32 _sum00 = __msa_fill_w(0);
v4i32 _sum01 = __msa_fill_w(0);
v4i32 _sum02 = __msa_fill_w(0);
v4i32 _sum03 = __msa_fill_w(0);
v4i32 _sum10 = __msa_fill_w(0);
v4i32 _sum11 = __msa_fill_w(0);
v4i32 _sum12 = __msa_fill_w(0);
v4i32 _sum13 = __msa_fill_w(0);

int j = 0;
for (; j < nn; j++)
{
v16i8 _val01 = __msa_ld_b(tmpptr, 0);
v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

v8i16 _s00 = __msa_mulv_h(_val0, _w0);
v8i16 _s01 = __msa_mulv_h(_val0, _w1);
v8i16 _s02 = __msa_mulv_h(_val0, _w2);
v8i16 _s03 = __msa_mulv_h(_val0, _w3);
v8i16 _s10 = __msa_mulv_h(_val1, _w0);
v8i16 _s11 = __msa_mulv_h(_val1, _w1);
v8i16 _s12 = __msa_mulv_h(_val1, _w2);
v8i16 _s13 = __msa_mulv_h(_val1, _w3);

_sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00));
_sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01));
_sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02));
_sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03));
_sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10));
_sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11));
_sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12));
_sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13));

tmpptr += 16;
kptr += 32;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum01, _sum00);
_tmp1 = __msa_ilvr_w(_sum03, _sum02);
_tmp2 = __msa_ilvl_w(_sum01, _sum00);
_tmp3 = __msa_ilvl_w(_sum03, _sum02);
_sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum11, _sum10);
_tmp1 = __msa_ilvr_w(_sum13, _sum12);
_tmp2 = __msa_ilvl_w(_sum11, _sum10);
_tmp3 = __msa_ilvl_w(_sum13, _sum12);
_sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum00 = __msa_addv_w(_sum00, _sum01);
_sum02 = __msa_addv_w(_sum02, _sum03);
_sum10 = __msa_addv_w(_sum10, _sum11);
_sum12 = __msa_addv_w(_sum12, _sum13);

_sum00 = __msa_addv_w(_sum00, _sum02);
_sum10 = __msa_addv_w(_sum10, _sum12);

int sum[8];
__msa_st_w(_sum00, sum, 0);
__msa_st_w(_sum10, sum + 4, 0);

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0[1] = sum[4];
outptr1[1] = sum[5];
outptr2[1] = sum[6];
outptr3[1] = sum[7];
outptr0 += 2;
outptr1 += 2;
outptr2 += 2;
outptr3 += 2;
}
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
const signed char* kptr = kernel.channel(p / 4);

int nn = inch * maxk; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

int j = 0;
for (; j < nn; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

v8i16 _s0 = __msa_mulv_h(_val16, _w0);
v8i16 _s1 = __msa_mulv_h(_val16, _w1);
v8i16 _s2 = __msa_mulv_h(_val16, _w2);
v8i16 _s3 = __msa_mulv_h(_val16, _w3);

_sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
_sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
_sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
_sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

tmpptr += 8;
kptr += 32;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum1, _sum0);
_tmp1 = __msa_ilvr_w(_sum3, _sum2);
_tmp2 = __msa_ilvl_w(_sum1, _sum0);
_tmp3 = __msa_ilvl_w(_sum3, _sum2);
_sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);

_sum0 = __msa_addv_w(_sum0, _sum2);

int sum[4];
__msa_st_w(_sum0, sum, 0);

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0 += 1;
outptr1 += 1;
outptr2 += 1;
outptr3 += 1;
}
}

remain_outch_start += nn_outch << 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_outch_start; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
for (; i + 1 < size; i += 2)
{
const signed char* tmpptr = tmp.channel(i / 2);
const signed char* kptr = kernel.channel(p / 4 + p % 4);

int nn = inch * maxk; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);

int j = 0;
for (; j < nn; j++)
{
v16i8 _val01 = __msa_ld_b(tmpptr, 0);
v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val0, _w16);
v8i16 _s1 = __msa_mulv_h(_val1, _w16);

_sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
_sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));

tmpptr += 16;
kptr += 8;
}

outptr0[0] = __msa_reduce_add_w(_sum0);
outptr0[1] = __msa_reduce_add_w(_sum1);
outptr0 += 2;
}
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
const signed char* kptr = kernel.channel(p / 4 + p % 4);

int nn = inch * maxk; // inch always > 0

v4i32 _sum = __msa_fill_w(0);

int j = 0;
for (; j < nn; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w = __msa_ld_b(kptr, 0);
v8i16 _w16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_w, 0), _w);

v8i16 _s0 = __msa_mulv_h(_val16, _w16);

_sum = __msa_addv_w(_sum, __msa_hadd_s_w(_s0, _s0));

tmpptr += 8;
kptr += 8;
}

outptr0[0] = __msa_reduce_add_w(_sum);
outptr0 += 1;
}
}
}

static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

// interleave
// src = maxk-inch-outch
// dst = 8a-4b-maxk-inch/8a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
if (outch >= 4)
kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u);
else
kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u);

int q = 0;
for (; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 8; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
// TODO unroll 2
for (; q < outch; q++)
{
signed char* g00 = kernel_tm.channel(q / 4 + q % 4);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int k = 0; k < maxk; k++)
{
for (int j = 0; j < 8; j++)
{
const signed char* k00 = kernel.channel(q).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}

static void convolution_im2col_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
int64_t* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

+ 320
- 0
src/layer/mips/convolution_sgemm_pack8to4_int8.h View File

@@ -0,0 +1,320 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
{
int remain_size_start = 0;
int nn_size = size >> 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

int64_t* tmpptr = tmp.channel(i / 2);

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
v16i8 _v = __msa_ld_b(img0, 0);
__msa_st_b(_v, tmpptr, 0);
tmpptr += 2;
img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
int64_t* tmpptr = tmp.channel(i / 2 + i % 2);

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr += 1;
img0 += size;
}
}
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
for (; i + 1 < size; i += 2)
{
const signed char* tmpptr = tmp.channel(i / 2);
const signed char* kptr = kernel.channel(p);

int nn = inch * maxk; // inch always > 0

v4i32 _sum00 = __msa_fill_w(0);
v4i32 _sum01 = __msa_fill_w(0);
v4i32 _sum02 = __msa_fill_w(0);
v4i32 _sum03 = __msa_fill_w(0);
v4i32 _sum10 = __msa_fill_w(0);
v4i32 _sum11 = __msa_fill_w(0);
v4i32 _sum12 = __msa_fill_w(0);
v4i32 _sum13 = __msa_fill_w(0);

int j = 0;
for (; j < nn; j++)
{
v16i8 _val01 = __msa_ld_b(tmpptr, 0);
v16i8 _extval01 = __msa_clti_s_b(_val01, 0);
v8i16 _val0 = (v8i16)__msa_ilvr_b(_extval01, _val01);
v8i16 _val1 = (v8i16)__msa_ilvl_b(_extval01, _val01);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

v8i16 _s00 = __msa_mulv_h(_val0, _w0);
v8i16 _s01 = __msa_mulv_h(_val0, _w1);
v8i16 _s02 = __msa_mulv_h(_val0, _w2);
v8i16 _s03 = __msa_mulv_h(_val0, _w3);
v8i16 _s10 = __msa_mulv_h(_val1, _w0);
v8i16 _s11 = __msa_mulv_h(_val1, _w1);
v8i16 _s12 = __msa_mulv_h(_val1, _w2);
v8i16 _s13 = __msa_mulv_h(_val1, _w3);

_sum00 = __msa_addv_w(_sum00, __msa_hadd_s_w(_s00, _s00));
_sum01 = __msa_addv_w(_sum01, __msa_hadd_s_w(_s01, _s01));
_sum02 = __msa_addv_w(_sum02, __msa_hadd_s_w(_s02, _s02));
_sum03 = __msa_addv_w(_sum03, __msa_hadd_s_w(_s03, _s03));
_sum10 = __msa_addv_w(_sum10, __msa_hadd_s_w(_s10, _s10));
_sum11 = __msa_addv_w(_sum11, __msa_hadd_s_w(_s11, _s11));
_sum12 = __msa_addv_w(_sum12, __msa_hadd_s_w(_s12, _s12));
_sum13 = __msa_addv_w(_sum13, __msa_hadd_s_w(_s13, _s13));

tmpptr += 16;
kptr += 32;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum01, _sum00);
_tmp1 = __msa_ilvr_w(_sum03, _sum02);
_tmp2 = __msa_ilvl_w(_sum01, _sum00);
_tmp3 = __msa_ilvl_w(_sum03, _sum02);
_sum00 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum01 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum02 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum03 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum11, _sum10);
_tmp1 = __msa_ilvr_w(_sum13, _sum12);
_tmp2 = __msa_ilvl_w(_sum11, _sum10);
_tmp3 = __msa_ilvl_w(_sum13, _sum12);
_sum10 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum11 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum12 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum13 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum00 = __msa_addv_w(_sum00, _sum01);
_sum02 = __msa_addv_w(_sum02, _sum03);
_sum10 = __msa_addv_w(_sum10, _sum11);
_sum12 = __msa_addv_w(_sum12, _sum13);

_sum00 = __msa_addv_w(_sum00, _sum02);
_sum10 = __msa_addv_w(_sum10, _sum12);

__msa_st_w(_sum00, outptr0, 0);
__msa_st_w(_sum10, outptr0 + 4, 0);
outptr0 += 8;
}
for (; i < size; i++)
{
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
const signed char* kptr = kernel.channel(p);

int nn = inch * maxk; // inch always > 0

v4i32 _sum0 = __msa_fill_w(0);
v4i32 _sum1 = __msa_fill_w(0);
v4i32 _sum2 = __msa_fill_w(0);
v4i32 _sum3 = __msa_fill_w(0);

int j = 0;
for (; j < nn; j++)
{
v16i8 _val = __msa_ld_b(tmpptr, 0);
v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

v16i8 _w01 = __msa_ld_b(kptr, 0);
v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

v8i16 _s0 = __msa_mulv_h(_val16, _w0);
v8i16 _s1 = __msa_mulv_h(_val16, _w1);
v8i16 _s2 = __msa_mulv_h(_val16, _w2);
v8i16 _s3 = __msa_mulv_h(_val16, _w3);

_sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
_sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
_sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
_sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

tmpptr += 8;
kptr += 32;
}

// transpose 4x4
{
v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = __msa_ilvr_w(_sum1, _sum0);
_tmp1 = __msa_ilvr_w(_sum3, _sum2);
_tmp2 = __msa_ilvl_w(_sum1, _sum0);
_tmp3 = __msa_ilvl_w(_sum3, _sum2);
_sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
_sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
_sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
}

_sum0 = __msa_addv_w(_sum0, _sum1);
_sum2 = __msa_addv_w(_sum2, _sum3);

_sum0 = __msa_addv_w(_sum0, _sum2);

__msa_st_w(_sum0, outptr0, 0);
outptr0 += 4;
}
}
}

static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

// interleave
// src = maxk-inch-outch
// dst = 8a-4b-maxk-inch/8a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u);

for (int q = 0; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 8; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
}

static void convolution_im2col_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
int64_t* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

+ 1
- 1
src/layer/mips/deconvolution_pack4to1.h View File

@@ -88,7 +88,7 @@ static void deconvolution_pack4to1_msa(const Mat& bottom_blob, Mat& top_blob, co
kptr += maxk * 4;
}

sum += __msa_fhadd_w(_sum);
sum += __msa_reduce_fadd_w(_sum);

sum = activation_ss(sum, activation_type, activation_params);



+ 4
- 4
src/layer/mips/innerproduct_mips.cpp View File

@@ -300,10 +300,10 @@ int InnerProduct_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
}

#if __mips_msa
sum0 += __msa_fhadd_w(_sum0);
sum1 += __msa_fhadd_w(_sum1);
sum2 += __msa_fhadd_w(_sum2);
sum3 += __msa_fhadd_w(_sum3);
sum0 += __msa_reduce_fadd_w(_sum0);
sum1 += __msa_reduce_fadd_w(_sum1);
sum2 += __msa_reduce_fadd_w(_sum2);
sum3 += __msa_reduce_fadd_w(_sum3);
#endif // __mips_msa

sum0 = activation_ss(sum0, activation_type, activation_params);


+ 17
- 11
src/layer/mips/mips_usability.h View File

@@ -38,19 +38,25 @@ typedef union
static const ncnn::FloatInt Name = {.f = Val}

/* float type data load instructions */
static inline v4f32 __msa_fill_w_f32(float val)
static NCNN_FORCEINLINE v4f32 __msa_fill_w_f32(float val)
{
ncnn::FloatInt fi_tmpval = {.f = val};
return (v4f32)__msa_fill_w(fi_tmpval.i);
}

static inline float __msa_fhadd_w(v4f32 _v)
static NCNN_FORCEINLINE float __msa_reduce_fadd_w(v4f32 _v)
{
// TODO find a more efficient way
return _v[0] + _v[1] + _v[2] + _v[3];
}

static inline int __msa_cfcmsa_msacsr()
static NCNN_FORCEINLINE int __msa_reduce_add_w(v4i32 _v)
{
// TODO find a more efficient way
return _v[0] + _v[1] + _v[2] + _v[3];
}

static NCNN_FORCEINLINE int __msa_cfcmsa_msacsr()
{
int v;
asm volatile("cfcmsa %0, $1 \n"
@@ -60,7 +66,7 @@ static inline int __msa_cfcmsa_msacsr()
return v;
}

static inline void __msa_ctcmsa_msacsr(int v)
static NCNN_FORCEINLINE void __msa_ctcmsa_msacsr(int v)
{
asm volatile("ctcmsa $1, %0 \n"
:
@@ -69,7 +75,7 @@ static inline void __msa_ctcmsa_msacsr(int v)
}
#endif // __mips_msa

static inline signed char float2int8(float v)
static NCNN_FORCEINLINE signed char float2int8(float v)
{
int int32 = round(v);
if (int32 > 127) return 127;
@@ -78,7 +84,7 @@ static inline signed char float2int8(float v)
}

#if __mips_msa
static inline v16i8 float2int8(v4f32 _v)
static NCNN_FORCEINLINE v16i8 float2int8(v4f32 _v)
{
// simulate round to nearest via +/-0.5
v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -98,7 +104,7 @@ static inline v16i8 float2int8(v4f32 _v)
return _v8;
}

static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
static NCNN_FORCEINLINE int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
{
// simulate round to nearest via +/-0.5
v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -123,7 +129,7 @@ static inline int64_t float2int8(v4f32 _vlow, v4f32 _vhigh)
return _v8[0];
}

static inline v16i8 float2int8relu(v4f32 _v)
static NCNN_FORCEINLINE v16i8 float2int8relu(v4f32 _v)
{
// simulate round to nearest via +/-0.5
v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -143,7 +149,7 @@ static inline v16i8 float2int8relu(v4f32 _v)
return _v8;
}

static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
static NCNN_FORCEINLINE int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
{
// simulate round to nearest via +/-0.5
v4f32 _p5 = (v4f32)__msa_fill_w_f32(0.5f);
@@ -168,7 +174,7 @@ static inline int64_t float2int8relu(v4f32 _vlow, v4f32 _vhigh)
return _v8[0];
}

static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
static NCNN_FORCEINLINE v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
{
v4f32 _v_leaky = __msa_fmul_w(_v, _slope);

@@ -199,7 +205,7 @@ static inline v16i8 float2int8leakyrelu(v4f32 _v, v4f32 _slope)
return _v8;
}

static inline int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope)
static NCNN_FORCEINLINE int64_t float2int8leakyrelu(v4f32 _vlow, v4f32 _vhigh, v4f32 _slope)
{
v4f32 _vlow_leaky = __msa_fmul_w(_vlow, _slope);
v4f32 _vhigh_leaky = __msa_fmul_w(_vhigh, _slope);


Loading…
Cancel
Save