Browse Source

x86 optimization for convolution int8 gemm unified elempack (#4881)

tags/20231027
nihui GitHub 2 years ago
parent
commit
9ecf6a61be
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 7184 additions and 4378 deletions
  1. +14
    -8
      CMakeLists.txt
  2. +6
    -0
      cmake/ncnn_add_layer.cmake
  3. +0
    -262
      src/layer/x86/convolution_1x1_int8.h
  4. +0
    -83
      src/layer/x86/convolution_1x1_pack1to4_int8.h
  5. +0
    -65
      src/layer/x86/convolution_1x1_pack8to1_int8.h
  6. +0
    -65
      src/layer/x86/convolution_1x1_pack8to4_int8.h
  7. +0
    -147
      src/layer/x86/convolution_3x3_pack1to4_int8.h
  8. +0
    -80
      src/layer/x86/convolution_7x7_pack1to4_int8.h
  9. +7099
    -0
      src/layer/x86/convolution_im2col_gemm_int8.h
  10. +0
    -1129
      src/layer/x86/convolution_sgemm_int8.h
  11. +0
    -736
      src/layer/x86/convolution_sgemm_pack1to4_int8.h
  12. +0
    -875
      src/layer/x86/convolution_sgemm_pack8to1_int8.h
  13. +0
    -625
      src/layer/x86/convolution_sgemm_pack8to4_int8.h
  14. +43
    -206
      src/layer/x86/convolution_x86.cpp
  15. +7
    -22
      src/layer/x86/convolution_x86_avx2.cpp
  16. +5
    -25
      src/layer/x86/convolution_x86_avx512vnni.cpp
  17. +5
    -25
      src/layer/x86/convolution_x86_avxvnni.cpp
  18. +5
    -25
      src/layer/x86/convolution_x86_xop.cpp

+ 14
- 8
CMakeLists.txt View File

@@ -408,15 +408,21 @@ else()
check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_XOP)
check_cxx_compiler_flag("/arch:AVX" NCNN_COMPILER_SUPPORT_X86_F16C)
check_cxx_compiler_flag("/arch:AVX2" NCNN_COMPILER_SUPPORT_X86_AVX2)
check_cxx_compiler_flag("/arch:AVX2" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
check_cxx_compiler_flag("/arch:AVX512" NCNN_COMPILER_SUPPORT_X86_AVX512)
check_cxx_compiler_flag("/arch:AVX512" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.16)
# vs2017+ supports avx512 and vnni
set(NCNN_COMPILER_SUPPORT_X86_AVX_VNNI OFF)
set(NCNN_COMPILER_SUPPORT_X86_AVX512 OFF)
set(NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI OFF)
endif()

set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512i _s, _a, _b; _s = _mm512_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256bh _s; __m512bh _a, _b; _s = _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(_s), _a, _b)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m512h _s, _a, _b; _s = _mm512_fmadd_ph(_s, _a, _b); __m512 _s2; _s2 = _mm512_cvtxph_ps(_mm512_cvtxps_ph(_s2)); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)

unset(CMAKE_REQUIRED_FLAGS)
else()
check_cxx_compiler_flag("-mavx" NCNN_COMPILER_SUPPORT_X86_AVX)



+ 6
- 0
cmake/ncnn_add_layer.cmake View File

@@ -171,6 +171,12 @@ macro(ncnn_add_layer class)
if(NCNN_AVX512VNNI)
ncnn_add_arch_opt_source(${class} avx512vnni "/arch:AVX512 /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512VNNI__")
endif()
if(NCNN_AVX512BF16)
ncnn_add_arch_opt_source(${class} avx512bf16 "/arch:AVX512 /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512BF16__")
endif()
if(NCNN_AVX512FP16)
ncnn_add_arch_opt_source(${class} avx512fp16 "/arch:AVX512 /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512FP16__")
endif()
if(NCNN_AVXVNNI)
ncnn_add_arch_opt_source(${class} avxvnni "/arch:AVX2 /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__")
endif()


+ 0
- 262
src/layer/x86/convolution_1x1_int8.h View File

@@ -1,262 +0,0 @@
// BUG1989 is pleased to support the open source community by supporting ncnn available.
//
// Copyright (C) 2019 BUG1989. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
outptr[0] = r0[0];
outptr[1] = r0[2];
outptr[2] = r0[4];
outptr[3] = r0[6];

r0 += 8;
outptr += 4;
}
for (; j + 1 < outw; j += 2)
{
outptr[0] = r0[0];
outptr[1] = r0[2];

r0 += 4;
outptr += 2;
}
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_int8_sse(bottom_blob_shrinked, top_blob, kernel, opt);
}

static void conv1x1s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const float* kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out0 = top_blob.channel(p);

out0.fill(0);

int q = 0;

for (; q + 7 < inch; q += 8)
{
int* outptr0 = out0;

const signed char* kernel0 = (const signed char*)kernel + p * inch + q;

const signed char* r0 = bottom_blob.channel(q);
const signed char* r1 = bottom_blob.channel(q + 1);
const signed char* r2 = bottom_blob.channel(q + 2);
const signed char* r3 = bottom_blob.channel(q + 3);
const signed char* r4 = bottom_blob.channel(q + 4);
const signed char* r5 = bottom_blob.channel(q + 5);
const signed char* r6 = bottom_blob.channel(q + 6);
const signed char* r7 = bottom_blob.channel(q + 7);

int size = outw * outh;
int remain = size;

for (; remain > 0; remain--)
{
//ToDo Neon
int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];

*outptr0 += sum0;

r0++;
r1++;
r2++;
r3++;
r4++;
r5++;
r6++;
r7++;
outptr0++;
}
}

for (; q < inch; q++)
{
int* outptr0 = out0;

const signed char* r0 = bottom_blob.channel(q);

const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
const signed char k0 = kernel0[0];

int size = outw * outh;
int remain = size;

for (; remain > 0; remain--)
{
int sum0 = (int)(*r0) * (int)k0;

*outptr0 += sum0;

r0++;
outptr0++;
}
}
}
}

static void conv1x1s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int tailstep = w - 2 * outw + w;
const signed char* kernel = _kernel;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
Mat out0 = top_blob.channel(p);

out0.fill(0);

int q = 0;

for (; q + 7 < inch; q += 8)
{
int* outptr0 = out0;

const signed char* kernel0 = (const signed char*)kernel + p * inch + q;

const signed char* r0 = bottom_blob.channel(q);
const signed char* r1 = bottom_blob.channel(q + 1);
const signed char* r2 = bottom_blob.channel(q + 2);
const signed char* r3 = bottom_blob.channel(q + 3);
const signed char* r4 = bottom_blob.channel(q + 4);
const signed char* r5 = bottom_blob.channel(q + 5);
const signed char* r6 = bottom_blob.channel(q + 6);
const signed char* r7 = bottom_blob.channel(q + 7);

for (int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
//ToDo Neon
int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];

*outptr0 += sum0;

r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;
r4 += 2;
r5 += 2;
r6 += 2;
r7 += 2;
outptr0++;
}

r0 += tailstep;
r1 += tailstep;
r2 += tailstep;
r3 += tailstep;
r4 += tailstep;
r5 += tailstep;
r6 += tailstep;
r7 += tailstep;
}
}

for (; q < inch; q++)
{
int* outptr0 = out0;

const signed char* r0 = bottom_blob.channel(q);

const signed char* kernel0 = (const signed char*)kernel + p * inch + q;

for (int i = 0; i < outh; i++)
{
int remain = outw;

for (; remain > 0; remain--)
{
//ToDo Neon
int sum0 = (int)*r0 * (int)kernel0[0];

*outptr0 += sum0;

r0 += 2;
outptr0++;
}

r0 += tailstep;
}
}
}
}

+ 0
- 83
src/layer/x86/convolution_1x1_pack1to4_int8.h View File

@@ -1,83 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
outptr[0] = r0[0];
outptr[1] = r0[2];
outptr[2] = r0[4];
outptr[3] = r0[6];

r0 += 8;
outptr += 4;
}
for (; j + 1 < outw; j += 2)
{
outptr[0] = r0[0];
outptr[1] = r0[2];

r0 += 4;
outptr += 2;
}
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack1to4_int8_sse(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 65
src/layer/x86/convolution_1x1_pack8to1_int8.h View File

@@ -1,65 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack8to1_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const int64_t* r0 = bottom_blob.channel(p);
int64_t* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack8to1_int8_sse(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 65
src/layer/x86/convolution_1x1_pack8to4_int8.h View File

@@ -1,65 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack8to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const int64_t* r0 = bottom_blob.channel(p);
int64_t* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack8to4_int8_sse(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 147
src/layer/x86/convolution_3x3_pack1to4_int8.h View File

@@ -1,147 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = 9;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w - outw;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < 3; u++)
{
for (int v = 0; v < 3; v++)
{
const signed char* sptr = img.row<const signed char>(u) + v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[1];
ptr[2] = sptr[2];
ptr[3] = sptr[3];

sptr += 4;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[1];

sptr += 2;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += 1;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

static void conv3x3s2_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = 9;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * 2 - outw * 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < 3; u++)
{
for (int v = 0; v < 3; v++)
{
const signed char* sptr = img.row<const signed char>(u) + v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];
ptr[2] = sptr[4];
ptr[3] = sptr[6];

sptr += 8;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];

sptr += 4;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += 2;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

+ 0
- 80
src/layer/x86/convolution_7x7_pack1to4_int8.h View File

@@ -1,80 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv7x7s2_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = 49;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * 2 - outw * 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < 7; u++)
{
for (int v = 0; v < 7; v++)
{
const signed char* sptr = img.row<const signed char>(u) + v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];
ptr[2] = sptr[4];
ptr[3] = sptr[6];

sptr += 8;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];

sptr += 4;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += 2;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

+ 7099
- 0
src/layer/x86/convolution_im2col_gemm_int8.h
File diff suppressed because it is too large
View File


+ 0
- 1129
src/layer/x86/convolution_sgemm_int8.h
File diff suppressed because it is too large
View File


+ 0
- 736
src/layer/x86/convolution_sgemm_pack1to4_int8.h View File

@@ -1,736 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
void im2col_sgemm_pack1to4_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
void im2col_sgemm_pack1to4_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
void im2col_sgemm_pack1to4_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
void im2col_sgemm_pack1to4_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif
#endif

static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx512_vnni())
{
im2col_sgemm_pack1to4_int8_sse_avx512vnni(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
if (ncnn::cpu_support_x86_avx_vnni())
{
im2col_sgemm_pack1to4_int8_sse_avxvnni(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
if (ncnn::cpu_support_x86_avx2())
{
im2col_sgemm_pack1to4_int8_sse_avx2(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
if (ncnn::cpu_support_x86_xop())
{
im2col_sgemm_pack1to4_int8_sse_xop(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif
#endif

// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
if (inch >= 4)
{
#if __AVX2__
if (size >= 4)
tmp.create(4 * maxk, inch / 4 + inch % 4, size / 4 + (size % 4) / 2 + size % 2, 4u, 4, opt.workspace_allocator);
else if (size >= 2)
tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
else
tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
#else
if (size >= 2)
tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
else
tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
#endif
}
else
{
#if __AVX2__
if (size >= 4)
tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 1u, 1, opt.workspace_allocator);
else if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
#else
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
#endif
}
{
#if __AVX2__
int remain_size_start = 0;
int nn_size = size >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 4;

signed char* tmpptr = tmp.channel(i / 4);

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr[4] = img0[1];
tmpptr[5] = img1[1];
tmpptr[6] = img2[1];
tmpptr[7] = img3[1];
tmpptr[8] = img0[2];
tmpptr[9] = img1[2];
tmpptr[10] = img2[2];
tmpptr[11] = img3[2];
tmpptr[12] = img0[3];
tmpptr[13] = img1[3];
tmpptr[14] = img2[3];
tmpptr[15] = img3[3];
tmpptr += 16;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img0[1];
tmpptr[2] = img0[2];
tmpptr[3] = img0[3];

tmpptr += 4;

img0 += size;
}
}
}

remain_size_start += nn_size << 2;
nn_size = (size - remain_size_start) >> 1;
#else
int remain_size_start = 0;
int nn_size = (size - remain_size_start) >> 1;
#endif

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

#if __AVX2__
signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
signed char* tmpptr = tmp.channel(i / 2);
#endif

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr[4] = img0[1];
tmpptr[5] = img1[1];
tmpptr[6] = img2[1];
tmpptr[7] = img3[1];
tmpptr += 8;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img0[1];

tmpptr += 2;

img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
#if __AVX2__
signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
signed char* tmpptr = tmp.channel(i / 2 + i % 2);
#endif

int q = 0;
for (; q + 3 < inch; q += 4)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr[1] = img1[0];
tmpptr[2] = img2[0];
tmpptr[3] = img3[0];
tmpptr += 4;

img0 += size;
img1 += size;
img2 += size;
img3 += size;
}
}
for (; q < inch; q++)
{
const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];

tmpptr += 1;

img0 += size;
}
}
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
#if __AVX2__
for (; i + 3 < size; i += 4)
{
const signed char* tmpptr = tmp.channel(i / 4);
const signed char* kptr0 = kernel.channel(p);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

__m256i _sum00_12 = _mm256_setzero_si256();
__m256i _sum20_32 = _mm256_setzero_si256();

if (nn4 > 0)
{
__m256i _sum10_02 = _mm256_setzero_si256();
__m256i _sum30_22 = _mm256_setzero_si256();

int j = 0;
for (; j < nn4; j++)
{
__m128i _val0123 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val0123_16 = _mm256_cvtepi8_epi16(_val0123);

__m256i _val01_16 = _mm256_permute4x64_epi64(_val0123_16, _MM_SHUFFLE(1, 1, 0, 0));
__m256i _val23_16 = _mm256_permute4x64_epi64(_val0123_16, _MM_SHUFFLE(3, 3, 2, 2));

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);

__m256i _val10_16 = _mm256_permute4x64_epi64(_val01_16, 78);
__m256i _val32_16 = _mm256_permute4x64_epi64(_val23_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum00_12 = _mm256_dpwssd_epi32(_sum00_12, _val01_16, _w01_16);
_sum10_02 = _mm256_dpwssd_epi32(_sum10_02, _val10_16, _w01_16);
_sum20_32 = _mm256_dpwssd_epi32(_sum20_32, _val23_16, _w01_16);
_sum30_22 = _mm256_dpwssd_epi32(_sum30_22, _val32_16, _w01_16);
#else
_sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16));
_sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16));
_sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_madd_epi16(_val23_16, _w01_16));
_sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_madd_epi16(_val32_16, _w01_16));
#endif

tmpptr += 16;
kptr0 += 16;
}

_sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02);
_sum20_32 = _mm256_hadd_epi32(_sum20_32, _sum30_22);

_sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0));
_sum20_32 = _mm256_permute4x64_epi64(_sum20_32, _MM_SHUFFLE(2, 1, 3, 0));
}

__m128i _sum00 = _mm256_extracti128_si256(_sum00_12, 0);
__m128i _sum10 = _mm256_extracti128_si256(_sum00_12, 1);
__m128i _sum20 = _mm256_extracti128_si256(_sum20_32, 0);
__m128i _sum30 = _mm256_extracti128_si256(_sum20_32, 1);

int j = 0;
for (; j < nn1; j++)
{
__m128i _val01 = _mm_set_epi16(tmpptr[1], tmpptr[1], tmpptr[1], tmpptr[1], tmpptr[0], tmpptr[0], tmpptr[0], tmpptr[0]);
__m128i _val23 = _mm_set_epi16(tmpptr[3], tmpptr[3], tmpptr[3], tmpptr[3], tmpptr[2], tmpptr[2], tmpptr[2], tmpptr[2]);

__m128i _w0123 = _mm_set_epi16(kptr0[3], kptr0[2], kptr0[1], kptr0[0], kptr0[3], kptr0[2], kptr0[1], kptr0[0]);

__m128i _sl00 = _mm_mullo_epi16(_val01, _w0123);
__m128i _sh00 = _mm_mulhi_epi16(_val01, _w0123);
__m128i _sl10 = _mm_mullo_epi16(_val23, _w0123);
__m128i _sh10 = _mm_mulhi_epi16(_val23, _w0123);

_sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00));
_sum10 = _mm_add_epi32(_sum10, _mm_unpackhi_epi16(_sl00, _sh00));
_sum20 = _mm_add_epi32(_sum20, _mm_unpacklo_epi16(_sl10, _sh10));
_sum30 = _mm_add_epi32(_sum30, _mm_unpackhi_epi16(_sl10, _sh10));

tmpptr += 4;
kptr0 += 4;
}

_mm_storeu_si128((__m128i*)outptr0, _sum00);
_mm_storeu_si128((__m128i*)(outptr0 + 4), _sum10);
_mm_storeu_si128((__m128i*)(outptr0 + 8), _sum20);
_mm_storeu_si128((__m128i*)(outptr0 + 12), _sum30);
outptr0 += 16;
}
#endif
for (; i + 1 < size; i += 2)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
const signed char* tmpptr = tmp.channel(i / 2);
#endif
const signed char* kptr0 = kernel.channel(p);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

#if __AVX2__
__m256i _sum00_12 = _mm256_setzero_si256();
#else
__m128i _sum00 = _mm_setzero_si128();
__m128i _sum10 = _mm_setzero_si128();
#endif

if (nn4 > 0)
{
#if __AVX2__
__m256i _sum10_02 = _mm256_setzero_si256();
#else
__m128i _sum01 = _mm_setzero_si128();
__m128i _sum11 = _mm_setzero_si128();
#endif

int j = 0;
for (; j < nn4; j++)
{
#if __AVX2__
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);

_val01_16 = _mm256_permute4x64_epi64(_val01_16, _MM_SHUFFLE(1, 1, 0, 0));

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);

__m256i _val10_16 = _mm256_permute4x64_epi64(_val01_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum00_12 = _mm256_dpwssd_epi32(_sum00_12, _val01_16, _w01_16);
_sum10_02 = _mm256_dpwssd_epi32(_sum10_02, _val10_16, _w01_16);
#else
_sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16));
_sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16));
#endif
#else
__m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr);
#if __SSE4_1__
_val01 = _mm_cvtepi8_epi16(_val01);
#else
__m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01);
_val01 = _mm_unpacklo_epi8(_val01, _extval01);
#endif

__m128i _val0 = _mm_shuffle_epi32(_val01, _MM_SHUFFLE(1, 0, 1, 0));
__m128i _val1 = _mm_shuffle_epi32(_val01, _MM_SHUFFLE(3, 2, 3, 2));

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
__m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);

#if __XOP__
_sum00 = _mm_maddd_epi16(_val0, _w0, _sum00);
_sum01 = _mm_maddd_epi16(_val0, _w1, _sum01);
_sum10 = _mm_maddd_epi16(_val1, _w0, _sum10);
_sum11 = _mm_maddd_epi16(_val1, _w1, _sum11);
#else
_sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00);
_sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01);
_sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10);
_sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11);
#endif
#endif

tmpptr += 8;
kptr0 += 16;
}

#if __AVX2__
_sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02);

_sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0));
#else
#if __SSSE3__
_sum00 = _mm_hadd_epi32(_sum00, _sum01);
_sum10 = _mm_hadd_epi32(_sum10, _sum11);
#else
__m128i _sum00_sh = _mm_shuffle_epi32(_sum00, 216);
__m128i _sum01_sh = _mm_shuffle_epi32(_sum01, 216);
__m128i _sum10_sh = _mm_shuffle_epi32(_sum10, 216);
__m128i _sum11_sh = _mm_shuffle_epi32(_sum11, 216);

_sum00 = _mm_unpacklo_epi64(_sum00_sh, _sum01_sh);
_sum01 = _mm_unpackhi_epi64(_sum00_sh, _sum01_sh);
_sum10 = _mm_unpacklo_epi64(_sum10_sh, _sum11_sh);
_sum11 = _mm_unpackhi_epi64(_sum10_sh, _sum11_sh);

_sum00 = _mm_add_epi32(_sum00, _sum01);
_sum10 = _mm_add_epi32(_sum10, _sum11);
#endif
#endif
}

#if __AVX2__
__m128i _sum00 = _mm256_extracti128_si256(_sum00_12, 0);
__m128i _sum10 = _mm256_extracti128_si256(_sum00_12, 1);
#endif

int j = 0;
for (; j < nn1; j++)
{
__m128i _val = _mm_set_epi16(tmpptr[1], tmpptr[1], tmpptr[1], tmpptr[1], tmpptr[0], tmpptr[0], tmpptr[0], tmpptr[0]);

__m128i _w0123 = _mm_loadl_epi64((const __m128i*)kptr0);
#if __SSE4_1__
_w0123 = _mm_cvtepi8_epi16(_w0123);
#else
__m128i _extw0123 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w0123);
_w0123 = _mm_unpacklo_epi8(_w0123, _extw0123);
#endif

_w0123 = _mm_shuffle_epi32(_w0123, _MM_SHUFFLE(1, 0, 1, 0));

__m128i _sl00 = _mm_mullo_epi16(_val, _w0123);
__m128i _sh00 = _mm_mulhi_epi16(_val, _w0123);

_sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00));
_sum10 = _mm_add_epi32(_sum10, _mm_unpackhi_epi16(_sl00, _sh00));

tmpptr += 2;
kptr0 += 4;
}

_mm_storeu_si128((__m128i*)outptr0, _sum00);
_mm_storeu_si128((__m128i*)(outptr0 + 4), _sum10);
outptr0 += 8;
}
for (; i < size; i++)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
#endif
const signed char* kptr0 = kernel.channel(p);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;

__m128i _sum0 = _mm_setzero_si128();

if (nn4 > 0)
{
__m128i _sum1 = _mm_setzero_si128();
__m128i _sum2 = _mm_setzero_si128();
__m128i _sum3 = _mm_setzero_si128();

int j = 0;
for (; j < nn4; j++)
{
__m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr);
#if __SSE4_1__
__m128i _val0 = _mm_cvtepi8_epi16(_val01);
#else
__m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01);
__m128i _val0 = _mm_unpacklo_epi8(_val01, _extval01);
#endif

_val0 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(1, 0, 1, 0));

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
__m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);

__m128i _sl00 = _mm_mullo_epi16(_val0, _w0);
__m128i _sh00 = _mm_mulhi_epi16(_val0, _w0);
__m128i _sl01 = _mm_mullo_epi16(_val0, _w1);
__m128i _sh01 = _mm_mulhi_epi16(_val0, _w1);

_sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00));
_sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00));
_sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl01, _sh01));
_sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl01, _sh01));

tmpptr += 4;
kptr0 += 16;
}

// transpose 4x4
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum0, _sum1);
_tmp1 = _mm_unpacklo_epi32(_sum2, _sum3);
_tmp2 = _mm_unpackhi_epi32(_sum0, _sum1);
_tmp3 = _mm_unpackhi_epi32(_sum2, _sum3);
_sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}

_sum0 = _mm_add_epi32(_sum0, _sum1);
_sum2 = _mm_add_epi32(_sum2, _sum3);
_sum0 = _mm_add_epi32(_sum0, _sum2);
}

int j = 0;
for (; j < nn1; j++)
{
__m128i _val = _mm_set1_epi16(tmpptr[0]);

__m128i _w0123 = _mm_loadl_epi64((const __m128i*)kptr0);
#if __SSE4_1__
_w0123 = _mm_cvtepi8_epi16(_w0123);
#else
__m128i _extw0123 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w0123);
_w0123 = _mm_unpacklo_epi8(_w0123, _extw0123);
#endif

__m128i _sl00 = _mm_mullo_epi16(_val, _w0123);
__m128i _sh00 = _mm_mulhi_epi16(_val, _w0123);

_sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00));

tmpptr += 1;
kptr0 += 4;
}

_mm_storeu_si128((__m128i*)outptr0, _sum0);
outptr0 += 4;
}
}
}

static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

// interleave
// src = maxk-inch-outch
// dst = 4a-4b-maxk-inch/4a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
if (inch >= 4)
kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u);
else
kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u);

for (int q = 0; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

int p = 0;
for (; p + 3 < inch; p += 4)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 4; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
for (; p < inch; p++)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);

g00[0] = k00[k];

g00++;
}
}
}
}
}

static void convolution_im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[stride_w];
ptr[2] = sptr[stride_w * 2];
ptr[3] = sptr[stride_w * 3];

sptr += stride_w * 4;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[stride_w];

sptr += stride_w * 2;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

+ 0
- 875
src/layer/x86/convolution_sgemm_pack8to1_int8.h View File

@@ -1,875 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
void im2col_sgemm_pack8to1_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
void im2col_sgemm_pack8to1_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
void im2col_sgemm_pack8to1_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
void im2col_sgemm_pack8to1_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif
#endif

static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx512_vnni())
{
im2col_sgemm_pack8to1_int8_sse_avx512vnni(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
if (ncnn::cpu_support_x86_avx_vnni())
{
im2col_sgemm_pack8to1_int8_sse_avxvnni(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
if (ncnn::cpu_support_x86_avx2())
{
im2col_sgemm_pack8to1_int8_sse_avx2(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
if (ncnn::cpu_support_x86_xop())
{
im2col_sgemm_pack8to1_int8_sse_xop(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif
#endif

// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
#if __AVX2__
if (size >= 4)
tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
#else
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
#endif
{
#if __AVX2__
int remain_size_start = 0;
int nn_size = size >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 4;

int64_t* tmpptr = tmp.channel(i / 4);

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
__m256i _v = _mm256_loadu_si256((const __m256i*)img0);
_mm256_storeu_si256((__m256i*)tmpptr, _v);
tmpptr += 4;
img0 += size;
}
}
}

remain_size_start += nn_size << 2;
nn_size = (size - remain_size_start) >> 1;
#else
int remain_size_start = 0;
int nn_size = (size - remain_size_start) >> 1;
#endif

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

#if __AVX2__
int64_t* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
int64_t* tmpptr = tmp.channel(i / 2);
#endif

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
__m128i _v = _mm_loadu_si128((const __m128i*)img0);
_mm_storeu_si128((__m128i*)tmpptr, _v);
tmpptr += 2;
img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
#if __AVX2__
int64_t* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
int64_t* tmpptr = tmp.channel(i / 2 + i % 2);
#endif

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr += 1;
img0 += size;
}
}
}
}

int nn_outch = 0;
int remain_outch_start = 0;

nn_outch = outch >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int pp = 0; pp < nn_outch; pp++)
{
int p = pp * 4;

int* outptr0 = top_blob.channel(p);
int* outptr1 = top_blob.channel(p + 1);
int* outptr2 = top_blob.channel(p + 2);
int* outptr3 = top_blob.channel(p + 3);

int i = 0;
#if __AVX2__
for (; i + 3 < size; i += 4)
{
const signed char* tmpptr = tmp.channel(i / 4);
const signed char* kptr0 = kernel.channel(p / 4);

int nn = inch * maxk; // inch always > 0

__m256i _sum00_11 = _mm256_setzero_si256();
__m256i _sum10_01 = _mm256_setzero_si256();
__m256i _sum02_13 = _mm256_setzero_si256();
__m256i _sum12_03 = _mm256_setzero_si256();

__m256i _sum04_15 = _mm256_setzero_si256();
__m256i _sum14_05 = _mm256_setzero_si256();
__m256i _sum06_17 = _mm256_setzero_si256();
__m256i _sum16_07 = _mm256_setzero_si256();

int j = 0;
for (; j < nn; j++)
{
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
__m256i _w23_16 = _mm256_cvtepi8_epi16(_w23);

__m256i _val10_16 = _mm256_permute4x64_epi64(_val01_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01_16, _w01_16);
_sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10_16, _w01_16);
_sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16);
_sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16);
#else
_sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16));
_sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16));
_sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16));
_sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16));
#endif

__m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16));
__m256i _val23_16 = _mm256_cvtepi8_epi16(_val23);
__m256i _val32_16 = _mm256_permute4x64_epi64(_val23_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23_16, _w01_16);
_sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32_16, _w01_16);
_sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23_16, _w23_16);
_sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32_16, _w23_16);
#else
_sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23_16, _w01_16));
_sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32_16, _w01_16));
_sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23_16, _w23_16));
_sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32_16, _w23_16));
#endif

tmpptr += 32;
kptr0 += 32;
}

// transpose 4x8
{
__m256i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01);
_tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03);
_tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01);
_tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1);
_sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1);
_sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3);
_sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3);
}
{
__m256i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05);
_tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07);
_tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05);
_tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07);
_sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1);
_sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1);
_sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3);
_sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3);
}

_sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01);
_sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13);

_sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05);
_sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07);
_sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17);

__m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0);
_sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask);
_sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask);

int sum[16];
_mm256_storeu_si256((__m256i*)sum, _sum00_11);
_mm256_storeu_si256((__m256i*)(sum + 8), _sum04_15);

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0[1] = sum[4];
outptr1[1] = sum[5];
outptr2[1] = sum[6];
outptr3[1] = sum[7];
outptr0[2] = sum[8];
outptr1[2] = sum[9];
outptr2[2] = sum[10];
outptr3[2] = sum[11];
outptr0[3] = sum[12];
outptr1[3] = sum[13];
outptr2[3] = sum[14];
outptr3[3] = sum[15];
outptr0 += 4;
outptr1 += 4;
outptr2 += 4;
outptr3 += 4;
}
#endif
for (; i + 1 < size; i += 2)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
const signed char* tmpptr = tmp.channel(i / 2);
#endif
const signed char* kptr0 = kernel.channel(p / 4);

int nn = inch * maxk; // inch always > 0

#if __AVX2__
__m256i _sum00_11 = _mm256_setzero_si256();
__m256i _sum10_01 = _mm256_setzero_si256();
__m256i _sum02_13 = _mm256_setzero_si256();
__m256i _sum12_03 = _mm256_setzero_si256();
#else
__m128i _sum00 = _mm_setzero_si128();
__m128i _sum01 = _mm_setzero_si128();
__m128i _sum02 = _mm_setzero_si128();
__m128i _sum03 = _mm_setzero_si128();
__m128i _sum10 = _mm_setzero_si128();
__m128i _sum11 = _mm_setzero_si128();
__m128i _sum12 = _mm_setzero_si128();
__m128i _sum13 = _mm_setzero_si128();
#endif

int j = 0;
for (; j < nn; j++)
{
#if __AVX2__
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
__m256i _w23_16 = _mm256_cvtepi8_epi16(_w23);

__m256i _val10_16 = _mm256_permute4x64_epi64(_val01_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01_16, _w01_16);
_sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10_16, _w01_16);
_sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16);
_sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16);
#else
_sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16));
_sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16));
_sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16));
_sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16));
#endif
#else
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01);
__m128i _val0 = _mm_unpacklo_epi8(_val01, _extval01);
__m128i _val1 = _mm_unpackhi_epi8(_val01, _extval01);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
__m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
__m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
__m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

#if __XOP__
_sum00 = _mm_maddd_epi16(_val0, _w0, _sum00);
_sum01 = _mm_maddd_epi16(_val0, _w1, _sum01);
_sum02 = _mm_maddd_epi16(_val0, _w2, _sum02);
_sum03 = _mm_maddd_epi16(_val0, _w3, _sum03);
_sum10 = _mm_maddd_epi16(_val1, _w0, _sum10);
_sum11 = _mm_maddd_epi16(_val1, _w1, _sum11);
_sum12 = _mm_maddd_epi16(_val1, _w2, _sum12);
_sum13 = _mm_maddd_epi16(_val1, _w3, _sum13);
#else
_sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00);
_sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01);
_sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02);
_sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03);
_sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10);
_sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11);
_sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12);
_sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13);
#endif
#endif

tmpptr += 16;
kptr0 += 32;
}

#if __AVX2__
// transpose 4x8
{
__m256i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01);
_tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03);
_tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01);
_tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1);
_sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1);
_sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3);
_sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3);
}

_sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01);
_sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13);

__m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0);
_sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask);

int sum[8];
_mm256_storeu_si256((__m256i*)sum, _sum00_11);
#else
// transpose 4x4
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum00, _sum01);
_tmp1 = _mm_unpacklo_epi32(_sum02, _sum03);
_tmp2 = _mm_unpackhi_epi32(_sum00, _sum01);
_tmp3 = _mm_unpackhi_epi32(_sum02, _sum03);
_sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum10, _sum11);
_tmp1 = _mm_unpacklo_epi32(_sum12, _sum13);
_tmp2 = _mm_unpackhi_epi32(_sum10, _sum11);
_tmp3 = _mm_unpackhi_epi32(_sum12, _sum13);
_sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}

_sum00 = _mm_add_epi32(_sum00, _sum01);
_sum02 = _mm_add_epi32(_sum02, _sum03);
_sum10 = _mm_add_epi32(_sum10, _sum11);
_sum12 = _mm_add_epi32(_sum12, _sum13);

_sum00 = _mm_add_epi32(_sum00, _sum02);
_sum10 = _mm_add_epi32(_sum10, _sum12);

int sum[8];
_mm_storeu_si128((__m128i*)sum, _sum00);
_mm_storeu_si128((__m128i*)(sum + 4), _sum10);
#endif

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0[1] = sum[4];
outptr1[1] = sum[5];
outptr2[1] = sum[6];
outptr3[1] = sum[7];
outptr0 += 2;
outptr1 += 2;
outptr2 += 2;
outptr3 += 2;
}
for (; i < size; i++)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
#endif
const signed char* kptr0 = kernel.channel(p / 4);

int nn = inch * maxk; // inch always > 0

#if __AVX2__
__m256i _sum0_1 = _mm256_setzero_si256();
__m256i _sum2_3 = _mm256_setzero_si256();
#else
__m128i _sum0 = _mm_setzero_si128();
__m128i _sum1 = _mm_setzero_si128();
__m128i _sum2 = _mm_setzero_si128();
__m128i _sum3 = _mm_setzero_si128();
#endif

int j = 0;
for (; j < nn; j++)
{
#if __AVX2__
__m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr);
_val = _mm_cvtepi8_epi16(_val);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
__m256i _w23_16 = _mm256_cvtepi8_epi16(_w23);

__m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1);

#if __AVXVNNI__ || __AVX512VNNI__
_sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01_16);
_sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23_16);
#else
_sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01_16));
_sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23_16));
#endif
#else
__m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr);
#if __SSE4_1__
_val = _mm_cvtepi8_epi16(_val);
#else
_val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));
#endif

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
__m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
__m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
__m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

#if __XOP__
_sum0 = _mm_maddd_epi16(_val, _w0, _sum0);
_sum1 = _mm_maddd_epi16(_val, _w1, _sum1);
_sum2 = _mm_maddd_epi16(_val, _w2, _sum2);
_sum3 = _mm_maddd_epi16(_val, _w3, _sum3);
#else
_sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0);
_sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1);
_sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2);
_sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3);
#endif
#endif

tmpptr += 8;
kptr0 += 32;
}

#if __AVX2__
__m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0);
__m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1);
__m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0);
__m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1);
#endif

// transpose 4x4
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum0, _sum1);
_tmp1 = _mm_unpacklo_epi32(_sum2, _sum3);
_tmp2 = _mm_unpackhi_epi32(_sum0, _sum1);
_tmp3 = _mm_unpackhi_epi32(_sum2, _sum3);
_sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}

_sum0 = _mm_add_epi32(_sum0, _sum1);
_sum2 = _mm_add_epi32(_sum2, _sum3);

_sum0 = _mm_add_epi32(_sum0, _sum2);

int sum[4];
_mm_storeu_si128((__m128i*)sum, _sum0);

outptr0[0] = sum[0];
outptr1[0] = sum[1];
outptr2[0] = sum[2];
outptr3[0] = sum[3];
outptr0 += 1;
outptr1 += 1;
outptr2 += 1;
outptr3 += 1;
}
}

remain_outch_start += nn_outch << 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = remain_outch_start; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
#if __AVX2__
for (; i + 3 < size; i += 4)
{
const signed char* tmpptr = tmp.channel(i / 4);
const signed char* kptr0 = kernel.channel(p / 4 + p % 4);

int nn = inch * maxk; // inch always > 0

__m256i _sum01 = _mm256_setzero_si256();
__m256i _sum23 = _mm256_setzero_si256();

int j = 0;
for (; j < nn; j++)
{
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16));
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);
__m256i _val23_16 = _mm256_cvtepi8_epi16(_val23);

__m128i _w01 = _mm_loadl_epi64((const __m128i*)kptr0);
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
_w01_16 = _mm256_permute4x64_epi64(_w01_16, _MM_SHUFFLE(1, 0, 1, 0));

#if __AVXVNNI__ || __AVX512VNNI__
_sum01 = _mm256_dpwssd_epi32(_sum01, _val01_16, _w01_16);
_sum23 = _mm256_dpwssd_epi32(_sum23, _val23_16, _w01_16);
#else
_sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01_16, _w01_16));
_sum23 = _mm256_add_epi32(_sum23, _mm256_madd_epi16(_val23_16, _w01_16));
#endif

tmpptr += 32;
kptr0 += 8;
}

__m128i _sum0 = _mm256_extracti128_si256(_sum01, 0);
__m128i _sum1 = _mm256_extracti128_si256(_sum01, 1);
__m128i _sum2 = _mm256_extracti128_si256(_sum23, 0);
__m128i _sum3 = _mm256_extracti128_si256(_sum23, 1);

outptr0[0] = _mm_reduce_add_epi32(_sum0);
outptr0[1] = _mm_reduce_add_epi32(_sum1);
outptr0[2] = _mm_reduce_add_epi32(_sum2);
outptr0[3] = _mm_reduce_add_epi32(_sum3);
outptr0 += 4;
}
#endif
for (; i + 1 < size; i += 2)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
const signed char* tmpptr = tmp.channel(i / 2);
#endif
const signed char* kptr0 = kernel.channel(p / 4 + p % 4);

int nn = inch * maxk; // inch always > 0

#if __AVX2__
__m256i _sum01 = _mm256_setzero_si256();
#else
__m128i _sum0 = _mm_setzero_si128();
__m128i _sum1 = _mm_setzero_si128();
#endif

int j = 0;
for (; j < nn; j++)
{
#if __AVX2__
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);

__m128i _w01 = _mm_loadl_epi64((const __m128i*)kptr0);
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
_w01_16 = _mm256_permute4x64_epi64(_w01_16, _MM_SHUFFLE(1, 0, 1, 0));

#if __AVXVNNI__ || __AVX512VNNI__
_sum01 = _mm256_dpwssd_epi32(_sum01, _val01_16, _w01_16);
#else
_sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01_16, _w01_16));
#endif
#else
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01);
__m128i _val0 = _mm_unpacklo_epi8(_val01, _extval01);
__m128i _val1 = _mm_unpackhi_epi8(_val01, _extval01);

__m128i _w01 = _mm_loadl_epi64((const __m128i*)kptr0);
#if __SSE4_1__
__m128i _w0 = _mm_cvtepi8_epi16(_w01);
#else
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
#endif

#if __XOP__
_sum0 = _mm_maddd_epi16(_val0, _w0, _sum0);
_sum1 = _mm_maddd_epi16(_val1, _w0, _sum1);
#else
_sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0);
_sum1 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum1);
#endif
#endif

tmpptr += 16;
kptr0 += 8;
}

#if __AVX2__
__m128i _sum0 = _mm256_extracti128_si256(_sum01, 0);
__m128i _sum1 = _mm256_extracti128_si256(_sum01, 1);
#endif

outptr0[0] = _mm_reduce_add_epi32(_sum0);
outptr0[1] = _mm_reduce_add_epi32(_sum1);
outptr0 += 2;
}
for (; i < size; i++)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
#endif
const signed char* kptr0 = kernel.channel(p / 4 + p % 4);

int nn = inch * maxk; // inch always > 0

__m128i _sum0 = _mm_setzero_si128();

int j = 0;
for (; j < nn; j++)
{
__m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr);
#if __SSE4_1__
__m128i _val0 = _mm_cvtepi8_epi16(_val01);
#else
__m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01);
__m128i _val0 = _mm_unpacklo_epi8(_val01, _extval01);
#endif

__m128i _w01 = _mm_loadl_epi64((const __m128i*)kptr0);
#if __SSE4_1__
__m128i _w0 = _mm_cvtepi8_epi16(_w01);
#else
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
#endif

#if __XOP__
_sum0 = _mm_maddd_epi16(_val0, _w0, _sum0);
#else
_sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0);
#endif

tmpptr += 8;
kptr0 += 8;
}

outptr0[0] = _mm_reduce_add_epi32(_sum0);
outptr0 += 1;
}
}
}

static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

// interleave
// src = maxk-inch-outch
// dst = 8a-4b-maxk-inch/8a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
if (outch >= 4)
kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u);
else
kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u);

int q = 0;
for (; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 8; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
// TODO unroll 2
for (; q < outch; q++)
{
signed char* g00 = kernel_tm.channel(q / 4 + q % 4);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int k = 0; k < maxk; k++)
{
for (int j = 0; j < 8; j++)
{
const signed char* k00 = kernel.channel(q).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}

static void convolution_im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
int64_t* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack8to1_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

+ 0
- 625
src/layer/x86/convolution_sgemm_pack8to4_int8.h View File

@@ -1,625 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
void im2col_sgemm_pack8to4_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
void im2col_sgemm_pack8to4_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
void im2col_sgemm_pack8to4_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
void im2col_sgemm_pack8to4_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt);
#endif
#endif

static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__)
#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__
if (ncnn::cpu_support_x86_avx512_vnni())
{
im2col_sgemm_pack8to4_int8_sse_avx512vnni(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__
if (ncnn::cpu_support_x86_avx_vnni())
{
im2col_sgemm_pack8to4_int8_sse_avxvnni(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__
if (ncnn::cpu_support_x86_avx2())
{
im2col_sgemm_pack8to4_int8_sse_avx2(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif

#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__
if (ncnn::cpu_support_x86_xop())
{
im2col_sgemm_pack8to4_int8_sse_xop(bottom_im2col, top_blob, kernel, opt);
return;
}
#endif
#endif

// Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

const int size = bottom_im2col.w;
const int maxk = bottom_im2col.h;
const int inch = bottom_im2col.c;

const int outch = top_blob.c;

// permute
Mat tmp;
#if __AVX2__
if (size >= 4)
tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
#else
if (size >= 2)
tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
else
tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
#endif
{
#if __AVX2__
int remain_size_start = 0;
int nn_size = size >> 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 4;

int64_t* tmpptr = tmp.channel(i / 4);

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
__m256i _v = _mm256_loadu_si256((const __m256i*)img0);
_mm256_storeu_si256((__m256i*)tmpptr, _v);
tmpptr += 4;
img0 += size;
}
}
}

remain_size_start += nn_size << 2;
nn_size = (size - remain_size_start) >> 1;
#else
int remain_size_start = 0;
int nn_size = size >> 1;
#endif

#pragma omp parallel for num_threads(opt.num_threads)
for (int ii = 0; ii < nn_size; ii++)
{
int i = remain_size_start + ii * 2;

#if __AVX2__
int64_t* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
int64_t* tmpptr = tmp.channel(i / 2);
#endif

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
__m128i _v = _mm_loadu_si128((const __m128i*)img0);
_mm_storeu_si128((__m128i*)tmpptr, _v);
tmpptr += 2;
img0 += size;
}
}
}

remain_size_start += nn_size << 1;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_size_start; i < size; i++)
{
#if __AVX2__
int64_t* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
int64_t* tmpptr = tmp.channel(i / 2 + i % 2);
#endif

for (int q = 0; q < inch; q++)
{
const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

for (int k = 0; k < maxk; k++)
{
tmpptr[0] = img0[0];
tmpptr += 1;
img0 += size;
}
}
}
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr0 = top_blob.channel(p);

int i = 0;
#if __AVX2__
for (; i + 3 < size; i += 4)
{
const signed char* tmpptr = tmp.channel(i / 4);
const signed char* kptr0 = kernel.channel(p);

int nn = inch * maxk; // inch always > 0

__m256i _sum00_11 = _mm256_setzero_si256();
__m256i _sum10_01 = _mm256_setzero_si256();
__m256i _sum02_13 = _mm256_setzero_si256();
__m256i _sum12_03 = _mm256_setzero_si256();

__m256i _sum04_15 = _mm256_setzero_si256();
__m256i _sum14_05 = _mm256_setzero_si256();
__m256i _sum06_17 = _mm256_setzero_si256();
__m256i _sum16_07 = _mm256_setzero_si256();

int j = 0;
for (; j < nn; j++)
{
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
__m256i _w23_16 = _mm256_cvtepi8_epi16(_w23);

__m256i _val10_16 = _mm256_permute4x64_epi64(_val01_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01_16, _w01_16);
_sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10_16, _w01_16);
_sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16);
_sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16);
#else
_sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16));
_sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16));
_sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16));
_sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16));
#endif

__m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16));
__m256i _val23_16 = _mm256_cvtepi8_epi16(_val23);
__m256i _val32_16 = _mm256_permute4x64_epi64(_val23_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23_16, _w01_16);
_sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32_16, _w01_16);
_sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23_16, _w23_16);
_sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32_16, _w23_16);
#else
_sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23_16, _w01_16));
_sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32_16, _w01_16));
_sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23_16, _w23_16));
_sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32_16, _w23_16));
#endif

tmpptr += 32;
kptr0 += 32;
}

// transpose 4x8
{
__m256i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01);
_tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03);
_tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01);
_tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1);
_sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1);
_sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3);
_sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3);
}
{
__m256i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05);
_tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07);
_tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05);
_tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07);
_sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1);
_sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1);
_sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3);
_sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3);
}

_sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01);
_sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13);

_sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05);
_sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07);
_sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17);

__m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0);
_sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask);
_sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask);

_mm256_storeu_si256((__m256i*)outptr0, _sum00_11);
_mm256_storeu_si256((__m256i*)(outptr0 + 8), _sum04_15);
outptr0 += 16;
}
#endif
for (; i + 1 < size; i += 2)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2);
#else
const signed char* tmpptr = tmp.channel(i / 2);
#endif
const signed char* kptr0 = kernel.channel(p);

int nn = inch * maxk; // inch always > 0

#if __AVX2__
__m256i _sum00_11 = _mm256_setzero_si256();
__m256i _sum10_01 = _mm256_setzero_si256();
__m256i _sum02_13 = _mm256_setzero_si256();
__m256i _sum12_03 = _mm256_setzero_si256();
#else
__m128i _sum00 = _mm_setzero_si128();
__m128i _sum01 = _mm_setzero_si128();
__m128i _sum02 = _mm_setzero_si128();
__m128i _sum03 = _mm_setzero_si128();
__m128i _sum10 = _mm_setzero_si128();
__m128i _sum11 = _mm_setzero_si128();
__m128i _sum12 = _mm_setzero_si128();
__m128i _sum13 = _mm_setzero_si128();
#endif

int j = 0;
for (; j < nn; j++)
{
#if __AVX2__
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m256i _val01_16 = _mm256_cvtepi8_epi16(_val01);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
__m256i _w23_16 = _mm256_cvtepi8_epi16(_w23);

__m256i _val10_16 = _mm256_permute4x64_epi64(_val01_16, 78);

#if __AVXVNNI__ || __AVX512VNNI__
_sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01_16, _w01_16);
_sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10_16, _w01_16);
_sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16);
_sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16);
#else
_sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16));
_sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16));
_sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16));
_sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16));
#endif
#else
__m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr);
__m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01);
__m128i _val0 = _mm_unpacklo_epi8(_val01, _extval01);
__m128i _val1 = _mm_unpackhi_epi8(_val01, _extval01);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
__m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
__m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
__m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

#if __XOP__
_sum00 = _mm_maddd_epi16(_val0, _w0, _sum00);
_sum01 = _mm_maddd_epi16(_val0, _w1, _sum01);
_sum02 = _mm_maddd_epi16(_val0, _w2, _sum02);
_sum03 = _mm_maddd_epi16(_val0, _w3, _sum03);
_sum10 = _mm_maddd_epi16(_val1, _w0, _sum10);
_sum11 = _mm_maddd_epi16(_val1, _w1, _sum11);
_sum12 = _mm_maddd_epi16(_val1, _w2, _sum12);
_sum13 = _mm_maddd_epi16(_val1, _w3, _sum13);
#else
_sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00);
_sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01);
_sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02);
_sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03);
_sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10);
_sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11);
_sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12);
_sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13);
#endif
#endif

tmpptr += 16;
kptr0 += 32;
}

#if __AVX2__
// transpose 4x8
{
__m256i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01);
_tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03);
_tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01);
_tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1);
_sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1);
_sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3);
_sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3);
}

_sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01);
_sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03);
_sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13);

__m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0);
_sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask);

_mm256_storeu_si256((__m256i*)outptr0, _sum00_11);
#else
// transpose 4x4
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum00, _sum01);
_tmp1 = _mm_unpacklo_epi32(_sum02, _sum03);
_tmp2 = _mm_unpackhi_epi32(_sum00, _sum01);
_tmp3 = _mm_unpackhi_epi32(_sum02, _sum03);
_sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum10, _sum11);
_tmp1 = _mm_unpacklo_epi32(_sum12, _sum13);
_tmp2 = _mm_unpackhi_epi32(_sum10, _sum11);
_tmp3 = _mm_unpackhi_epi32(_sum12, _sum13);
_sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}

_sum00 = _mm_add_epi32(_sum00, _sum01);
_sum02 = _mm_add_epi32(_sum02, _sum03);
_sum10 = _mm_add_epi32(_sum10, _sum11);
_sum12 = _mm_add_epi32(_sum12, _sum13);

_sum00 = _mm_add_epi32(_sum00, _sum02);
_sum10 = _mm_add_epi32(_sum10, _sum12);

_mm_storeu_si128((__m128i*)outptr0, _sum00);
_mm_storeu_si128((__m128i*)(outptr0 + 4), _sum10);
#endif
outptr0 += 8;
}
for (; i < size; i++)
{
#if __AVX2__
const signed char* tmpptr = tmp.channel(i / 4 + (i % 4) / 2 + i % 2);
#else
const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
#endif
const signed char* kptr0 = kernel.channel(p);

int nn = inch * maxk; // inch always > 0

#if __AVX2__
__m256i _sum0_1 = _mm256_setzero_si256();
__m256i _sum2_3 = _mm256_setzero_si256();
#else
__m128i _sum0 = _mm_setzero_si128();
__m128i _sum1 = _mm_setzero_si128();
__m128i _sum2 = _mm_setzero_si128();
__m128i _sum3 = _mm_setzero_si128();
#endif

int j = 0;
for (; j < nn; j++)
{
#if __AVX2__
__m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr);
_val = _mm_cvtepi8_epi16(_val);

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m256i _w01_16 = _mm256_cvtepi8_epi16(_w01);
__m256i _w23_16 = _mm256_cvtepi8_epi16(_w23);

__m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1);

#if __AVXVNNI__ || __AVX512VNNI__
_sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01_16);
_sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23_16);
#else
_sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01_16));
_sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23_16));
#endif
#else
__m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr);
#if __SSE4_1__
_val = _mm_cvtepi8_epi16(_val);
#else
_val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));
#endif

__m128i _w01 = _mm_loadu_si128((const __m128i*)kptr0);
__m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr0 + 16));
__m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
__m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
__m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
__m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
__m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
__m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

#if __XOP__
_sum0 = _mm_maddd_epi16(_val, _w0, _sum0);
_sum1 = _mm_maddd_epi16(_val, _w1, _sum1);
_sum2 = _mm_maddd_epi16(_val, _w2, _sum2);
_sum3 = _mm_maddd_epi16(_val, _w3, _sum3);
#else
_sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0);
_sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1);
_sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2);
_sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3);
#endif
#endif

tmpptr += 8;
kptr0 += 32;
}

#if __AVX2__
__m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0);
__m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1);
__m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0);
__m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1);
#endif

// transpose 4x4
{
__m128i _tmp0, _tmp1, _tmp2, _tmp3;
_tmp0 = _mm_unpacklo_epi32(_sum0, _sum1);
_tmp1 = _mm_unpacklo_epi32(_sum2, _sum3);
_tmp2 = _mm_unpackhi_epi32(_sum0, _sum1);
_tmp3 = _mm_unpackhi_epi32(_sum2, _sum3);
_sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1);
_sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1);
_sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3);
_sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3);
}

_sum0 = _mm_add_epi32(_sum0, _sum1);
_sum2 = _mm_add_epi32(_sum2, _sum3);

_sum0 = _mm_add_epi32(_sum0, _sum2);

_mm_storeu_si128((__m128i*)outptr0, _sum0);
outptr0 += 4;
}
}
}

static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
const int maxk = kernel_w * kernel_h;

// interleave
// src = maxk-inch-outch
// dst = 8a-4b-maxk-inch/8a-outch/4b
Mat kernel = _kernel.reshape(maxk, inch, outch);
kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u);

for (int q = 0; q + 3 < outch; q += 4)
{
signed char* g00 = kernel_tm.channel(q / 4);

for (int p = 0; p + 7 < inch; p += 8)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < 8; j++)
{
const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
}

static void convolution_im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = kernel_w * kernel_h;

// im2col
Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
{
const int gap = w * stride_h - outw * stride_w;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
int64_t* ptr = bottom_im2col.channel(p);

for (int u = 0; u < kernel_h; u++)
{
for (int v = 0; v < kernel_w; v++)
{
const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += stride_w;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack8to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

+ 43
- 206
src/layer/x86/convolution_x86.cpp View File

@@ -42,27 +42,18 @@ namespace ncnn {
#include "convolution_packed.h"

#if NCNN_INT8
#include "convolution_sgemm_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"

#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"
#endif // NCNN_INT8

#if __SSE2__
#include "convolution_3x3_pack1to4.h"

#if NCNN_INT8
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_1x1_pack8to4_int8.h"
#include "convolution_1x1_pack1to4_int8.h"
#include "convolution_1x1_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
#include "convolution_3x3_pack1to4_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_7x7_pack1to4_int8.h"
#endif // NCNN_INT8

#if __AVX__
@@ -1241,120 +1232,39 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
const int num_input = weight_data_size / maxk / num_output;

int elempack = 1;
int out_elempack = 1;
int out_elempack_int32 = 1;
#if __SSE2__
if (opt.use_packing_layout)
{
elempack = num_input % 8 == 0 ? 8 : 1;
out_elempack = num_output % 4 == 0 ? 4 : 1;
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
}
#endif // __SSE2__

if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
#if __SSE2__
if (elempack == 8 && out_elempack == 4)
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt);
#endif // __SSE2__
}
else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
#if __SSE2__
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt);
#endif // __SSE2__
}

if (elempack == 1 && out_elempack == 4)
else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd23_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_winograd23_data, num_input, num_output, opt);
// conv3x3s1_winograd43_transform_kernel_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt);
}

if (elempack == 8 && out_elempack == 1)
else if (opt.use_sgemm_convolution)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
}
#endif // __SSE2__

if (elempack == 1 && out_elempack == 1)
else
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && opt.use_winograd23_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
{
conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_winograd23_data, num_input, num_output, opt);
// conv3x3s1_winograd43_transform_kernel_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}

scale_in_data.create(num_output);
@@ -1442,111 +1352,38 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
if (top_blob_int32.empty())
return -100;

#if __SSE2__
if (elempack == 8 && out_elempack_int32 == 4)
int _nT = nT ? nT : opt.num_threads;
if (nT != 0 && opt.num_threads != nT)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
// force num_threads the same as in create_pipeline
// so we could use pre-packed A/B from the same tile config
NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
}

if (elempack == 1 && out_elempack_int32 == 4)
if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv7x7s2_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
#if __SSE2__
conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
#endif // __SSE2__
}

if (elempack == 8 && out_elempack_int32 == 1)
else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
}
#if __SSE2__
conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
#endif // __SSE2__
if (elempack == 1 && out_elempack_int32 == 1)
}
else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd23_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && opt.use_winograd23_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
{
conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, opt);
// conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, opt);
// conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
}
else
{
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

if (use_int8_requantize)


+ 7
- 22
src/layer/x86/convolution_x86_avx2.cpp View File

@@ -19,10 +19,7 @@
namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_im2col_gemm_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"

@@ -37,24 +34,18 @@ void convolution_packed_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const M
convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// pack1
void im2col_sgemm_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
// gemm
void convolution_im2col_gemm_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
im2col_sgemm_int8_sse(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

// pack1to4
void im2col_sgemm_pack1to4_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
void convolution_im2col_gemm_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack8to1
void im2col_sgemm_pack8to1_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to1_int8_sse(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

// winograd
void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt);
@@ -65,12 +56,6 @@ void conv3x3s1_winograd43_pack8to1_int8_sse_avx2(const Mat& bottom_blob, Mat& to
conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt);
}

// pack8to4
void im2col_sgemm_pack8to4_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt);


+ 5
- 25
src/layer/x86/convolution_x86_avx512vnni.cpp View File

@@ -19,10 +19,7 @@
namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_im2col_gemm_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"

@@ -32,24 +29,13 @@ void convolution_packed_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, c
convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// pack1
void im2col_sgemm_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
// gemm
void convolution_im2col_gemm_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
im2col_sgemm_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack1to4
void im2col_sgemm_pack1to4_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack8to1
void im2col_sgemm_pack8to1_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to1_int8_sse(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

// winograd
void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx512vnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt);
@@ -60,12 +46,6 @@ void conv3x3s1_winograd43_pack8to1_int8_sse_avx512vnni(const Mat& bottom_blob, M
conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt);
}

// pack8to4
void im2col_sgemm_pack8to4_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx512vnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt);


+ 5
- 25
src/layer/x86/convolution_x86_avxvnni.cpp View File

@@ -19,10 +19,7 @@
namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_im2col_gemm_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"

@@ -32,24 +29,13 @@ void convolution_packed_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, cons
convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// pack1
void im2col_sgemm_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
// gemm
void convolution_im2col_gemm_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
im2col_sgemm_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack1to4
void im2col_sgemm_pack1to4_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack8to1
void im2col_sgemm_pack8to1_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to1_int8_sse(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

// winograd
void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avxvnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt);
@@ -60,12 +46,6 @@ void conv3x3s1_winograd43_pack8to1_int8_sse_avxvnni(const Mat& bottom_blob, Mat&
conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt);
}

// pack8to4
void im2col_sgemm_pack8to4_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avxvnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt);


+ 5
- 25
src/layer/x86/convolution_x86_xop.cpp View File

@@ -19,10 +19,7 @@
namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_im2col_gemm_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"

@@ -32,24 +29,13 @@ void convolution_packed_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Ma
convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// pack1
void im2col_sgemm_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
// gemm
void convolution_im2col_gemm_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
im2col_sgemm_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack1to4
void im2col_sgemm_pack1to4_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack1to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

// pack8to1
void im2col_sgemm_pack8to1_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to1_int8_sse(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

// winograd
void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_xop(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt);
@@ -60,12 +46,6 @@ void conv3x3s1_winograd43_pack8to1_int8_sse_xop(const Mat& bottom_blob, Mat& top
conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt);
}

// pack8to4
void im2col_sgemm_pack8to4_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to4_int8_sse(bottom_im2col, top_blob, kernel, opt);
}

void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_xop(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
{
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt);


Loading…
Cancel
Save