Browse Source

arm optimization for convolution int8 gemm unified elempack (#5016)

tags/20231027
nihui GitHub 2 years ago
parent
commit
c8662cce5e
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 13655 additions and 13110 deletions
  1. +0
    -83
      src/layer/arm/convolution_1x1_int8.h
  2. +0
    -83
      src/layer/arm/convolution_1x1_pack1to4_int8.h
  3. +0
    -90
      src/layer/arm/convolution_1x1_pack8to1_int8.h
  4. +0
    -90
      src/layer/arm/convolution_1x1_pack8to4_int8.h
  5. +0
    -147
      src/layer/arm/convolution_3x3_pack1to4_int8.h
  6. +0
    -80
      src/layer/arm/convolution_7x7_pack1to4_int8.h
  7. +67
    -214
      src/layer/arm/convolution_arm.cpp
  8. +6
    -42
      src/layer/arm/convolution_arm_asimddp.cpp
  9. +6
    -42
      src/layer/arm/convolution_arm_i8mm.cpp
  10. +13541
    -0
      src/layer/arm/convolution_im2col_gemm_int8.h
  11. +0
    -4514
      src/layer/arm/convolution_sgemm_int8.h
  12. +0
    -3360
      src/layer/arm/convolution_sgemm_pack1to4_int8.h
  13. +0
    -2530
      src/layer/arm/convolution_sgemm_pack8to1_int8.h
  14. +0
    -1806
      src/layer/arm/convolution_sgemm_pack8to4_int8.h
  15. +35
    -29
      tests/test_convolution_3.cpp

+ 0
- 83
src/layer/arm/convolution_1x1_int8.h View File

@@ -1,83 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
outptr[0] = r0[0];
outptr[1] = r0[2];
outptr[2] = r0[4];
outptr[3] = r0[6];

r0 += 8;
outptr += 4;
}
for (; j + 1 < outw; j += 2)
{
outptr[0] = r0[0];
outptr[1] = r0[2];

r0 += 4;
outptr += 2;
}
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 83
src/layer/arm/convolution_1x1_pack1to4_int8.h View File

@@ -1,83 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = w - 2 * outw + w;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
outptr[0] = r0[0];
outptr[1] = r0[2];
outptr[2] = r0[4];
outptr[3] = r0[6];

r0 += 8;
outptr += 4;
}
for (; j + 1 < outw; j += 2)
{
outptr[0] = r0[0];
outptr[1] = r0[2];

r0 += 4;
outptr += 2;
}
for (; j < outw; j++)
{
outptr[0] = r0[0];

r0 += 2;
outptr += 1;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 90
src/layer/arm/convolution_1x1_pack8to1_int8.h View File

@@ -1,90 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = (w - 2 * outw + w) * 8;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
int8x8_t _v0 = vld1_s8(r0);
int8x8_t _v1 = vld1_s8(r0 + 16);
int8x8_t _v2 = vld1_s8(r0 + 32);
int8x8_t _v3 = vld1_s8(r0 + 48);
vst1_s8(outptr, _v0);
vst1_s8(outptr + 8, _v1);
vst1_s8(outptr + 16, _v2);
vst1_s8(outptr + 24, _v3);

r0 += 64;
outptr += 32;
}
for (; j + 1 < outw; j += 2)
{
int8x8_t _v0 = vld1_s8(r0);
int8x8_t _v1 = vld1_s8(r0 + 16);
vst1_s8(outptr, _v0);
vst1_s8(outptr + 8, _v1);

r0 += 32;
outptr += 16;
}
for (; j < outw; j++)
{
int8x8_t _v = vld1_s8(r0);
vst1_s8(outptr, _v);

r0 += 16;
outptr += 8;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 90
src/layer/arm/convolution_1x1_pack8to4_int8.h View File

@@ -1,90 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
const int size = w * h;

Mat bottom_im2col = bottom_blob;
bottom_im2col.w = size;
bottom_im2col.h = 1;

im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;
size_t elemsize = bottom_blob.elemsize;
int elempack = bottom_blob.elempack;

int outw = top_blob.w;
int outh = top_blob.h;

const int tailstep = (w - 2 * outw + w) * 8;

Mat bottom_blob_shrinked;
bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < channels; p++)
{
const signed char* r0 = bottom_blob.channel(p);
signed char* outptr = bottom_blob_shrinked.channel(p);

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
int8x8_t _v0 = vld1_s8(r0);
int8x8_t _v1 = vld1_s8(r0 + 16);
int8x8_t _v2 = vld1_s8(r0 + 32);
int8x8_t _v3 = vld1_s8(r0 + 48);
vst1_s8(outptr, _v0);
vst1_s8(outptr + 8, _v1);
vst1_s8(outptr + 16, _v2);
vst1_s8(outptr + 24, _v3);

r0 += 64;
outptr += 32;
}
for (; j + 1 < outw; j += 2)
{
int8x8_t _v0 = vld1_s8(r0);
int8x8_t _v1 = vld1_s8(r0 + 16);
vst1_s8(outptr, _v0);
vst1_s8(outptr + 8, _v1);

r0 += 32;
outptr += 16;
}
for (; j < outw; j++)
{
int8x8_t _v = vld1_s8(r0);
vst1_s8(outptr, _v);

r0 += 16;
outptr += 8;
}

r0 += tailstep;
}
}

conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
}

+ 0
- 147
src/layer/arm/convolution_3x3_pack1to4_int8.h View File

@@ -1,147 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv3x3s1_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = 9;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w - outw;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < 3; u++)
{
for (int v = 0; v < 3; v++)
{
const signed char* sptr = img.row<const signed char>(u) + v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[1];
ptr[2] = sptr[2];
ptr[3] = sptr[3];

sptr += 4;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[1];

sptr += 2;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += 1;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

static void conv3x3s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = 9;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * 2 - outw * 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < 3; u++)
{
for (int v = 0; v < 3; v++)
{
const signed char* sptr = img.row<const signed char>(u) + v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];
ptr[2] = sptr[4];
ptr[3] = sptr[6];

sptr += 8;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];

sptr += 4;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += 2;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

+ 0
- 80
src/layer/arm/convolution_7x7_pack1to4_int8.h View File

@@ -1,80 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv7x7s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
int w = bottom_blob.w;
int inch = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
const int size = outw * outh;

const int maxk = 49;

// im2col
Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);
{
const int gap = w * 2 - outw * 2;

#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < inch; p++)
{
const Mat img = bottom_blob.channel(p);
signed char* ptr = bottom_im2col.channel(p);

for (int u = 0; u < 7; u++)
{
for (int v = 0; v < 7; v++)
{
const signed char* sptr = img.row<const signed char>(u) + v;

for (int i = 0; i < outh; i++)
{
int j = 0;
for (; j + 3 < outw; j += 4)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];
ptr[2] = sptr[4];
ptr[3] = sptr[6];

sptr += 8;
ptr += 4;
}
for (; j + 1 < outw; j += 2)
{
ptr[0] = sptr[0];
ptr[1] = sptr[2];

sptr += 4;
ptr += 2;
}
for (; j < outw; j++)
{
ptr[0] = sptr[0];

sptr += 2;
ptr += 1;
}

sptr += gap;
}
}
}
}
}

im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

+ 67
- 214
src/layer/arm/convolution_arm.cpp View File

@@ -48,10 +48,10 @@ namespace ncnn {
#endif // NCNN_BF16

#if NCNN_INT8
#include "convolution_sgemm_int8.h"
#include "convolution_im2col_gemm_int8.h"

#include "convolution_winograd_transform_int8.h"
#include "convolution_winograd_dot_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"
#include "convolution_int8.h"
#endif // NCNN_INT8
@@ -74,19 +74,11 @@ namespace ncnn {
#include "convolution_pack8to4_int8.h"
#include "convolution_pack1to4_int8.h"
#include "convolution_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_winograd_transform_pack4_int8.h"
#include "convolution_winograd_transform_pack8_int8.h"
#include "convolution_winograd_dot_pack8to4_int8.h"
#include "convolution_winograd_dot_pack8to1_int8.h"
#include "convolution_1x1_pack8to4_int8.h"
#include "convolution_1x1_pack1to4_int8.h"
#include "convolution_1x1_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
#include "convolution_3x3_pack1to4_int8.h"
#include "convolution_7x7_pack1to4_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#endif // NCNN_INT8
#endif // __ARM_NEON
@@ -1303,121 +1295,41 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
}
#endif // __ARM_NEON

#if __ARM_NEON
if (elempack == 8 && out_elempack == 4)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
#if NCNN_ARM82DOT
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
if (elempack == 8 && out_elempack == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
#else
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
if (elempack == 8 && out_elempack == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
#endif
{
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
{
#if __ARM_NEON
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
#endif // __ARM_NEON
}

if (elempack == 1 && out_elempack == 4)
else if (elempack == 8 && out_elempack == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
#if __ARM_NEON
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
#endif // __ARM_NEON
}

if (elempack == 8 && out_elempack == 1)
else if (elempack == 1 && out_elempack == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_transform_kernel_pack8to1_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
}
#endif // __ARM_NEON

if (elempack == 1 && out_elempack == 1)
else if (opt.use_sgemm_convolution)
{
if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt);
}

/* if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output);
}
else */

if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else
{
weight_data_tm = weight_data;
}
convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
}
else if (elempack == 1 && out_elempack == 1)
{
// if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
// {
// conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output);
// }
weight_data_tm = weight_data;
}
else
{
convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}

scale_in_data.create(num_output);
@@ -1515,114 +1427,55 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
if (top_blob_int32.empty())
return -100;

#if __ARM_NEON
if (elempack == 8 && out_elempack_int32 == 4)
int _nT = nT ? nT : opt.num_threads;
if (nT != 0 && opt.num_threads != nT)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
// force num_threads the same as in create_pipeline
// so we could use pre-packed A/B from the same tile config
NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
}

#if NCNN_ARM82DOT
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256)))
#else
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
#endif
{
conv3x3s1_winograd43_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
{
#if __ARM_NEON
conv3x3s1_winograd43_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
#endif // __ARM_NEON
}

if (elempack == 1 && out_elempack_int32 == 4)
else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv3x3s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv7x7s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
#if __ARM_NEON
conv3x3s1_winograd43_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
#endif // __ARM_NEON
}

if (elempack == 8 && out_elempack_int32 == 1)
else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
#endif // __ARM_NEON

if (elempack == 1 && out_elempack_int32 == 1)
else if (opt.use_sgemm_convolution)
{
if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
{
conv1x1s2_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
}
else if (opt.use_sgemm_convolution)
{
convolution_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else
{
convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
}
#if __ARM_NEON
else if (elempack == 8 && out_elempack_int32 == 4)
{
convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else if (elempack == 1 && out_elempack_int32 == 4)
{
convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else if (elempack == 8 && out_elempack_int32 == 1)
{
convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
#endif // __ARM_NEON
else // if (elempack == 1 && out_elempack_int32 == 1)
{
convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

if (use_int8_requantize)


+ 6
- 42
src/layer/arm/convolution_arm_asimddp.cpp View File

@@ -17,53 +17,17 @@

namespace ncnn {

#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_im2col_gemm_int8.h"

// pack1
void im2col_sgemm_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
// gemm
void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

void convolution_im2col_sgemm_transform_kernel_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// pack1to4
void im2col_sgemm_pack1to4_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// pack8to1
void im2col_sgemm_pack8to1_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// pack8to4
void im2col_sgemm_pack8to4_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

} // namespace ncnn

+ 6
- 42
src/layer/arm/convolution_arm_i8mm.cpp View File

@@ -17,53 +17,17 @@

namespace ncnn {

#include "convolution_sgemm_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_im2col_gemm_int8.h"

// pack1
void im2col_sgemm_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
// gemm
void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt);
convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

void convolution_im2col_sgemm_transform_kernel_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// pack1to4
void im2col_sgemm_pack1to4_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// pack8to1
void im2col_sgemm_pack8to1_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// pack8to4
void im2col_sgemm_pack8to4_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
}

void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

} // namespace ncnn

+ 13541
- 0
src/layer/arm/convolution_im2col_gemm_int8.h
File diff suppressed because it is too large
View File


+ 0
- 4514
src/layer/arm/convolution_sgemm_int8.h
File diff suppressed because it is too large
View File


+ 0
- 3360
src/layer/arm/convolution_sgemm_pack1to4_int8.h
File diff suppressed because it is too large
View File


+ 0
- 2530
src/layer/arm/convolution_sgemm_pack8to1_int8.h
File diff suppressed because it is too large
View File


+ 0
- 1806
src/layer/arm/convolution_sgemm_pack8to4_int8.h
File diff suppressed because it is too large
View File


+ 35
- 29
tests/test_convolution_3.cpp View File

@@ -310,7 +310,13 @@ static int test_convolution_1()
|| test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0)
|| test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1)
|| test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0)
|| test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0);
|| test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0)
|| test_convolution_int8(5, 6, 31, 9, 5, 1, 1, 0, 1)
|| test_convolution_int8(5, 7, 32, 8, 5, 1, 2, 0, 1)
|| test_convolution_int8(16, 10, 31, 32, 2, 1, 3, 0, 0)
|| test_convolution_int8(5, 10, 5, 32, 3, 2, 1, 0, 1)
|| test_convolution_int8(3, 9, 16, 13, 2, 2, 1, 0, 0)
|| test_convolution_int8(33, 5, 15, 5, 2, 1, 3, 0, 1);
}

static int test_convolution_1_2()
@@ -326,65 +332,65 @@ static int test_convolution_1_2()
|| test_convolution_int8(19, 17, 32, 1, 3, 2, 2, 0, 0)

|| test_convolution_int8(19, 17, 1, 2, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 2, 2, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 2, 2, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 7, 2, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 8, 2, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 8, 2, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 15, 2, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 16, 2, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 16, 2, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 31, 2, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 32, 2, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 32, 2, 5, 2, 3, 0, 0)

|| test_convolution_int8(19, 17, 1, 7, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 1, 7, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 2, 7, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 7, 7, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 7, 7, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 8, 7, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 15, 7, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 15, 7, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 16, 7, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 31, 7, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 31, 7, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 32, 7, 5, 2, 2, 0, 0)

|| test_convolution_int8(19, 17, 1, 8, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 2, 8, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 2, 8, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 7, 8, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 8, 8, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 8, 8, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 15, 8, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 16, 8, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 16, 8, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 31, 8, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 32, 8, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 32, 8, 5, 2, 3, 0, 0)

|| test_convolution_int8(19, 17, 1, 15, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 1, 15, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 2, 15, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 7, 15, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 7, 15, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 8, 15, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 15, 15, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 15, 15, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 16, 15, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 31, 15, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 31, 15, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 32, 15, 5, 2, 2, 0, 0)

|| test_convolution_int8(19, 17, 1, 16, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 2, 16, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 1, 16, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 2, 16, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 7, 16, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 8, 16, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 15, 16, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 16, 16, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 31, 16, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 32, 16, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 31, 16, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 32, 16, 5, 2, 3, 0, 0)

|| test_convolution_int8(19, 17, 1, 31, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 2, 31, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 7, 31, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 2, 31, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 7, 31, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 8, 31, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 15, 31, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 16, 31, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 31, 31, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 16, 31, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 31, 31, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 32, 31, 5, 2, 2, 0, 0)

|| test_convolution_int8(19, 17, 1, 32, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 2, 32, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 7, 32, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 8, 32, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 15, 32, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 16, 32, 5, 2, 2, 0, 0)
|| test_convolution_int8(19, 17, 7, 32, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 8, 32, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 15, 32, 5, 2, 3, 0, 1)
|| test_convolution_int8(19, 17, 16, 32, 5, 2, 3, 0, 0)
|| test_convolution_int8(19, 17, 31, 32, 5, 2, 2, 0, 1)
|| test_convolution_int8(19, 17, 32, 32, 5, 2, 2, 0, 0);
}


Loading…
Cancel
Save