Browse Source

arm optimization for convolution int8 packed unified elempack (#5147)

tags/20240102
nihui GitHub 2 years ago
parent
commit
4136de3b8d
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 1623 additions and 439 deletions
  1. +5
    -81
      src/layer/arm/convolution_arm.cpp
  2. +12
    -0
      src/layer/arm/convolution_arm_asimddp.cpp
  3. +12
    -0
      src/layer/arm/convolution_arm_i8mm.cpp
  4. +0
    -82
      src/layer/arm/convolution_int8.h
  5. +0
    -82
      src/layer/arm/convolution_pack1to4_int8.h
  6. +0
    -94
      src/layer/arm/convolution_pack8to1_int8.h
  7. +0
    -100
      src/layer/arm/convolution_pack8to4_int8.h
  8. +1573
    -0
      src/layer/arm/convolution_packed_int8.h
  9. +21
    -0
      tests/test_convolution_3.cpp

+ 5
- 81
src/layer/arm/convolution_arm.cpp View File

@@ -48,11 +48,11 @@ namespace ncnn {
#endif // NCNN_BF16

#if NCNN_INT8
#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"
#include "convolution_3x3_winograd_int8.h"

// #include "convolution_3x3_int8.h"
#include "convolution_int8.h"
#endif // NCNN_INT8

#if __ARM_NEON
@@ -68,12 +68,6 @@ namespace ncnn {
#include "convolution_5x5_pack4_bf16s.h"
#include "convolution_7x7_pack1to4_bf16s.h"
#endif // NCNN_BF16

#if NCNN_INT8
#include "convolution_pack8to4_int8.h"
#include "convolution_pack1to4_int8.h"
#include "convolution_pack8to1_int8.h"
#endif // NCNN_INT8
#endif // __ARM_NEON

Convolution_arm::Convolution_arm()
@@ -1238,41 +1232,6 @@ int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const
#endif // NCNN_BF16

#if NCNN_INT8
static void convolution_transform_kernel_packed_int8_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
const int maxk = kernel_w * kernel_h;

// src = kw-kh-inch-outch
// dst = pa-pb-kw-kh-inch/pa-outch/pb
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
signed char* g00 = weight_data_tm.channel(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < out_elempack; i++)
{
for (int j = 0; j < elempack; j++)
{
const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
}
}
int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
@@ -1286,16 +1245,6 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
}
#endif

int elempack = 1;
int out_elempack = 1;
#if __ARM_NEON
if (opt.use_packing_layout)
{
elempack = num_input % 8 == 0 ? 8 : 1;
out_elempack = num_output % 4 == 0 ? 4 : 1;
}
#endif // __ARM_NEON

if (opt.use_winograd_convolution && prefer_winograd)
{
if (opt.use_winograd43_convolution)
@@ -1307,13 +1256,9 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
{
convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
}
else if (elempack == 1 && out_elempack == 1)
{
weight_data_tm = weight_data;
}
else
{
convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}

scale_in_data.create(num_output);
@@ -1404,14 +1349,7 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
#if __ARM_NEON
if (opt.use_packing_layout)
{
if ((opt.use_winograd_convolution && prefer_winograd) || opt.use_sgemm_convolution)
{
out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
}
else
{
out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
}
out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
}
#endif // __ARM_NEON

@@ -1439,23 +1377,9 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
{
convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
}
#if __ARM_NEON
else if (elempack == 8 && out_elempack_int32 == 4)
{
convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else if (elempack == 1 && out_elempack_int32 == 4)
{
convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
else if (elempack == 8 && out_elempack_int32 == 1)
{
convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}
#endif // __ARM_NEON
else // if (elempack == 1 && out_elempack_int32 == 1)
else
{
convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

bottom_blob_bordered.release();


+ 12
- 0
src/layer/arm/convolution_arm_asimddp.cpp View File

@@ -17,8 +17,20 @@

namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"

// packed
// Externally visible entry point for the packed int8 weight transform.
// Forwards every argument unchanged to convolution_transform_kernel_packed_int8().
// NOTE(review): presumably that function comes from convolution_packed_int8.h and
// this file is built with the asimddp extension enabled so the forwarded code is
// compiled for that ISA - confirm against the build configuration.
void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// Externally visible entry point for the packed int8 convolution.
// Forwards every argument unchanged to convolution_packed_int8(); the result is
// whatever that function writes into top_blob.
// NOTE(review): presumably that function comes from convolution_packed_int8.h and
// this file is built with the asimddp extension enabled so the forwarded code is
// compiled for that ISA - confirm against the build configuration.
void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// gemm
void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{


+ 12
- 0
src/layer/arm/convolution_arm_i8mm.cpp View File

@@ -17,8 +17,20 @@

namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"

// packed
// Externally visible entry point for the packed int8 weight transform.
// Forwards every argument unchanged to convolution_transform_kernel_packed_int8().
// NOTE(review): presumably that function comes from convolution_packed_int8.h and
// this file is built with the i8mm extension enabled so the forwarded code is
// compiled for that ISA - confirm against the build configuration.
void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// Externally visible entry point for the packed int8 convolution.
// Forwards every argument unchanged to convolution_packed_int8(); the result is
// whatever that function writes into top_blob.
// NOTE(review): presumably that function comes from convolution_packed_int8.h and
// this file is built with the i8mm extension enabled so the forwarded code is
// compiled for that ISA - confirm against the build configuration.
void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// gemm
void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{


+ 0
- 82
src/layer/arm/convolution_int8.h View File

@@ -1,82 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int sum = 0;

// const signed char* kptr = weight_data_int8.channel(p);
const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

for (int k = 0; k < maxk; k++)
{
signed char val = sptr[space_ofs[k]];
signed char w = kptr[k];
sum += val * w;
}

kptr += maxk;
}

outptr[j] = sum;
}

outptr += outw;
}
}
}

+ 0
- 82
src/layer/arm/convolution_pack1to4_int8.h View File

@@ -1,82 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int32x4_t _sum0 = vdupq_n_s32(0);

const signed char* kptr = weight_data_int8.channel(p);

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

for (int k = 0; k < maxk; k++)
{
int8x8_t _val = vdup_n_s8(sptr[space_ofs[k]]);
int8x8_t _w = vld1_s8(kptr);
int16x8_t _s0 = vmull_s8(_val, _w);
_sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));

kptr += 4;
}
}

vst1q_s32(outptr + j * 4, _sum0);
}

outptr += outw * 4;
}
}
}

+ 0
- 94
src/layer/arm/convolution_pack8to1_int8.h View File

@@ -1,94 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int32x4_t _sum0 = vdupq_n_s32(0);
int32x4_t _sum1 = vdupq_n_s32(0);

const signed char* kptr = weight_data_int8.channel(p);

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

for (int k = 0; k < maxk; k++)
{
int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8);
int8x8_t _w = vld1_s8(kptr);
int16x8_t _s8 = vmull_s8(_val, _w);

_sum0 = vaddw_s16(_sum0, vget_low_s16(_s8));
_sum1 = vaddw_s16(_sum1, vget_high_s16(_s8));

kptr += 8;
}
}

int32x4_t _sum = vaddq_s32(_sum0, _sum1);
#if __aarch64__
int sum = vaddvq_s32(_sum); // dot
#else
int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum));
_ss = vpadd_s32(_ss, _ss);
int sum = vget_lane_s32(_ss, 0);
#endif

outptr[j] = sum;
}

outptr += outw;
}
}
}

+ 0
- 100
src/layer/arm/convolution_pack8to4_int8.h View File

@@ -1,100 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
int w = bottom_blob.w;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int maxk = kernel_w * kernel_h;

// kernel offsets
std::vector<int> _space_ofs(maxk);
int* space_ofs = &_space_ofs[0];
{
int p1 = 0;
int p2 = 0;
int gap = w * dilation_h - kernel_w * dilation_w;
for (int i = 0; i < kernel_h; i++)
{
for (int j = 0; j < kernel_w; j++)
{
space_ofs[p1] = p2;
p1++;
p2 += dilation_w;
}
p2 += gap;
}
}

// num_output
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outch; p++)
{
int* outptr = top_blob.channel(p);

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
int32x4_t _sum01 = vdupq_n_s32(0);
int32x4_t _sum23 = vdupq_n_s32(0);

const signed char* kptr = weight_data_int8.channel(p);

// channels
for (int q = 0; q < channels; q++)
{
const Mat m = bottom_blob.channel(q);
const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

for (int k = 0; k < maxk; k++)
{
int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8);

int8x8_t _w0 = vld1_s8(kptr);
int8x8_t _w1 = vld1_s8(kptr + 8);
int8x8_t _w2 = vld1_s8(kptr + 16);
int8x8_t _w3 = vld1_s8(kptr + 24);

int16x8_t _wv0 = vmull_s8(_val, _w0);
int16x8_t _wv1 = vmull_s8(_val, _w1);
int16x8_t _wv2 = vmull_s8(_val, _w2);
int16x8_t _wv3 = vmull_s8(_val, _w3);

int16x4_t _wv00 = vpadd_s16(vget_low_s16(_wv0), vget_high_s16(_wv0));
int16x4_t _wv11 = vpadd_s16(vget_low_s16(_wv1), vget_high_s16(_wv1));
int16x4_t _wv22 = vpadd_s16(vget_low_s16(_wv2), vget_high_s16(_wv2));
int16x4_t _wv33 = vpadd_s16(vget_low_s16(_wv3), vget_high_s16(_wv3));

_sum01 = vpadalq_s16(_sum01, vcombine_s16(_wv00, _wv11));
_sum23 = vpadalq_s16(_sum23, vcombine_s16(_wv22, _wv33));

kptr += 32;
}
}

int32x4_t _sum0 = vcombine_s32(vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)), vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)));

vst1q_s32(outptr + j * 4, _sum0);
}

outptr += outw * 4;
}
}
}

+ 1573
- 0
src/layer/arm/convolution_packed_int8.h
File diff suppressed because it is too large
View File


+ 21
- 0
tests/test_convolution_3.cpp View File

@@ -214,6 +214,27 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
}
}

{
ncnn::Option opt;
opt.num_threads = 1;
opt.use_packing_layout = false;
opt.use_fp16_packed = false;
opt.use_fp16_storage = false;
opt.use_fp16_arithmetic = false;
opt.use_bf16_storage = false;
opt.use_shader_pack8 = false;
opt.use_image_storage = false;
opt.use_sgemm_convolution = false;
opt.use_winograd_convolution = false;

ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
if (ret != 0)
{
fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
return ret;
}
}

{
ncnn::Option opt;
opt.num_threads = 1;


Loading…
Cancel
Save