| @@ -1,83 +0,0 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = w - 2 * outw + w; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const signed char* r0 = bottom_blob.channel(p); | |||
| signed char* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| outptr[2] = r0[4]; | |||
| outptr[3] = r0[6]; | |||
| r0 += 8; | |||
| outptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| r0 += 4; | |||
| outptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| r0 += 2; | |||
| outptr += 1; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -1,83 +0,0 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = w - 2 * outw + w; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const signed char* r0 = bottom_blob.channel(p); | |||
| signed char* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| outptr[2] = r0[4]; | |||
| outptr[3] = r0[6]; | |||
| r0 += 8; | |||
| outptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| outptr[1] = r0[2]; | |||
| r0 += 4; | |||
| outptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| outptr[0] = r0[0]; | |||
| r0 += 2; | |||
| outptr += 1; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -1,90 +0,0 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_pack8to1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = (w - 2 * outw + w) * 8; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const signed char* r0 = bottom_blob.channel(p); | |||
| signed char* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| int8x8_t _v0 = vld1_s8(r0); | |||
| int8x8_t _v1 = vld1_s8(r0 + 16); | |||
| int8x8_t _v2 = vld1_s8(r0 + 32); | |||
| int8x8_t _v3 = vld1_s8(r0 + 48); | |||
| vst1_s8(outptr, _v0); | |||
| vst1_s8(outptr + 8, _v1); | |||
| vst1_s8(outptr + 16, _v2); | |||
| vst1_s8(outptr + 24, _v3); | |||
| r0 += 64; | |||
| outptr += 32; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| int8x8_t _v0 = vld1_s8(r0); | |||
| int8x8_t _v1 = vld1_s8(r0 + 16); | |||
| vst1_s8(outptr, _v0); | |||
| vst1_s8(outptr + 8, _v1); | |||
| r0 += 32; | |||
| outptr += 16; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| int8x8_t _v = vld1_s8(r0); | |||
| vst1_s8(outptr, _v); | |||
| r0 += 16; | |||
| outptr += 8; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -1,90 +0,0 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| const int size = w * h; | |||
| Mat bottom_im2col = bottom_blob; | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int tailstep = (w - 2 * outw + w) * 8; | |||
| Mat bottom_blob_shrinked; | |||
| bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < channels; p++) | |||
| { | |||
| const signed char* r0 = bottom_blob.channel(p); | |||
| signed char* outptr = bottom_blob_shrinked.channel(p); | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| int8x8_t _v0 = vld1_s8(r0); | |||
| int8x8_t _v1 = vld1_s8(r0 + 16); | |||
| int8x8_t _v2 = vld1_s8(r0 + 32); | |||
| int8x8_t _v3 = vld1_s8(r0 + 48); | |||
| vst1_s8(outptr, _v0); | |||
| vst1_s8(outptr + 8, _v1); | |||
| vst1_s8(outptr + 16, _v2); | |||
| vst1_s8(outptr + 24, _v3); | |||
| r0 += 64; | |||
| outptr += 32; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| int8x8_t _v0 = vld1_s8(r0); | |||
| int8x8_t _v1 = vld1_s8(r0 + 16); | |||
| vst1_s8(outptr, _v0); | |||
| vst1_s8(outptr + 8, _v1); | |||
| r0 += 32; | |||
| outptr += 16; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| int8x8_t _v = vld1_s8(r0); | |||
| vst1_s8(outptr, _v); | |||
| r0 += 16; | |||
| outptr += 8; | |||
| } | |||
| r0 += tailstep; | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -1,147 +0,0 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv3x3s1_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = 9; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); | |||
| { | |||
| const int gap = w - outw; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| signed char* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < 3; u++) | |||
| { | |||
| for (int v = 0; v < 3; v++) | |||
| { | |||
| const signed char* sptr = img.row<const signed char>(u) + v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[1]; | |||
| ptr[2] = sptr[2]; | |||
| ptr[3] = sptr[3]; | |||
| sptr += 4; | |||
| ptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[1]; | |||
| sptr += 2; | |||
| ptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += 1; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv3x3s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = 9; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); | |||
| { | |||
| const int gap = w * 2 - outw * 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| signed char* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < 3; u++) | |||
| { | |||
| for (int v = 0; v < 3; v++) | |||
| { | |||
| const signed char* sptr = img.row<const signed char>(u) + v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[2]; | |||
| ptr[2] = sptr[4]; | |||
| ptr[3] = sptr[6]; | |||
| sptr += 8; | |||
| ptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[2]; | |||
| sptr += 4; | |||
| ptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += 2; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -1,80 +0,0 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv7x7s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| const int size = outw * outh; | |||
| const int maxk = 49; | |||
| // im2col | |||
| Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); | |||
| { | |||
| const int gap = w * 2 - outw * 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < inch; p++) | |||
| { | |||
| const Mat img = bottom_blob.channel(p); | |||
| signed char* ptr = bottom_im2col.channel(p); | |||
| for (int u = 0; u < 7; u++) | |||
| { | |||
| for (int v = 0; v < 7; v++) | |||
| { | |||
| const signed char* sptr = img.row<const signed char>(u) + v; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int j = 0; | |||
| for (; j + 3 < outw; j += 4) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[2]; | |||
| ptr[2] = sptr[4]; | |||
| ptr[3] = sptr[6]; | |||
| sptr += 8; | |||
| ptr += 4; | |||
| } | |||
| for (; j + 1 < outw; j += 2) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| ptr[1] = sptr[2]; | |||
| sptr += 4; | |||
| ptr += 2; | |||
| } | |||
| for (; j < outw; j++) | |||
| { | |||
| ptr[0] = sptr[0]; | |||
| sptr += 2; | |||
| ptr += 1; | |||
| } | |||
| sptr += gap; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -48,10 +48,10 @@ namespace ncnn { | |||
| #endif // NCNN_BF16 | |||
| #if NCNN_INT8 | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_im2col_gemm_int8.h" | |||
| #include "convolution_winograd_transform_int8.h" | |||
| #include "convolution_winograd_dot_int8.h" | |||
| #include "convolution_1x1_int8.h" | |||
| #include "convolution_3x3_int8.h" | |||
| #include "convolution_int8.h" | |||
| #endif // NCNN_INT8 | |||
| @@ -74,19 +74,11 @@ namespace ncnn { | |||
| #include "convolution_pack8to4_int8.h" | |||
| #include "convolution_pack1to4_int8.h" | |||
| #include "convolution_pack8to1_int8.h" | |||
| #include "convolution_sgemm_pack8to4_int8.h" | |||
| #include "convolution_sgemm_pack1to4_int8.h" | |||
| #include "convolution_sgemm_pack8to1_int8.h" | |||
| #include "convolution_winograd_transform_pack4_int8.h" | |||
| #include "convolution_winograd_transform_pack8_int8.h" | |||
| #include "convolution_winograd_dot_pack8to4_int8.h" | |||
| #include "convolution_winograd_dot_pack8to1_int8.h" | |||
| #include "convolution_1x1_pack8to4_int8.h" | |||
| #include "convolution_1x1_pack1to4_int8.h" | |||
| #include "convolution_1x1_pack8to1_int8.h" | |||
| #include "convolution_3x3_pack8to4_int8.h" | |||
| #include "convolution_3x3_pack1to4_int8.h" | |||
| #include "convolution_7x7_pack1to4_int8.h" | |||
| #include "convolution_3x3_pack8to1_int8.h" | |||
| #endif // NCNN_INT8 | |||
| #endif // __ARM_NEON | |||
| @@ -1303,121 +1295,41 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt) | |||
| } | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| #if NCNN_ARM82DOT | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256))) | |||
| if (elempack == 8 && out_elempack == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256))) | |||
| #else | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| if (elempack == 8 && out_elempack == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| #endif | |||
| { | |||
| conv3x3s1_winograd43_transform_kernel_pack8to4_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| { | |||
| #if __ARM_NEON | |||
| conv3x3s1_winograd43_transform_kernel_pack8to4_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt); | |||
| #endif // __ARM_NEON | |||
| } | |||
| if (elempack == 1 && out_elempack == 4) | |||
| else if (elempack == 8 && out_elempack == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| #if __ARM_NEON | |||
| conv3x3s1_winograd43_transform_kernel_pack8to1_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt); | |||
| #endif // __ARM_NEON | |||
| } | |||
| if (elempack == 8 && out_elempack == 1) | |||
| else if (elempack == 1 && out_elempack == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd43_transform_kernel_pack8to1_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt); | |||
| } | |||
| #endif // __ARM_NEON | |||
| if (elempack == 1 && out_elempack == 1) | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd43_transform_kernel_int8_neon(weight_data, weight_winograd43_data, num_input, num_output, opt); | |||
| } | |||
| /* if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output); | |||
| } | |||
| else */ | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_neon(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else | |||
| { | |||
| weight_data_tm = weight_data; | |||
| } | |||
| convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); | |||
| } | |||
| else if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| // if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| // { | |||
| // conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_data_int8, num_input, num_output); | |||
| // } | |||
| weight_data_tm = weight_data; | |||
| } | |||
| else | |||
| { | |||
| convolution_transform_kernel_packed_int8_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); | |||
| } | |||
| scale_in_data.create(num_output); | |||
| @@ -1515,114 +1427,55 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| #if __ARM_NEON | |||
| if (elempack == 8 && out_elempack_int32 == 4) | |||
| int _nT = nT ? nT : opt.num_threads; | |||
| if (nT != 0 && opt.num_threads != nT) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| // force num_threads the same as in create_pipeline | |||
| // so we could use pre-packed A/B from the same tile config | |||
| NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); | |||
| } | |||
| #if NCNN_ARM82DOT | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256))) | |||
| if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (!ncnn::cpu_support_arm_asimddp() || (ncnn::cpu_support_arm_asimddp() && num_input >= 256 && num_output >= 256))) | |||
| #else | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| #endif | |||
| { | |||
| conv3x3s1_winograd43_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| { | |||
| #if __ARM_NEON | |||
| conv3x3s1_winograd43_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); | |||
| #endif // __ARM_NEON | |||
| } | |||
| if (elempack == 1 && out_elempack_int32 == 4) | |||
| else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv7x7s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| #if __ARM_NEON | |||
| conv3x3s1_winograd43_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); | |||
| #endif // __ARM_NEON | |||
| } | |||
| if (elempack == 8 && out_elempack_int32 == 1) | |||
| else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd43_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); | |||
| } | |||
| #endif // __ARM_NEON | |||
| if (elempack == 1 && out_elempack_int32 == 1) | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv1x1s2_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); | |||
| } | |||
| else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_winograd43_int8_neon(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); | |||
| } | |||
| #if __ARM_NEON | |||
| else if (elempack == 8 && out_elempack_int32 == 4) | |||
| { | |||
| convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else if (elempack == 1 && out_elempack_int32 == 4) | |||
| { | |||
| convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else if (elempack == 8 && out_elempack_int32 == 1) | |||
| { | |||
| convolution_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| #endif // __ARM_NEON | |||
| else // if (elempack == 1 && out_elempack_int32 == 1) | |||
| { | |||
| convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| if (use_int8_requantize) | |||
| @@ -17,53 +17,17 @@ | |||
| namespace ncnn { | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_sgemm_pack1to4_int8.h" | |||
| #include "convolution_sgemm_pack8to1_int8.h" | |||
| #include "convolution_sgemm_pack8to4_int8.h" | |||
| #include "convolution_im2col_gemm_int8.h" | |||
| // pack1 | |||
| void im2col_sgemm_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| // gemm | |||
| void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt) | |||
| { | |||
| im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| void convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| } | |||
| // pack1to4 | |||
| void im2col_sgemm_pack1to4_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| } | |||
| // pack8to1 | |||
| void im2col_sgemm_pack8to1_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| } | |||
| // pack8to4 | |||
| void im2col_sgemm_pack8to4_int8_neon_asimddp(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| } | |||
| } // namespace ncnn | |||
| @@ -17,53 +17,17 @@ | |||
| namespace ncnn { | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_sgemm_pack1to4_int8.h" | |||
| #include "convolution_sgemm_pack8to1_int8.h" | |||
| #include "convolution_sgemm_pack8to4_int8.h" | |||
| #include "convolution_im2col_gemm_int8.h" | |||
| // pack1 | |||
| void im2col_sgemm_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| // gemm | |||
| void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt) | |||
| { | |||
| im2col_sgemm_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| void convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| } | |||
| // pack1to4 | |||
| void im2col_sgemm_pack1to4_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| } | |||
| // pack8to1 | |||
| void im2col_sgemm_pack8to1_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| im2col_sgemm_pack8to1_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to1_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| } | |||
| // pack8to4 | |||
| void im2col_sgemm_pack8to4_int8_neon_i8mm(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); | |||
| convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt); | |||
| } | |||
| } // namespace ncnn | |||
| @@ -310,7 +310,13 @@ static int test_convolution_1() | |||
| || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0) | |||
| || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1) | |||
| || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0) | |||
| || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0); | |||
| || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0) | |||
| || test_convolution_int8(5, 6, 31, 9, 5, 1, 1, 0, 1) | |||
| || test_convolution_int8(5, 7, 32, 8, 5, 1, 2, 0, 1) | |||
| || test_convolution_int8(16, 10, 31, 32, 2, 1, 3, 0, 0) | |||
| || test_convolution_int8(5, 10, 5, 32, 3, 2, 1, 0, 1) | |||
| || test_convolution_int8(3, 9, 16, 13, 2, 2, 1, 0, 0) | |||
| || test_convolution_int8(33, 5, 15, 5, 2, 1, 3, 0, 1); | |||
| } | |||
| static int test_convolution_1_2() | |||
| @@ -326,65 +332,65 @@ static int test_convolution_1_2() | |||
| || test_convolution_int8(19, 17, 32, 1, 3, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 2, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 2, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 2, 2, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 2, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 2, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 8, 2, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 2, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 2, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 16, 2, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 2, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 2, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 32, 2, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 7, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 1, 7, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 7, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 7, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 7, 7, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 7, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 7, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 15, 7, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 7, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 7, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 31, 7, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 7, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 8, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 8, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 2, 8, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 8, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 8, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 8, 8, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 8, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 8, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 16, 8, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 8, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 8, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 32, 8, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 15, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 1, 15, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 15, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 15, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 7, 15, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 15, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 15, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 15, 15, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 15, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 15, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 31, 15, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 15, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 16, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 16, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 16, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 16, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 16, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 16, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 16, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 16, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 16, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 16, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 16, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 16, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 31, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 31, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 31, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 31, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 31, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 31, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 31, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 31, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 31, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 31, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 31, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 31, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 1, 32, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 2, 32, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 32, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 32, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 32, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 32, 5, 2, 2, 0, 0) | |||
| || test_convolution_int8(19, 17, 7, 32, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 8, 32, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 15, 32, 5, 2, 3, 0, 1) | |||
| || test_convolution_int8(19, 17, 16, 32, 5, 2, 3, 0, 0) | |||
| || test_convolution_int8(19, 17, 31, 32, 5, 2, 2, 0, 1) | |||
| || test_convolution_int8(19, 17, 32, 32, 5, 2, 2, 0, 0); | |||
| } | |||