From baf49574c4ea3333cf2b9772094a919e605fdf2b Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Tue, 5 Jan 2021 23:29:14 +0800 Subject: [PATCH] innerproduct aarch64 use gemm (#2521) * perf(innerproduct-arm): add aarch64 gemm * fix(innerproduct): fix compilation errror * fix(armv7-innerproduct): fix armv7 compilation error * fix(innerproduct): fix gemm param * fix(int8): update mock scales and fix runtime error * fix(compilation): fix compilation error --- src/layer/arm/gemm_symm_int8.h | 2 +- src/layer/arm/innerproduct_arm.cpp | 108 ++++++++++++++++++++++++++++- src/layer/arm/innerproduct_arm.h | 9 +++ src/layer/innerproduct.cpp | 5 +- tests/test_convolution.cpp | 13 ++-- tests/test_innerproduct.cpp | 14 ++-- tests/testutil.h | 25 +++++++ 7 files changed, 161 insertions(+), 15 deletions(-) diff --git a/src/layer/arm/gemm_symm_int8.h b/src/layer/arm/gemm_symm_int8.h index 6154ae253..912ce0b04 100644 --- a/src/layer/arm/gemm_symm_int8.h +++ b/src/layer/arm/gemm_symm_int8.h @@ -2667,7 +2667,7 @@ static void int8kernel_m4(void* dst, int8_t* sa, int8_t* sb, int, int k, int n, #undef DECOMPOSE_K #undef DECOMPOSE_N -static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const Option& opt) +static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const ncnn::Option& opt) { int8_t* pa = (int8_t*)sa; int8_t* pb = (int8_t*)sb; diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index caab529a0..ac151db86 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -19,6 +19,9 @@ #if __ARM_NEON #include #include "neon_mathfun.h" +#if __aarch64__ +#include "gemm_symm_int8.h" +#endif #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include "neon_mathfun_fp16s.h" #endif @@ -45,7 +48,7 @@ InnerProduct_arm::InnerProduct_arm() int InnerProduct_arm::create_pipeline(const Option& opt) { #if __ARM_NEON - if (opt.use_packing_layout) + if (opt.use_packing_layout or opt.use_int8_inference) { flatten = ncnn::create_layer(ncnn::LayerType::Flatten); @@ -69,6 +72,11 @@ int InnerProduct_arm::create_pipeline(const Option& opt) return create_pipeline_bf16s(opt); } + if (opt.use_int8_inference) + { + return create_pipeline_int8(opt); + } + return 0; } @@ -84,12 +92,108 @@ int InnerProduct_arm::destroy_pipeline(const Option& opt) return 0; } +int InnerProduct_arm::create_pipeline_int8(const Option& opt) +{ + // convert fp32 to int8 + if (weight_data_int8_scales.empty()) + { + return 0; + } +#if __aarch64__ + // first reorder Matrix A before MatMul + const int n = num_output; + const int k = weight_data.total() / n; + weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator); + + int8_t* b = weight_data; + int8_t* sb = weight_data_int8; + reorder_a(b, sb, n, k, k); + + // pre-built scales + scales_in.create(num_output, 4u, opt.blob_allocator); + for (int i = 0; i < num_output; ++i) + { + if (std::fabs(static_cast(weight_data_int8_scales[i])) <= 1e-6) + { + scales_in[i] = 0.f; + } + else + { + scales_in[i] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[i]); + } + } +#endif + return 0; +} + +int InnerProduct_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __aarch64__ + Mat bottom_blob_tm = bottom_blob; + if (bottom_blob.elemsize != 1) + { + quantize_float32_to_int8(bottom_blob, bottom_blob_tm, bottom_blob_int8_scale, opt); + } + + Mat bottom_blob_tm_flattened = bottom_blob_tm; + if (bottom_blob_tm.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_tm, bottom_blob_tm_flattened, opt_flatten); + } + + top_blob.create(num_output, 4u, opt.blob_allocator); + if (top_blob.empty()) + { + return -100; + } + + const int w = bottom_blob_tm.w; + const int h = bottom_blob_tm.h; + + const int m = 1; + const int k = bottom_blob_tm.c * w * h; + Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator); + { + reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k); + } + + Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator); + int32_t* pc = top_blob_tm; + const int8_t* pa = bottom_blob_reorder; + const int8_t* pb = weight_data_int8; + int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt); + + float* outptr = top_blob; + + // dequant.fused.relu int32_t to float + for (int p = 0; p < num_output; ++p) + { + float sumfp32 = pc[p] * scales_in[p]; + if (bias_term) + { + sumfp32 += bias_data[p]; + } + if (1 == activation_type) + { + sumfp32 = std::max(0.f, sumfp32); + } + + outptr[p] = sumfp32; + } + return 0; +#else + return InnerProduct::forward(bottom_blob, top_blob, opt); +#endif +} + int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO - return InnerProduct::forward(bottom_blob, top_blob, opt); + return forward_int8(bottom_blob, top_blob, opt); } int elembits = bottom_blob.elembits(); diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h index b22b3e0fb..cd62e9344 100644 --- a/src/layer/arm/innerproduct_arm.h +++ b/src/layer/arm/innerproduct_arm.h @@ -16,6 +16,8 @@ #define LAYER_INNERPRODUCT_ARM_H #include "innerproduct.h" +#include +#include namespace ncnn { @@ -38,6 +40,9 @@ protected: int create_pipeline_bf16s(const Option& opt); int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + public: ncnn::Layer* flatten; @@ -47,6 +52,10 @@ public: // bf16 Mat weight_data_bf16; + + // int8 + Mat weight_data_int8; + Mat scales_in; }; } // namespace ncnn diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp index b287a2aa7..c931ada7a 100644 --- a/src/layer/innerproduct.cpp +++ b/src/layer/innerproduct.cpp @@ -94,7 +94,7 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o { if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - return forward_int8(bottom_blob, top_blob, opt); + return InnerProduct::forward_int8(bottom_blob, top_blob, opt); } int w = bottom_blob.w; @@ -189,10 +189,11 @@ int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti int sum = 0; + int offset = size * channels * p; // channels for (int q = 0; q < channels; q++) { - const signed char* w = (const signed char*)weight_data + size * channels * p + size * q; + const signed char* w = (const signed char*)weight_data + offset + size * q; const signed char* m = bottom_blob_tm.channel(q); for (int i = 0; i < size; i++) diff --git a/tests/test_convolution.cpp b/tests/test_convolution.cpp index c5da23372..ce85a3872 100644 --- a/tests/test_convolution.cpp +++ b/tests/test_convolution.cpp @@ -187,19 +187,22 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int std::vector weights(bias ? 4 : 3); weights[0] = RandomMat(outch * c * kernel * kernel); + + ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); + ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); if (bias) { weights[1] = RandomMat(outch); - weights[2] = RandomMat(outch); - weights[3] = RandomMat(1); + weights[2] = weight_scales; + weights[3] = input_scales; } else { - weights[1] = RandomMat(outch); - weights[2] = RandomMat(1); + weights[1] = weight_scales; + weights[2] = input_scales; } - int ret = test_layer("Convolution", pd, weights, a, 0.001f, requant ? set_param : 0); + int ret = test_layer("Convolution", pd, weights, a, 1.0f, requant ? set_param : 0); if (ret != 0) { fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant); diff --git a/tests/test_innerproduct.cpp b/tests/test_innerproduct.cpp index e4004acaf..20ea11315 100644 --- a/tests/test_innerproduct.cpp +++ b/tests/test_innerproduct.cpp @@ -95,17 +95,21 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias) pd.set(8, 1); // int8_scale_term std::vector weights(bias ? 4 : 3); - weights[0] = RandomMat(outch * a.w * a.h * a.c); + const int k = a.w * a.h * a.c; + weights[0] = RandomMat(outch * k); + ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k); + ncnn::Mat input_scales = scales_mat(a, 1, k, k); + if (bias) { weights[1] = RandomMat(outch); - weights[2] = RandomMat(outch); - weights[3] = RandomMat(1); + weights[2] = weight_scales; + weights[3] = input_scales; } else { - weights[1] = RandomMat(outch); - weights[2] = RandomMat(1); + weights[1] = weight_scales; + weights[2] = input_scales; } int ret = test_layer("InnerProduct", pd, weights, a); diff --git a/tests/testutil.h b/tests/testutil.h index 96d9146c1..dadcd1764 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -22,6 +22,7 @@ #include #include +#include #if NCNN_VULKAN #include "command.h" @@ -69,6 +70,30 @@ static ncnn::Mat RandomMat(int w, int h, int c) return m; } +static ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx) +{ + ncnn::Mat weight_scales(m); + for (int i = 0; i < m; ++i) + { + float min = mat[0], _max = mat[0]; + const float* ptr = (const float*)(mat.data) + i * ldx; + for (int j = 0; j < k; ++j) + { + if (min > ptr[j]) + { + min = ptr[j]; + } + if (_max < ptr[j]) + { + _max = ptr[j]; + } + } + const float abs_min = abs(min), abs_max = abs(_max); + weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max); + } + return weight_scales; +} + static bool NearlyEqual(float a, float b, float epsilon) { if (a == b)