From baf49574c4ea3333cf2b9772094a919e605fdf2b Mon Sep 17 00:00:00 2001
From: tpoisonooo <khj.application@aliyun.com>
Date: Tue, 5 Jan 2021 23:29:14 +0800
Subject: [PATCH] innerproduct aarch64 use gemm (#2521)

* perf(innerproduct-arm): add aarch64 gemm

* fix(innerproduct): fix compilation errror

* fix(armv7-innerproduct): fix armv7 compilation error

* fix(innerproduct): fix gemm param

* fix(int8): update mock scales and fix runtime error

* fix(compilation): fix compilation error
---
 src/layer/arm/gemm_symm_int8.h     |   2 +-
 src/layer/arm/innerproduct_arm.cpp | 108 ++++++++++++++++++++++++++++-
 src/layer/arm/innerproduct_arm.h   |   9 +++
 src/layer/innerproduct.cpp         |   5 +-
 tests/test_convolution.cpp         |  13 ++--
 tests/test_innerproduct.cpp        |  14 ++--
 tests/testutil.h                   |  25 +++++++
 7 files changed, 161 insertions(+), 15 deletions(-)
diff --git a/src/layer/arm/gemm_symm_int8.h b/src/layer/arm/gemm_symm_int8.h
index 6154ae253..912ce0b04 100644
--- a/src/layer/arm/gemm_symm_int8.h
+++ b/src/layer/arm/gemm_symm_int8.h
@@ -2667,7 +2667,7 @@ static void int8kernel_m4(void* dst, int8_t* sa, int8_t* sb, int, int k, int n,
 #undef DECOMPOSE_K
 #undef DECOMPOSE_N
 
-static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const Option& opt)
+static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const ncnn::Option& opt)
 {
     int8_t* pa = (int8_t*)sa;
     int8_t* pb = (int8_t*)sb;
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
index caab529a0..ac151db86 100644
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -19,6 +19,9 @@
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
+#if __aarch64__
+#include "gemm_symm_int8.h"
+#endif
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "neon_mathfun_fp16s.h"
 #endif
@@ -45,7 +48,7 @@ InnerProduct_arm::InnerProduct_arm()
 int InnerProduct_arm::create_pipeline(const Option& opt)
 {
 #if __ARM_NEON
-    if (opt.use_packing_layout)
+    if (opt.use_packing_layout or opt.use_int8_inference)
     {
         flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
 
@@ -69,6 +72,11 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
         return create_pipeline_bf16s(opt);
     }
 
+    if (opt.use_int8_inference)
+    {
+        return create_pipeline_int8(opt);
+    }
+
     return 0;
 }
 
@@ -84,12 +92,108 @@ int InnerProduct_arm::destroy_pipeline(const Option& opt)
     return 0;
 }
 
+int InnerProduct_arm::create_pipeline_int8(const Option& opt)
+{
+    // convert fp32 to int8
+    if (weight_data_int8_scales.empty())
+    {
+        return 0;
+    }
+#if __aarch64__
+    // first reorder Matrix A before MatMul
+    const int n = num_output;
+    const int k = weight_data.total() / n;
+    weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator);
+
+    int8_t* b = weight_data;
+    int8_t* sb = weight_data_int8;
+    reorder_a(b, sb, n, k, k);
+
+    // pre-built scales
+    scales_in.create(num_output, 4u, opt.blob_allocator);
+    for (int i = 0; i < num_output; ++i)
+    {
+        if (std::fabs(static_cast<float>(weight_data_int8_scales[i])) <= 1e-6)
+        {
+            scales_in[i] = 0.f;
+        }
+        else
+        {
+            scales_in[i] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[i]);
+        }
+    }
+#endif
+    return 0;
+}
+
+int InnerProduct_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+#if __aarch64__
+    Mat bottom_blob_tm = bottom_blob;
+    if (bottom_blob.elemsize != 1)
+    {
+        quantize_float32_to_int8(bottom_blob, bottom_blob_tm, bottom_blob_int8_scale, opt);
+    }
+
+    Mat bottom_blob_tm_flattened = bottom_blob_tm;
+    if (bottom_blob_tm.dims != 1)
+    {
+        Option opt_flatten = opt;
+        opt_flatten.blob_allocator = opt.workspace_allocator;
+        flatten->forward(bottom_blob_tm, bottom_blob_tm_flattened, opt_flatten);
+    }
+
+    top_blob.create(num_output, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+    {
+        return -100;
+    }
+
+    const int w = bottom_blob_tm.w;
+    const int h = bottom_blob_tm.h;
+
+    const int m = 1;
+    const int k = bottom_blob_tm.c * w * h;
+    Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator);
+    {
+        reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k);
+    }
+
+    Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator);
+    int32_t* pc = top_blob_tm;
+    const int8_t* pa = bottom_blob_reorder;
+    const int8_t* pb = weight_data_int8;
+    int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt);
+
+    float* outptr = top_blob;
+
+    // dequant.fused.relu int32_t to float
+    for (int p = 0; p < num_output; ++p)
+    {
+        float sumfp32 = pc[p] * scales_in[p];
+        if (bias_term)
+        {
+            sumfp32 += bias_data[p];
+        }
+        if (1 == activation_type)
+        {
+            sumfp32 = std::max(0.f, sumfp32);
+        }
+
+        outptr[p] = sumfp32;
+    }
+    return 0;
+#else
+    return InnerProduct::forward(bottom_blob, top_blob, opt);
+#endif
+}
+
 int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
     {
         // TODO
-        return InnerProduct::forward(bottom_blob, top_blob, opt);
+        return forward_int8(bottom_blob, top_blob, opt);
     }
 
     int elembits = bottom_blob.elembits();
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
index b22b3e0fb..cd62e9344 100644
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -16,6 +16,8 @@
 #define LAYER_INNERPRODUCT_ARM_H
 
 #include "innerproduct.h"
+#include <cmath>
+#include <cstdlib>
 
 namespace ncnn {
 
@@ -38,6 +40,9 @@ protected:
     int create_pipeline_bf16s(const Option& opt);
     int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    int create_pipeline_int8(const Option& opt);
+    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
 public:
     ncnn::Layer* flatten;
 
@@ -47,6 +52,10 @@ public:
 
     // bf16
     Mat weight_data_bf16;
+
+    // int8
+    Mat weight_data_int8;
+    Mat scales_in;
 };
 
 } // namespace ncnn
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
index b287a2aa7..c931ada7a 100644
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -94,7 +94,7 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
 {
     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
     {
-        return forward_int8(bottom_blob, top_blob, opt);
+        return InnerProduct::forward_int8(bottom_blob, top_blob, opt);
     }
 
     int w = bottom_blob.w;
@@ -189,10 +189,11 @@ int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti
 
         int sum = 0;
 
+        int offset = size * channels * p;
         // channels
         for (int q = 0; q < channels; q++)
         {
-            const signed char* w = (const signed char*)weight_data + size * channels * p + size * q;
+            const signed char* w = (const signed char*)weight_data + offset + size * q;
             const signed char* m = bottom_blob_tm.channel(q);
 
             for (int i = 0; i < size; i++)
diff --git a/tests/test_convolution.cpp b/tests/test_convolution.cpp
index c5da23372..ce85a3872 100644
--- a/tests/test_convolution.cpp
+++ b/tests/test_convolution.cpp
@@ -187,19 +187,22 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
 
     std::vector<ncnn::Mat> weights(bias ? 4 : 3);
     weights[0] = RandomMat(outch * c * kernel * kernel);
+
+    ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel);
+    ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep);
     if (bias)
     {
         weights[1] = RandomMat(outch);
-        weights[2] = RandomMat(outch);
-        weights[3] = RandomMat(1);
+        weights[2] = weight_scales;
+        weights[3] = input_scales;
     }
     else
     {
-        weights[1] = RandomMat(outch);
-        weights[2] = RandomMat(1);
+        weights[1] = weight_scales;
+        weights[2] = input_scales;
     }
 
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 0.001f, requant ? set_param : 0);
+    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 1.0f, requant ? set_param : 0);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant);
diff --git a/tests/test_innerproduct.cpp b/tests/test_innerproduct.cpp
index e4004acaf..20ea11315 100644
--- a/tests/test_innerproduct.cpp
+++ b/tests/test_innerproduct.cpp
@@ -95,17 +95,21 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
     pd.set(8, 1); // int8_scale_term
 
     std::vector<ncnn::Mat> weights(bias ? 4 : 3);
-    weights[0] = RandomMat(outch * a.w * a.h * a.c);
+    const int k = a.w * a.h * a.c;
+    weights[0] = RandomMat(outch * k);
+    ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k);
+    ncnn::Mat input_scales = scales_mat(a, 1, k, k);
+
     if (bias)
     {
         weights[1] = RandomMat(outch);
-        weights[2] = RandomMat(outch);
-        weights[3] = RandomMat(1);
+        weights[2] = weight_scales;
+        weights[3] = input_scales;
     }
     else
     {
-        weights[1] = RandomMat(outch);
-        weights[2] = RandomMat(1);
+        weights[1] = weight_scales;
+        weights[2] = input_scales;
     }
 
     int ret = test_layer<ncnn::InnerProduct>("InnerProduct", pd, weights, a);
diff --git a/tests/testutil.h b/tests/testutil.h
index 96d9146c1..dadcd1764 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -22,6 +22,7 @@
 
 #include <math.h>
 #include <stdio.h>
+#include <stdlib.h>
 
 #if NCNN_VULKAN
 #include "command.h"
@@ -69,6 +70,30 @@ static ncnn::Mat RandomMat(int w, int h, int c)
     return m;
 }
 
+static ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx)
+{
+    ncnn::Mat weight_scales(m);
+    for (int i = 0; i < m; ++i)
+    {
+        float min = mat[0], _max = mat[0];
+        const float* ptr = (const float*)(mat.data) + i * ldx;
+        for (int j = 0; j < k; ++j)
+        {
+            if (min > ptr[j])
+            {
+                min = ptr[j];
+            }
+            if (_max < ptr[j])
+            {
+                _max = ptr[j];
+            }
+        }
+        const float abs_min = abs(min), abs_max = abs(_max);
+        weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max);
+    }
+    return weight_scales;
+}
+
 static bool NearlyEqual(float a, float b, float epsilon)
 {
     if (a == b)