innerproduct aarch64 use gemm (#2521)

* perf(innerproduct-arm): add aarch64 gemm * fix(innerproduct): fix compilation errror * fix(armv7-innerproduct): fix armv7 compilation error * fix(innerproduct): fix gemm param * fix(int8): update mock scales and fix runtime error * fix(compilation): fix compilation error
5 years ago · baf49574c4
--- a/src/layer/arm/gemm_symm_int8.h
+++ b/src/layer/arm/gemm_symm_int8.h
@@ -2667,7 +2667,7 @@ static void int8kernel_m4(void* dst, int8_t* sa, int8_t* sb, int, int k, int n,
 #undef DECOMPOSE_K
 #undef DECOMPOSE_N

 static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const Option& opt)
 static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const ncnn::Option& opt)
 {
    int8_t* pa = (int8_t*)sa;
    int8_t* pb = (int8_t*)sb;
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -19,6 +19,9 @@
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
 #if __aarch64__
 #include "gemm_symm_int8.h"
 #endif
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include "neon_mathfun_fp16s.h"
 #endif
@@ -45,7 +48,7 @@ InnerProduct_arm::InnerProduct_arm()
 int InnerProduct_arm::create_pipeline(const Option& opt)
 {
 #if __ARM_NEON
    if (opt.use_packing_layout)
    if (opt.use_packing_layout or opt.use_int8_inference)
    {
        flatten = ncnn::create_layer(ncnn::LayerType::Flatten);

@@ -69,6 +72,11 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
        return create_pipeline_bf16s(opt);
    }

    if (opt.use_int8_inference)
    {
        return create_pipeline_int8(opt);
    }

    return 0;
 }

@@ -84,12 +92,108 @@ int InnerProduct_arm::destroy_pipeline(const Option& opt)
    return 0;
 }

 int InnerProduct_arm::create_pipeline_int8(const Option& opt)
 {
    // convert fp32 to int8
    if (weight_data_int8_scales.empty())
    {
        return 0;
    }
 #if __aarch64__
    // first reorder Matrix A before MatMul
    const int n = num_output;
    const int k = weight_data.total() / n;
    weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator);

    int8_t* b = weight_data;
    int8_t* sb = weight_data_int8;
    reorder_a(b, sb, n, k, k);

    // pre-built scales
    scales_in.create(num_output, 4u, opt.blob_allocator);
    for (int i = 0; i < num_output; ++i)
    {
        if (std::fabs(static_cast<float>(weight_data_int8_scales[i])) <= 1e-6)
        {
            scales_in[i] = 0.f;
        }
        else
        {
            scales_in[i] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[i]);
        }
    }
 #endif
    return 0;
 }

 int InnerProduct_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
 #if __aarch64__
    Mat bottom_blob_tm = bottom_blob;
    if (bottom_blob.elemsize != 1)
    {
        quantize_float32_to_int8(bottom_blob, bottom_blob_tm, bottom_blob_int8_scale, opt);
    }

    Mat bottom_blob_tm_flattened = bottom_blob_tm;
    if (bottom_blob_tm.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;
        flatten->forward(bottom_blob_tm, bottom_blob_tm_flattened, opt_flatten);
    }

    top_blob.create(num_output, 4u, opt.blob_allocator);
    if (top_blob.empty())
    {
        return -100;
    }

    const int w = bottom_blob_tm.w;
    const int h = bottom_blob_tm.h;

    const int m = 1;
    const int k = bottom_blob_tm.c * w * h;
    Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator);
    {
        reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k);
    }

    Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator);
    int32_t* pc = top_blob_tm;
    const int8_t* pa = bottom_blob_reorder;
    const int8_t* pb = weight_data_int8;
    int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt);

    float* outptr = top_blob;

    // dequant.fused.relu int32_t to float
    for (int p = 0; p < num_output; ++p)
    {
        float sumfp32 = pc[p] * scales_in[p];
        if (bias_term)
        {
            sumfp32 += bias_data[p];
        }
        if (1 == activation_type)
        {
            sumfp32 = std::max(0.f, sumfp32);
        }

        outptr[p] = sumfp32;
    }
    return 0;
 #else
    return InnerProduct::forward(bottom_blob, top_blob, opt);
 #endif
 }

 int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        // TODO
        return InnerProduct::forward(bottom_blob, top_blob, opt);
        return forward_int8(bottom_blob, top_blob, opt);
    }

    int elembits = bottom_blob.elembits();
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -16,6 +16,8 @@
 #define LAYER_INNERPRODUCT_ARM_H

 #include "innerproduct.h"
 #include <cmath>
 #include <cstdlib>

 namespace ncnn {

@@ -38,6 +40,9 @@ protected:
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    int create_pipeline_int8(const Option& opt);
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    ncnn::Layer* flatten;

@@ -47,6 +52,10 @@ public:

    // bf16
    Mat weight_data_bf16;

    // int8
    Mat weight_data_int8;
    Mat scales_in;
 };

 } // namespace ncnn
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -94,7 +94,7 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
 {
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
        return InnerProduct::forward_int8(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
@@ -189,10 +189,11 @@ int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti

        int sum = 0;

        int offset = size * channels * p;
        // channels
        for (int q = 0; q < channels; q++)
        {
            const signed char* w = (const signed char*)weight_data + size * channels * p + size * q;
            const signed char* w = (const signed char*)weight_data + offset + size * q;
            const signed char* m = bottom_blob_tm.channel(q);

            for (int i = 0; i < size; i++)
--- a/tests/test_convolution.cpp
+++ b/tests/test_convolution.cpp
@@ -187,19 +187,22 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int

    std::vector<ncnn::Mat> weights(bias ? 4 : 3);
    weights[0] = RandomMat(outch * c * kernel * kernel);

    ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel);
    ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep);
    if (bias)
    {
        weights[1] = RandomMat(outch);
        weights[2] = RandomMat(outch);
        weights[3] = RandomMat(1);
        weights[2] = weight_scales;
        weights[3] = input_scales;
    }
    else
    {
        weights[1] = RandomMat(outch);
        weights[2] = RandomMat(1);
        weights[1] = weight_scales;
        weights[2] = input_scales;
    }

    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 0.001f, requant ? set_param : 0);
    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 1.0f, requant ? set_param : 0);
    if (ret != 0)
    {
        fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant);
--- a/tests/test_innerproduct.cpp
+++ b/tests/test_innerproduct.cpp
@@ -95,17 +95,21 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
    pd.set(8, 1); // int8_scale_term

    std::vector<ncnn::Mat> weights(bias ? 4 : 3);
    weights[0] = RandomMat(outch * a.w * a.h * a.c);
    const int k = a.w * a.h * a.c;
    weights[0] = RandomMat(outch * k);
    ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k);
    ncnn::Mat input_scales = scales_mat(a, 1, k, k);

    if (bias)
    {
        weights[1] = RandomMat(outch);
        weights[2] = RandomMat(outch);
        weights[3] = RandomMat(1);
        weights[2] = weight_scales;
        weights[3] = input_scales;
    }
    else
    {
        weights[1] = RandomMat(outch);
        weights[2] = RandomMat(1);
        weights[1] = weight_scales;
        weights[2] = input_scales;
    }

    int ret = test_layer<ncnn::InnerProduct>("InnerProduct", pd, weights, a);
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -22,6 +22,7 @@

 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>

 #if NCNN_VULKAN
 #include "command.h"
@@ -69,6 +70,30 @@ static ncnn::Mat RandomMat(int w, int h, int c)
    return m;
 }

 static ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx)
 {
    ncnn::Mat weight_scales(m);
    for (int i = 0; i < m; ++i)
    {
        float min = mat[0], _max = mat[0];
        const float* ptr = (const float*)(mat.data) + i * ldx;
        for (int j = 0; j < k; ++j)
        {
            if (min > ptr[j])
            {
                min = ptr[j];
            }
            if (_max < ptr[j])
            {
                _max = ptr[j];
            }
        }
        const float abs_min = abs(min), abs_max = abs(_max);
        weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max);
    }
    return weight_scales;
 }

 static bool NearlyEqual(float a, float b, float epsilon)
 {
    if (a == b)