Browse Source

innerproduct aarch64 use gemm (#2521)

* perf(innerproduct-arm): add aarch64 gemm

* fix(innerproduct): fix compilation errror

* fix(armv7-innerproduct): fix armv7 compilation error

* fix(innerproduct): fix gemm param

* fix(int8): update mock scales and fix runtime error

* fix(compilation): fix compilation error
tags/20210124
tpoisonooo GitHub 5 years ago
parent
commit
baf49574c4
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 161 additions and 15 deletions
  1. +1
    -1
      src/layer/arm/gemm_symm_int8.h
  2. +106
    -2
      src/layer/arm/innerproduct_arm.cpp
  3. +9
    -0
      src/layer/arm/innerproduct_arm.h
  4. +3
    -2
      src/layer/innerproduct.cpp
  5. +8
    -5
      tests/test_convolution.cpp
  6. +9
    -5
      tests/test_innerproduct.cpp
  7. +25
    -0
      tests/testutil.h

+ 1
- 1
src/layer/arm/gemm_symm_int8.h View File

@@ -2667,7 +2667,7 @@ static void int8kernel_m4(void* dst, int8_t* sa, int8_t* sb, int, int k, int n,
#undef DECOMPOSE_K
#undef DECOMPOSE_N

static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const Option& opt)
static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const ncnn::Option& opt)
{
int8_t* pa = (int8_t*)sa;
int8_t* pb = (int8_t*)sb;


+ 106
- 2
src/layer/arm/innerproduct_arm.cpp View File

@@ -19,6 +19,9 @@
#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#if __aarch64__
#include "gemm_symm_int8.h"
#endif
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
@@ -45,7 +48,7 @@ InnerProduct_arm::InnerProduct_arm()
int InnerProduct_arm::create_pipeline(const Option& opt)
{
#if __ARM_NEON
if (opt.use_packing_layout)
if (opt.use_packing_layout or opt.use_int8_inference)
{
flatten = ncnn::create_layer(ncnn::LayerType::Flatten);

@@ -69,6 +72,11 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
return create_pipeline_bf16s(opt);
}

if (opt.use_int8_inference)
{
return create_pipeline_int8(opt);
}

return 0;
}

@@ -84,12 +92,108 @@ int InnerProduct_arm::destroy_pipeline(const Option& opt)
return 0;
}

int InnerProduct_arm::create_pipeline_int8(const Option& opt)
{
// convert fp32 to int8
if (weight_data_int8_scales.empty())
{
return 0;
}
#if __aarch64__
// first reorder Matrix A before MatMul
const int n = num_output;
const int k = weight_data.total() / n;
weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator);

int8_t* b = weight_data;
int8_t* sb = weight_data_int8;
reorder_a(b, sb, n, k, k);

// pre-built scales
scales_in.create(num_output, 4u, opt.blob_allocator);
for (int i = 0; i < num_output; ++i)
{
if (std::fabs(static_cast<float>(weight_data_int8_scales[i])) <= 1e-6)
{
scales_in[i] = 0.f;
}
else
{
scales_in[i] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[i]);
}
}
#endif
return 0;
}

int InnerProduct_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if __aarch64__
Mat bottom_blob_tm = bottom_blob;
if (bottom_blob.elemsize != 1)
{
quantize_float32_to_int8(bottom_blob, bottom_blob_tm, bottom_blob_int8_scale, opt);
}

Mat bottom_blob_tm_flattened = bottom_blob_tm;
if (bottom_blob_tm.dims != 1)
{
Option opt_flatten = opt;
opt_flatten.blob_allocator = opt.workspace_allocator;
flatten->forward(bottom_blob_tm, bottom_blob_tm_flattened, opt_flatten);
}

top_blob.create(num_output, 4u, opt.blob_allocator);
if (top_blob.empty())
{
return -100;
}

const int w = bottom_blob_tm.w;
const int h = bottom_blob_tm.h;

const int m = 1;
const int k = bottom_blob_tm.c * w * h;
Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator);
{
reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k);
}

Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator);
int32_t* pc = top_blob_tm;
const int8_t* pa = bottom_blob_reorder;
const int8_t* pb = weight_data_int8;
int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt);

float* outptr = top_blob;

// dequant.fused.relu int32_t to float
for (int p = 0; p < num_output; ++p)
{
float sumfp32 = pc[p] * scales_in[p];
if (bias_term)
{
sumfp32 += bias_data[p];
}
if (1 == activation_type)
{
sumfp32 = std::max(0.f, sumfp32);
}

outptr[p] = sumfp32;
}
return 0;
#else
return InnerProduct::forward(bottom_blob, top_blob, opt);
#endif
}

int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
// TODO
return InnerProduct::forward(bottom_blob, top_blob, opt);
return forward_int8(bottom_blob, top_blob, opt);
}

int elembits = bottom_blob.elembits();


+ 9
- 0
src/layer/arm/innerproduct_arm.h View File

@@ -16,6 +16,8 @@
#define LAYER_INNERPRODUCT_ARM_H

#include "innerproduct.h"
#include <cmath>
#include <cstdlib>

namespace ncnn {

@@ -38,6 +40,9 @@ protected:
int create_pipeline_bf16s(const Option& opt);
int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

int create_pipeline_int8(const Option& opt);
int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
ncnn::Layer* flatten;

@@ -47,6 +52,10 @@ public:

// bf16
Mat weight_data_bf16;

// int8
Mat weight_data_int8;
Mat scales_in;
};

} // namespace ncnn


+ 3
- 2
src/layer/innerproduct.cpp View File

@@ -94,7 +94,7 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
{
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
{
return forward_int8(bottom_blob, top_blob, opt);
return InnerProduct::forward_int8(bottom_blob, top_blob, opt);
}

int w = bottom_blob.w;
@@ -189,10 +189,11 @@ int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti

int sum = 0;

int offset = size * channels * p;
// channels
for (int q = 0; q < channels; q++)
{
const signed char* w = (const signed char*)weight_data + size * channels * p + size * q;
const signed char* w = (const signed char*)weight_data + offset + size * q;
const signed char* m = bottom_blob_tm.channel(q);

for (int i = 0; i < size; i++)


+ 8
- 5
tests/test_convolution.cpp View File

@@ -187,19 +187,22 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int

std::vector<ncnn::Mat> weights(bias ? 4 : 3);
weights[0] = RandomMat(outch * c * kernel * kernel);

ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel);
ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep);
if (bias)
{
weights[1] = RandomMat(outch);
weights[2] = RandomMat(outch);
weights[3] = RandomMat(1);
weights[2] = weight_scales;
weights[3] = input_scales;
}
else
{
weights[1] = RandomMat(outch);
weights[2] = RandomMat(1);
weights[1] = weight_scales;
weights[2] = input_scales;
}

int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 0.001f, requant ? set_param : 0);
int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 1.0f, requant ? set_param : 0);
if (ret != 0)
{
fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant);


+ 9
- 5
tests/test_innerproduct.cpp View File

@@ -95,17 +95,21 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias)
pd.set(8, 1); // int8_scale_term

std::vector<ncnn::Mat> weights(bias ? 4 : 3);
weights[0] = RandomMat(outch * a.w * a.h * a.c);
const int k = a.w * a.h * a.c;
weights[0] = RandomMat(outch * k);
ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k);
ncnn::Mat input_scales = scales_mat(a, 1, k, k);

if (bias)
{
weights[1] = RandomMat(outch);
weights[2] = RandomMat(outch);
weights[3] = RandomMat(1);
weights[2] = weight_scales;
weights[3] = input_scales;
}
else
{
weights[1] = RandomMat(outch);
weights[2] = RandomMat(1);
weights[1] = weight_scales;
weights[2] = input_scales;
}

int ret = test_layer<ncnn::InnerProduct>("InnerProduct", pd, weights, a);


+ 25
- 0
tests/testutil.h View File

@@ -22,6 +22,7 @@

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#if NCNN_VULKAN
#include "command.h"
@@ -69,6 +70,30 @@ static ncnn::Mat RandomMat(int w, int h, int c)
return m;
}

static ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx)
{
ncnn::Mat weight_scales(m);
for (int i = 0; i < m; ++i)
{
float min = mat[0], _max = mat[0];
const float* ptr = (const float*)(mat.data) + i * ldx;
for (int j = 0; j < k; ++j)
{
if (min > ptr[j])
{
min = ptr[j];
}
if (_max < ptr[j])
{
_max = ptr[j];
}
}
const float abs_min = abs(min), abs_max = abs(_max);
weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max);
}
return weight_scales;
}

static bool NearlyEqual(float a, float b, float epsilon)
{
if (a == b)


Loading…
Cancel
Save