* perf(innerproduct-arm): add aarch64 gemm * fix(innerproduct): fix compilation errror * fix(armv7-innerproduct): fix armv7 compilation error * fix(innerproduct): fix gemm param * fix(int8): update mock scales and fix runtime error * fix(compilation): fix compilation errortags/20210124
| @@ -2667,7 +2667,7 @@ static void int8kernel_m4(void* dst, int8_t* sa, int8_t* sb, int, int k, int n, | |||
| #undef DECOMPOSE_K | |||
| #undef DECOMPOSE_N | |||
| static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const Option& opt) | |||
| static void int8kernel(void* dst, const int8_t* sa, const int8_t* sb, int m, int k, int n, int ldc, float* scales, float* bias, const ncnn::Option& opt) | |||
| { | |||
| int8_t* pa = (int8_t*)sa; | |||
| int8_t* pb = (int8_t*)sb; | |||
| @@ -19,6 +19,9 @@ | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #include "neon_mathfun.h" | |||
| #if __aarch64__ | |||
| #include "gemm_symm_int8.h" | |||
| #endif | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| #include "neon_mathfun_fp16s.h" | |||
| #endif | |||
| @@ -45,7 +48,7 @@ InnerProduct_arm::InnerProduct_arm() | |||
| int InnerProduct_arm::create_pipeline(const Option& opt) | |||
| { | |||
| #if __ARM_NEON | |||
| if (opt.use_packing_layout) | |||
| if (opt.use_packing_layout or opt.use_int8_inference) | |||
| { | |||
| flatten = ncnn::create_layer(ncnn::LayerType::Flatten); | |||
| @@ -69,6 +72,11 @@ int InnerProduct_arm::create_pipeline(const Option& opt) | |||
| return create_pipeline_bf16s(opt); | |||
| } | |||
| if (opt.use_int8_inference) | |||
| { | |||
| return create_pipeline_int8(opt); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -84,12 +92,108 @@ int InnerProduct_arm::destroy_pipeline(const Option& opt) | |||
| return 0; | |||
| } | |||
| int InnerProduct_arm::create_pipeline_int8(const Option& opt) | |||
| { | |||
| // convert fp32 to int8 | |||
| if (weight_data_int8_scales.empty()) | |||
| { | |||
| return 0; | |||
| } | |||
| #if __aarch64__ | |||
| // first reorder Matrix A before MatMul | |||
| const int n = num_output; | |||
| const int k = weight_data.total() / n; | |||
| weight_data_int8.create(n * k, (size_t)1u, opt.blob_allocator); | |||
| int8_t* b = weight_data; | |||
| int8_t* sb = weight_data_int8; | |||
| reorder_a(b, sb, n, k, k); | |||
| // pre-built scales | |||
| scales_in.create(num_output, 4u, opt.blob_allocator); | |||
| for (int i = 0; i < num_output; ++i) | |||
| { | |||
| if (std::fabs(static_cast<float>(weight_data_int8_scales[i])) <= 1e-6) | |||
| { | |||
| scales_in[i] = 0.f; | |||
| } | |||
| else | |||
| { | |||
| scales_in[i] = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[i]); | |||
| } | |||
| } | |||
| #endif | |||
| return 0; | |||
| } | |||
| int InnerProduct_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| #if __aarch64__ | |||
| Mat bottom_blob_tm = bottom_blob; | |||
| if (bottom_blob.elemsize != 1) | |||
| { | |||
| quantize_float32_to_int8(bottom_blob, bottom_blob_tm, bottom_blob_int8_scale, opt); | |||
| } | |||
| Mat bottom_blob_tm_flattened = bottom_blob_tm; | |||
| if (bottom_blob_tm.dims != 1) | |||
| { | |||
| Option opt_flatten = opt; | |||
| opt_flatten.blob_allocator = opt.workspace_allocator; | |||
| flatten->forward(bottom_blob_tm, bottom_blob_tm_flattened, opt_flatten); | |||
| } | |||
| top_blob.create(num_output, 4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| { | |||
| return -100; | |||
| } | |||
| const int w = bottom_blob_tm.w; | |||
| const int h = bottom_blob_tm.h; | |||
| const int m = 1; | |||
| const int k = bottom_blob_tm.c * w * h; | |||
| Mat bottom_blob_reorder(m * k, (size_t)1u, opt.workspace_allocator); | |||
| { | |||
| reorder_a(bottom_blob_tm_flattened, bottom_blob_reorder, m, k, k); | |||
| } | |||
| Mat top_blob_tm(m * num_output, (size_t)4u, opt.workspace_allocator); | |||
| int32_t* pc = top_blob_tm; | |||
| const int8_t* pa = bottom_blob_reorder; | |||
| const int8_t* pb = weight_data_int8; | |||
| int8kernel((void*)pc, pa, pb, m, k, num_output, num_output, 0, 0, opt); | |||
| float* outptr = top_blob; | |||
| // dequant.fused.relu int32_t to float | |||
| for (int p = 0; p < num_output; ++p) | |||
| { | |||
| float sumfp32 = pc[p] * scales_in[p]; | |||
| if (bias_term) | |||
| { | |||
| sumfp32 += bias_data[p]; | |||
| } | |||
| if (1 == activation_type) | |||
| { | |||
| sumfp32 = std::max(0.f, sumfp32); | |||
| } | |||
| outptr[p] = sumfp32; | |||
| } | |||
| return 0; | |||
| #else | |||
| return InnerProduct::forward(bottom_blob, top_blob, opt); | |||
| #endif | |||
| } | |||
| int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| // TODO | |||
| return InnerProduct::forward(bottom_blob, top_blob, opt); | |||
| return forward_int8(bottom_blob, top_blob, opt); | |||
| } | |||
| int elembits = bottom_blob.elembits(); | |||
| @@ -16,6 +16,8 @@ | |||
| #define LAYER_INNERPRODUCT_ARM_H | |||
| #include "innerproduct.h" | |||
| #include <cmath> | |||
| #include <cstdlib> | |||
| namespace ncnn { | |||
| @@ -38,6 +40,9 @@ protected: | |||
| int create_pipeline_bf16s(const Option& opt); | |||
| int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| int create_pipeline_int8(const Option& opt); | |||
| int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| ncnn::Layer* flatten; | |||
| @@ -47,6 +52,10 @@ public: | |||
| // bf16 | |||
| Mat weight_data_bf16; | |||
| // int8 | |||
| Mat weight_data_int8; | |||
| Mat scales_in; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -94,7 +94,7 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o | |||
| { | |||
| if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) | |||
| { | |||
| return forward_int8(bottom_blob, top_blob, opt); | |||
| return InnerProduct::forward_int8(bottom_blob, top_blob, opt); | |||
| } | |||
| int w = bottom_blob.w; | |||
| @@ -189,10 +189,11 @@ int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opti | |||
| int sum = 0; | |||
| int offset = size * channels * p; | |||
| // channels | |||
| for (int q = 0; q < channels; q++) | |||
| { | |||
| const signed char* w = (const signed char*)weight_data + size * channels * p + size * q; | |||
| const signed char* w = (const signed char*)weight_data + offset + size * q; | |||
| const signed char* m = bottom_blob_tm.channel(q); | |||
| for (int i = 0; i < size; i++) | |||
| @@ -187,19 +187,22 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int | |||
| std::vector<ncnn::Mat> weights(bias ? 4 : 3); | |||
| weights[0] = RandomMat(outch * c * kernel * kernel); | |||
| ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); | |||
| ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); | |||
| if (bias) | |||
| { | |||
| weights[1] = RandomMat(outch); | |||
| weights[2] = RandomMat(outch); | |||
| weights[3] = RandomMat(1); | |||
| weights[2] = weight_scales; | |||
| weights[3] = input_scales; | |||
| } | |||
| else | |||
| { | |||
| weights[1] = RandomMat(outch); | |||
| weights[2] = RandomMat(1); | |||
| weights[1] = weight_scales; | |||
| weights[2] = input_scales; | |||
| } | |||
| int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 0.001f, requant ? set_param : 0); | |||
| int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, 1.0f, requant ? set_param : 0); | |||
| if (ret != 0) | |||
| { | |||
| fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant); | |||
| @@ -95,17 +95,21 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias) | |||
| pd.set(8, 1); // int8_scale_term | |||
| std::vector<ncnn::Mat> weights(bias ? 4 : 3); | |||
| weights[0] = RandomMat(outch * a.w * a.h * a.c); | |||
| const int k = a.w * a.h * a.c; | |||
| weights[0] = RandomMat(outch * k); | |||
| ncnn::Mat weight_scales = scales_mat(weights[0], outch, k, k); | |||
| ncnn::Mat input_scales = scales_mat(a, 1, k, k); | |||
| if (bias) | |||
| { | |||
| weights[1] = RandomMat(outch); | |||
| weights[2] = RandomMat(outch); | |||
| weights[3] = RandomMat(1); | |||
| weights[2] = weight_scales; | |||
| weights[3] = input_scales; | |||
| } | |||
| else | |||
| { | |||
| weights[1] = RandomMat(outch); | |||
| weights[2] = RandomMat(1); | |||
| weights[1] = weight_scales; | |||
| weights[2] = input_scales; | |||
| } | |||
| int ret = test_layer<ncnn::InnerProduct>("InnerProduct", pd, weights, a); | |||
| @@ -22,6 +22,7 @@ | |||
| #include <math.h> | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #if NCNN_VULKAN | |||
| #include "command.h" | |||
| @@ -69,6 +70,30 @@ static ncnn::Mat RandomMat(int w, int h, int c) | |||
| return m; | |||
| } | |||
| static ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx) | |||
| { | |||
| ncnn::Mat weight_scales(m); | |||
| for (int i = 0; i < m; ++i) | |||
| { | |||
| float min = mat[0], _max = mat[0]; | |||
| const float* ptr = (const float*)(mat.data) + i * ldx; | |||
| for (int j = 0; j < k; ++j) | |||
| { | |||
| if (min > ptr[j]) | |||
| { | |||
| min = ptr[j]; | |||
| } | |||
| if (_max < ptr[j]) | |||
| { | |||
| _max = ptr[j]; | |||
| } | |||
| } | |||
| const float abs_min = abs(min), abs_max = abs(_max); | |||
| weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max); | |||
| } | |||
| return weight_scales; | |||
| } | |||
| static bool NearlyEqual(float a, float b, float epsilon) | |||
| { | |||
| if (a == b) | |||