From d2ca3143942b1d1874a359b262040cb26e090a88 Mon Sep 17 00:00:00 2001 From: ling Date: Fri, 5 Feb 2021 16:22:07 +0800 Subject: [PATCH] [MSLITE] int8 matmul base --- mindspore/lite/nnacl/int8/matmul_int8.c | 227 ++++++------ mindspore/lite/nnacl/int8/matmul_int8.h | 8 +- mindspore/lite/nnacl/int8/quantize.h | 11 - mindspore/lite/nnacl/matmul_parameter.h | 11 + .../kernel/arm/fp32/fullconnection_fp32.cc | 5 - .../kernel/arm/fp32/fullconnection_fp32.h | 1 - .../kernel/arm/int8/fullconnection_int8.cc | 240 +------------ .../kernel/arm/int8/fullconnection_int8.h | 41 +-- .../kernel/arm/int8/matmul_base_int8.cc | 323 ++++++++++++++++++ .../kernel/arm/int8/matmul_base_int8.h | 82 +++++ .../runtime/kernel/arm/int8/matmul_int8.cc | 185 +--------- .../src/runtime/kernel/arm/int8/matmul_int8.h | 31 +- mindspore/lite/test/run_benchmark_nets.sh | 4 +- .../kernel/arm/int8/matmul_int8_tests.cc | 52 --- 14 files changed, 557 insertions(+), 664 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h diff --git a/mindspore/lite/nnacl/int8/matmul_int8.c b/mindspore/lite/nnacl/int8/matmul_int8.c index 9db55b6ac6..857f640001 100644 --- a/mindspore/lite/nnacl/int8/matmul_int8.c +++ b/mindspore/lite/nnacl/int8/matmul_int8.c @@ -182,40 +182,6 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int return; } -void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, - size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, - int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, - bool peroc) { - /* support per-layer && weight per-channel */ - /* row4x16-major * row16x4-major => (int8)row-major*/ - for (int r = 0; r < row; r++) { - for (int c = 0; c < col; c++) { - int r4div = r / C4NUM, r4mod = r % C4NUM; - int c4div = c / C4NUM, c4mod = c % C4NUM; - size_t ci = r * stride + c; - int32_t value = 0; - for (int d = 0; d < deep_16; d++) { - int d16div = d / C16NUM, d16mod = d % C16NUM; - size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; - size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; - value = value + a[ai] * b[bi]; - } - int32_t cur_input_sum = - peroc ? input_sum[c4div * UP_ROUND(row, C4NUM) * C4NUM + r * C4NUM + c4mod] : input_sum[r]; - value -= cur_input_sum; - value += bias[c]; - int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0]; - int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0]; - int32_t cur_multiplier = peroc ? 
multiplier[c] : multiplier[0]; - value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; - value = MSMIN(maxi, value); - value = MSMAX(mini, value); - dst[ci] = (int8_t)value; - } - } - return; -} - void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, @@ -353,6 +319,105 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row return; } +#ifdef ENABLE_ARM64 +void PackInput4x4AndInputSumPert_arm64(const int8_t *src_ic, int8_t *pack_ic, int32_t *input_sum_r, size_t src_stride, + size_t ic_4div, size_t ic_4res, int32_t filter_zp) { + asm volatile( + "dup v2.4s, wzr \n" + "mov x14, %[input_sum_r] \n" + "dup v3.4s, %w[filter_zp] \n" + + "mov x10, %[src_ic] \n" + "mov x11, %[pack_ic] \n" + + "mov x15, #0 \n" + "1: \n" + "cmp x15, %[ic_4div] \n" + "add x15, x15, #4\n" + "mov x12, x10 \n" + "add x10, x10, #4\n" + "blt 2f \n" + "cmp %[ic_4res], #0\n" + "beq 6f \n" + "cmp %[ic_4res], #1\n" + "beq 3f \n" + "cmp %[ic_4res], #2\n" + "beq 4f \n" + "cmp %[ic_4res], #3\n" + "beq 5f \n" + + "2: \n" + "ld1 {v0.s}[0], [x12], %[src_stride]\n" + "ld1 {v0.s}[1], [x12], %[src_stride]\n" + "ld1 {v0.s}[2], [x12], %[src_stride]\n" + "ld1 {v0.s}[3], [x12], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + + "saddlp v1.8h, v0.16b \n" + "saddlp v0.4s, v1.8h \n" + "add v2.4s, v2.4s, v0.4s \n" + "b 1b \n" + + "3: \n" /* ic res 1 */ + "dup v0.4s, wzr \n" + + "ld1 {v0.b}[0], [x12], %[src_stride]\n" + "ld1 {v0.b}[4], [x12], %[src_stride]\n" + "ld1 {v0.b}[8], [x12], %[src_stride]\n" + "ld1 {v0.b}[12], [x12], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "saddlp v1.8h, v0.16b \n" + "saddlp v0.4s, v1.8h \n" + "add v2.4s, v2.4s, v0.4s \n" + "b 6f \n" + + "4: \n" /* ic res 2 */ + "dup v0.4s, wzr \n" + + "ld1 {v0.h}[0], [x12], %[src_stride]\n" + "ld1 {v0.h}[2], [x12], %[src_stride]\n" + "ld1 {v0.h}[4], [x12], %[src_stride]\n" + "ld1 {v0.h}[6], [x12], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "saddlp v1.8h, v0.16b \n" + "saddlp v0.4s, v1.8h \n" + "add v2.4s, v2.4s, v0.4s \n" + "b 6f \n" + + "5: \n" /* ic res 3 */ + "dup v0.4s, wzr \n" + "add x13, x12, #2 \n" + + "ld1 {v0.h}[0], [x12], %[src_stride]\n" + "ld1 {v0.b}[2], [x13], %[src_stride]\n" + "ld1 {v0.h}[2], [x12], %[src_stride]\n" + "ld1 {v0.b}[6], [x13], %[src_stride]\n" + "ld1 {v0.h}[4], [x12], %[src_stride]\n" + "ld1 {v0.b}[10], [x13], %[src_stride]\n" + "ld1 {v0.h}[6], [x12], %[src_stride]\n" + "ld1 {v0.b}[14], [x13], %[src_stride]\n" + + "st1 {v0.16b}, [x11], #16\n" + "saddlp v1.8h, v0.16b \n" + "saddlp v0.4s, v1.8h \n" + "add v2.4s, v2.4s, v0.4s \n" + "b 6f \n" + + "6: \n" + "mul v2.4s, v2.4s, v3.4s \n" + + "st1 {v2.4s}, [x14], #16 \n" + + : + : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r), + [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp) + : "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3"); + return; +} +#endif void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, size_t input_channel, size_t plane_size, int32_t filter_zp) { int ic4 = UP_ROUND(input_channel, C4NUM); @@ -370,99 +435,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, #ifdef ENABLE_ARM64 
size_t src_stride = input_channel; size_t ic_4res = input_channel - ic_4div; - asm volatile( - "dup v2.4s, wzr \n" - "mov x14, %[input_sum_r] \n" - "dup v3.4s, %w[filter_zp] \n" - - "mov x10, %[src_ic] \n" - "mov x11, %[pack_ic] \n" - - "mov x15, #0 \n" - "1: \n" - "cmp x15, %[ic_4div] \n" - "add x15, x15, #4\n" - "mov x12, x10 \n" - "add x10, x10, #4\n" - "blt 2f \n" - "cmp %[ic_4res], #0\n" - "beq 6f \n" - "cmp %[ic_4res], #1\n" - "beq 3f \n" - "cmp %[ic_4res], #2\n" - "beq 4f \n" - "cmp %[ic_4res], #3\n" - "beq 5f \n" - - "2: \n" - "ld1 {v0.s}[0], [x12], %[src_stride]\n" - "ld1 {v0.s}[1], [x12], %[src_stride]\n" - "ld1 {v0.s}[2], [x12], %[src_stride]\n" - "ld1 {v0.s}[3], [x12], %[src_stride]\n" - - "st1 {v0.16b}, [x11], #16\n" - - "saddlp v1.8h, v0.16b \n" - "saddlp v0.4s, v1.8h \n" - "add v2.4s, v2.4s, v0.4s \n" - "b 1b \n" - - "3: \n" /* ic res 1 */ - "dup v0.4s, wzr \n" - - "ld1 {v0.b}[0], [x12], %[src_stride]\n" - "ld1 {v0.b}[4], [x12], %[src_stride]\n" - "ld1 {v0.b}[8], [x12], %[src_stride]\n" - "ld1 {v0.b}[12], [x12], %[src_stride]\n" - - "st1 {v0.16b}, [x11], #16\n" - "saddlp v1.8h, v0.16b \n" - "saddlp v0.4s, v1.8h \n" - "add v2.4s, v2.4s, v0.4s \n" - "b 6f \n" - - "4: \n" /* ic res 2 */ - "dup v0.4s, wzr \n" - - "ld1 {v0.h}[0], [x12], %[src_stride]\n" - "ld1 {v0.h}[2], [x12], %[src_stride]\n" - "ld1 {v0.h}[4], [x12], %[src_stride]\n" - "ld1 {v0.h}[6], [x12], %[src_stride]\n" - - "st1 {v0.16b}, [x11], #16\n" - "saddlp v1.8h, v0.16b \n" - "saddlp v0.4s, v1.8h \n" - "add v2.4s, v2.4s, v0.4s \n" - "b 6f \n" - - "5: \n" /* ic res 3 */ - "dup v0.4s, wzr \n" - "add x13, x12, #2 \n" - - "ld1 {v0.h}[0], [x12], %[src_stride]\n" - "ld1 {v0.b}[2], [x13], %[src_stride]\n" - "ld1 {v0.h}[2], [x12], %[src_stride]\n" - "ld1 {v0.b}[6], [x13], %[src_stride]\n" - "ld1 {v0.h}[4], [x12], %[src_stride]\n" - "ld1 {v0.b}[10], [x13], %[src_stride]\n" - "ld1 {v0.h}[6], [x12], %[src_stride]\n" - "ld1 {v0.b}[14], [x13], %[src_stride]\n" - - "st1 {v0.16b}, [x11], #16\n" - "saddlp v1.8h, v0.16b \n" - "saddlp v0.4s, v1.8h \n" - "add v2.4s, v2.4s, v0.4s \n" - "b 6f \n" - - "6: \n" - "mul v2.4s, v2.4s, v3.4s \n" - - "st1 {v2.4s}, [x14], #16 \n" - - : - : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r), - [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp) - : "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3"); + PackInput4x4AndInputSumPert_arm64(src_ic, pack_ic, input_sum_r, src_stride, ic_4div, ic_4res, filter_zp); #else int32_t tmp_sum_value[4] = {0}; for (int ici = 0; ici < ic_4div; ici += C4NUM) { diff --git a/mindspore/lite/nnacl/int8/matmul_int8.h b/mindspore/lite/nnacl/int8/matmul_int8.h index 35ae749ddf..c10c1b6149 100644 --- a/mindspore/lite/nnacl/int8/matmul_int8.h +++ b/mindspore/lite/nnacl/int8/matmul_int8.h @@ -25,12 +25,9 @@ extern "C" { #endif /* 4x16 16x4 -> 4x4 */ +/* matmul */ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, const int *input_sum, const int *bias); -void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, - size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, - int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, - bool per_channel); void RowMajor2Row16x4MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst); void 
CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
@@ -41,6 +38,7 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
                    int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp);
 
 /* 8x4 4x8 -> 8x8 */
+/* optimize conv */
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
@@ -48,6 +46,7 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
                       size_t per_channel);
 
 /* 4x16 16x2 -> 4x2 */
+/* arm32 conv1x1 */
 void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
@@ -55,6 +54,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
                       bool peroc);
 
 /* 4x4 4x16 -> 4x16 */
+/* optimize conv1x1 */
 void RowMajor2Row4x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum,
                                  size_t input_channel, size_t plane_size, int32_t filter_zp);
diff --git a/mindspore/lite/nnacl/int8/quantize.h b/mindspore/lite/nnacl/int8/quantize.h
index f93e6b401e..06a4b5fa55 100644
--- a/mindspore/lite/nnacl/int8/quantize.h
+++ b/mindspore/lite/nnacl/int8/quantize.h
@@ -66,17 +66,6 @@ typedef struct PreluQuantArg {
   QuantArg out_quant_args_;
 } PreluQuantArg;
 
-typedef struct MatmulQuantArg {
-  QuantArg input;
-  QuantArg weight;
-  QuantArg output;
-  int32_t out_act_min;
-  int32_t out_act_max;
-  int32_t left_shift;
-  int32_t right_shift;
-  int32_t quant_multiplier;
-} MatmulQuantArg;
-
 typedef struct CropQuantArg {
   QuantArg in_args_;
   QuantArg out_args_;
diff --git a/mindspore/lite/nnacl/matmul_parameter.h b/mindspore/lite/nnacl/matmul_parameter.h
index 9681080fef..4e6ff89aa1 100644
--- a/mindspore/lite/nnacl/matmul_parameter.h
+++ b/mindspore/lite/nnacl/matmul_parameter.h
@@ -73,4 +73,15 @@ typedef struct MatmulQuantParameter {
   int32_t *quant_multiplier_;
 } MatmulQuantParameter;
 
+typedef struct MatmulQuantArg {
+  QuantArg input;
+  QuantArg weight;
+  QuantArg output;
+  int32_t out_act_min;
+  int32_t out_act_max;
+  int32_t left_shift;
+  int32_t right_shift;
+  int32_t quant_multiplier;
+} MatmulQuantArg;
+
 #endif  // MINDSPORE_LITE_NNACL_MATMUL_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
index 5d6fa3f621..4fdc8b8d36 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
@@ -67,10 +67,5 @@ int FullconnectionCPUKernel::ReSize() {
   return MatmulFp32BaseCPUKernel::ReSize();
 }
 
-int FullconnectionCPUKernel::Run() {
-  MatmulFp32BaseCPUKernel::Run();
-  return RET_OK;
-}
-
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionCPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h
index 450efff2fb..d1f9c5546b 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h
+++
b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h @@ -33,7 +33,6 @@ class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel { ~FullconnectionCPUKernel() = default; int Init() override; int ReSize() override; - int Run() override; }; } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc index 1eb5441363..9ffad7e233 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc @@ -15,258 +15,44 @@ */ #include "src/runtime/kernel/arm/int8/fullconnection_int8.h" -#include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" using mindspore::lite::KernelRegistrar; -using mindspore::lite::RET_ERROR; -using mindspore::lite::RET_MEMORY_FAILED; using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_FullConnection; namespace mindspore::kernel { -void FullconnectionInt8CPUKernel::FreeQuantParam() { - if (quant_.filter_scale_ != nullptr) { - free(quant_.filter_scale_); - quant_.filter_scale_ = nullptr; - } - if (quant_.filter_zp_ != nullptr) { - free(quant_.filter_zp_); - quant_.filter_zp_ = nullptr; - } - if (quant_.left_shift_ != nullptr) { - free(quant_.left_shift_); - quant_.left_shift_ = nullptr; - } - if (quant_.right_shift_ != nullptr) { - free(quant_.right_shift_); - quant_.right_shift_ = nullptr; - } - if (quant_.quant_multiplier_ != nullptr) { - free(quant_.quant_multiplier_); - quant_.quant_multiplier_ = nullptr; - } - return; -} - -void FullconnectionInt8CPUKernel::FreeTmpBuffer() { - if (pack_a_ptr_ != nullptr) { - free(pack_a_ptr_); - pack_a_ptr_ = nullptr; - } - if (pack_b_ptr_ != nullptr) { - free(pack_b_ptr_); - pack_b_ptr_ = nullptr; - } - if (input_sums_ != nullptr) { - free(input_sums_); - input_sums_ = nullptr; - } - if (weight_bias_sums_ != nullptr) { - free(weight_bias_sums_); - weight_bias_sums_ = nullptr; - } - if (bias_ptr_ != nullptr) { - free(bias_ptr_); - bias_ptr_ = nullptr; - } - return; -} - -int FullconnectionInt8CPUKernel::MallocQuantParam() { - auto weight_tensor = in_tensors_.at(1); - auto weight_quant_params = weight_tensor->quant_params(); - int col = weight_tensor->shape().front(); - filter_per_channel_ = (weight_quant_params.size() > 1); - - int init_size = filter_per_channel_ ? 
col : 1;
+int FullconnectionInt8CPUKernel::Init() {
+  param_->batch = 1;
+  param_->a_transpose_ = false;
+  param_->b_transpose_ = true;
 
-  quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
-  if (quant_.filter_scale_ == nullptr) {
-    return RET_ERROR;
-  }
-  quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
-  if (quant_.filter_zp_ == nullptr) {
-    return RET_ERROR;
-  }
-  quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
-  if (quant_.left_shift_ == nullptr) {
-    return RET_ERROR;
-  }
-  quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
-  if (quant_.right_shift_ == nullptr) {
-    return RET_ERROR;
-  }
-  quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
-  if (quant_.quant_multiplier_ == nullptr) {
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
+  InitParameter();
 
-int FullconnectionInt8CPUKernel::Init() {
-  auto ret = MallocQuantParam();
+  auto ret = MatmulBaseInt8CPUKernel::Init();
   if (ret != RET_OK) {
-    FreeQuantParam();
+    MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel::Init failed";
     return ret;
   }
 
-  auto in_quant_params = in_tensors_.at(0)->quant_params();
-  quant_.input_.zp_ = in_quant_params.front().zeroPoint;
-  quant_.input_.scale_ = in_quant_params.front().scale;
-
-  auto out_quant_params = out_tensors_.at(0)->quant_params();
-  quant_.output_.zp_ = out_quant_params.front().zeroPoint;
-  quant_.output_.scale_ = out_quant_params.front().scale;
-
-  auto weight_tensor = in_tensors_.at(1);
-  fc_param_->b_const_ = (weight_tensor->data_c() != nullptr);
-  int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1;
-  auto weight_quant_params = weight_tensor->quant_params();
-
-  for (int i = 0; i < weight_quant_num; i++) {
-    quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint;
-    quant_.filter_scale_[i] = weight_quant_params[i].scale;
-  }
-
-  for (int i = 0; i < weight_quant_num; ++i) {
-    const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]);
-    double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_);
-    QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i],
-                                              &quant_.right_shift_[i]);
-  }
-
-  CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6,
-                                    quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_,
-                                    &quant_.out_act_max_);
-
   if (!InferShapeDone()) {
     return RET_OK;
   }
   return ReSize();
 }
 
-void FullconnectionInt8CPUKernel::InitParam() {
+int FullconnectionInt8CPUKernel::ReSize() {
   int row = 1;
   for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) {
     row *= (out_tensors_.at(0)->shape()).at(i);
   }
-  fc_param_->row_ = row;
-  fc_param_->col_ = out_tensors_.at(0)->shape().back();
-  fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
+  param_->row_ = row;
+  param_->col_ = out_tensors_.at(0)->shape().back();
+  param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
 
-  fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
-  fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM);
-  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
-  fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM);
-
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM));
-  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_4_, C4NUM), thread_count_);
-  return;
-}
-
-int FullconnectionInt8CPUKernel::ReSize() {
-  FreeTmpBuffer();
-
-  InitParam();
-
-  pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t)));
-  if (pack_a_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
-  }
-  pack_b_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t)));
-  if (pack_b_ptr_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
-  }
-  input_sums_ = reinterpret_cast<int *>(malloc(fc_param_->row_4_ * sizeof(int)));
-  if (input_sums_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
-  }
-  weight_bias_sums_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int)));
-  if (weight_bias_sums_ == nullptr) {
-    FreeTmpBuffer();
-    return RET_ERROR;
-  }
-
-  memset(pack_a_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t));
-  memset(pack_b_ptr_, 0, fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t));
-  memset(input_sums_, 0, fc_param_->row_4_ * sizeof(int));
-  memset(weight_bias_sums_, 0, fc_param_->col_4_ * sizeof(int));
-
-  if (in_tensors_.size() == 3) {
-    bias_ptr_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int)));
-    if (bias_ptr_ == nullptr) {
-      MS_LOG(ERROR) << "Memory allocation failed";
-      FreeTmpBuffer();
-      return RET_MEMORY_FAILED;
-    }
-    memcpy(bias_ptr_, in_tensors_.at(2)->data_c(), fc_param_->col_ * sizeof(int));
-  } else {
-    bias_ptr_ = nullptr;
-  }
-
-  if (fc_param_->b_const_) {
-    auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
-    RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_);
-    CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
-                       weight_bias_sums_, ColMajor, filter_per_channel_);
-  }
-  return RET_OK;
-}
-
-int FullconnectionInt8CPUKernel::RunImpl(int task_id) {
-  int stride = thread_stride_ * C4NUM;
-  int cur_stride = task_id * stride;
-  int res_stride = fc_param_->col_ - cur_stride;
-  int cur_oc = MSMIN(stride, res_stride);
-  if (cur_oc <= 0) {
-    return RET_OK;
-  }
-
-  int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_;
-  int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_;
-  int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_;
-  int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_;
-
-  MatmulInt8Opt(pack_a_ptr_, pack_b_ptr_ + cur_stride * fc_param_->deep_16_, c_ptr_ + cur_stride, fc_param_->row_,
-                cur_oc, fc_param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_,
-                quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, fc_param_->col_,
-                filter_per_channel_, cur_zp);
-
-  return RET_OK;
-}
-
-int FcInt8Run(void *cdata, int task_id) {
-  auto fc = reinterpret_cast<FullconnectionInt8CPUKernel *>(cdata);
-  auto ret = fc->RunImpl(task_id);
+  auto ret = MatmulBaseInt8CPUKernel::ReSize();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "FcInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
-    return ret;
-  }
-  return RET_OK;
-}
-
-int FullconnectionInt8CPUKernel::Run() {
-  auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
-  RowMajor2Row16x4MajorInt8(input_ptr, pack_a_ptr_, fc_param_->row_, fc_param_->deep_);
-
-  int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0];
-  CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, tmp_weight_zp, input_sums_, RowMajor);
-
-  if (!fc_param_->b_const_) {
-    auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
-    RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_);
-    CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
-                       weight_bias_sums_, ColMajor, filter_per_channel_);
-  }
-
-  c_ptr_ = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
-  auto ret = ParallelLaunch(this->context_->thread_pool_, FcInt8Run, this, thread_count_);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "ParallelLaunch failed";
+    MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel::ReSize failed";
     return ret;
   }
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
index 5ca9ac46cc..5a71f2afeb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
@@ -18,52 +18,19 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_FULLCONNECTION_INT8_H_
 
 #include <vector>
-#include "src/lite_kernel.h"
 #include "include/errorcode.h"
-#include "mindspore/lite/nnacl/int8/quantize.h"
-#include "nnacl/common_func.h"
-#include "nnacl/int8/common_func_int8.h"
-#include "nnacl/int8/matmul_int8.h"
+#include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
 
 namespace mindspore::kernel {
-class FullconnectionInt8CPUKernel : public LiteKernel {
+class FullconnectionInt8CPUKernel : public MatmulBaseInt8CPUKernel {
  public:
   FullconnectionInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                               const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
                               const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  }
-  ~FullconnectionInt8CPUKernel() override {
-    FreeTmpBuffer();
-    FreeQuantParam();
-  }
-
+      : MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~FullconnectionInt8CPUKernel() override = default;
   int Init() override;
   int ReSize() override;
-  int Run() override;
-
- public:
-  int RunImpl(int task_id);
-
- private:
-  void InitParam();
-  void FreeTmpBuffer();
-  void FreeQuantParam();
-  int MallocQuantParam();
-
- private:
-  MatMulParameter *fc_param_ = nullptr;
-  MatmulQuantParameter quant_;
-  int thread_count_ = 1;
-  int thread_stride_ = 0;
-  int8_t *pack_a_ptr_ = nullptr;
-  int8_t *pack_b_ptr_ = nullptr;
-  int8_t *c_ptr_ = nullptr;
-  int *input_sums_ = nullptr;
-  int *weight_bias_sums_ = nullptr;
-  int *bias_ptr_ = nullptr;
-  bool filter_per_channel_ = true;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc
new file mode 100644
index 0000000000..ac8fdad9b5
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc
@@ -0,0 +1,323 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_MEMORY_FAILED;
+using mindspore::lite::RET_OK;
+
+namespace mindspore::kernel {
+int MatmulBaseInt8Run(void *cdata, int task_id) {
+  auto op = reinterpret_cast<MatmulBaseInt8CPUKernel *>(cdata);
+  auto ret = op->RunImpl(task_id);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "MatmulBaseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
+    return ret;
+  }
+  return RET_OK;
+}
+
+int MatmulBaseInt8CPUKernel::RunImpl(int task_id) {
+  int stride = thread_stride_ * C4NUM;
+  int cur_stride = task_id * stride;
+  int res_stride = param_->col_ - cur_stride;
+  int cur_oc = MSMIN(stride, res_stride);
+  if (cur_oc <= 0) {
+    return RET_OK;
+  }
+
+  int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_;
+  int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_;
+  int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_;
+  int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_;
+
+  MatmulInt8Opt(pack_a_ptr_, batch_b_ptr_ + cur_stride * param_->deep_16_, batch_c_ptr_ + cur_stride, param_->row_,
+                cur_oc, param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_,
+                quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, param_->col_,
+                filter_per_channel_, cur_zp);
+
+  return RET_OK;
+}
+
+MatmulBaseInt8CPUKernel::~MatmulBaseInt8CPUKernel() {
+  FreeQuantParam();
+
+  FreeTmpBuffer();
+
+  if (bias_ptr_ != nullptr) {
+    free(bias_ptr_);
+    bias_ptr_ = nullptr;
+  }
+  return;
+}
+
+void MatmulBaseInt8CPUKernel::FreeQuantParam() {
+  if (quant_.filter_scale_ != nullptr) {
+    free(quant_.filter_scale_);
+    quant_.filter_scale_ = nullptr;
+  }
+  if (quant_.filter_zp_ != nullptr) {
+    free(quant_.filter_zp_);
+    quant_.filter_zp_ = nullptr;
+  }
+  if (quant_.left_shift_ != nullptr) {
+    free(quant_.left_shift_);
+    quant_.left_shift_ = nullptr;
+  }
+  if (quant_.right_shift_ != nullptr) {
+    free(quant_.right_shift_);
+    quant_.right_shift_ = nullptr;
+  }
+  if (quant_.quant_multiplier_ != nullptr) {
+    free(quant_.quant_multiplier_);
+    quant_.quant_multiplier_ = nullptr;
+  }
+  return;
+}
+
+int MatmulBaseInt8CPUKernel::MallocQuantParam() {
+  auto weight_tensor = in_tensors_.at(1);
+  auto weight_quant_params = weight_tensor->quant_params();
+  int col = weight_tensor->shape().front();
+
+  filter_per_channel_ = (weight_quant_params.size() > 1);
+
+  int init_size = filter_per_channel_ ? col : 1;
+
+  quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
+  if (quant_.filter_scale_ == nullptr) {
+    return RET_ERROR;
+  }
+  quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
+  if (quant_.filter_zp_ == nullptr) {
+    return RET_ERROR;
+  }
+  quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
+  if (quant_.left_shift_ == nullptr) {
+    return RET_ERROR;
+  }
+  quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
+  if (quant_.right_shift_ == nullptr) {
+    return RET_ERROR;
+  }
+  quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
+  if (quant_.quant_multiplier_ == nullptr) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+void MatmulBaseInt8CPUKernel::InitQuantParam() {
+  auto in_quant_params = in_tensors_.at(0)->quant_params();
+  quant_.input_.zp_ = in_quant_params.front().zeroPoint;
+  quant_.input_.scale_ = in_quant_params.front().scale;
+
+  auto out_quant_params = out_tensors_.at(0)->quant_params();
+  quant_.output_.zp_ = out_quant_params.front().zeroPoint;
+  quant_.output_.scale_ = out_quant_params.front().scale;
+
+  auto weight_tensor = in_tensors_.at(1);
+  int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1;
+  auto weight_quant_params = weight_tensor->quant_params();
+
+  for (int i = 0; i < weight_quant_num; i++) {
+    quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint;
+    quant_.filter_scale_[i] = weight_quant_params[i].scale;
+  }
+
+  for (int i = 0; i < weight_quant_num; ++i) {
+    const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]);
+    double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_);
+    QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i],
+                                              &quant_.right_shift_[i]);
+  }
+
+  CalculateActivationRangeQuantized(param_->act_type_ == ActType_Relu, param_->act_type_ == ActType_Relu6,
+                                    quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_,
+                                    &quant_.out_act_max_);
+}
+
+void MatmulBaseInt8CPUKernel::InitParameter() {
+  param_->a_const_ = (in_tensors_[0]->data_c() != nullptr);
+  param_->b_const_ = (in_tensors_[1]->data_c() != nullptr);
+  return;
+}
+
+void MatmulBaseInt8CPUKernel::ResizeParameter() {
+  param_->row_align_ = UP_ROUND(param_->row_, C4NUM);
+  param_->col_align_ = UP_ROUND(param_->col_, C4NUM);
+  param_->deep_16_ = UP_ROUND(param_->deep_, C16NUM);
+
+  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(param_->col_align_, C4NUM));
+  thread_stride_ = UP_DIV(UP_DIV(param_->col_align_, C4NUM), thread_count_);
+  return;
+}
+
+void MatmulBaseInt8CPUKernel::FreeTmpBuffer() {
+  if (pack_a_ptr_ != nullptr) {
+    free(pack_a_ptr_);
+    pack_a_ptr_ = nullptr;
+  }
+  if (pack_b_ptr_ != nullptr) {
+    free(pack_b_ptr_);
+    pack_b_ptr_ = nullptr;
+  }
+  if (input_sums_ != nullptr) {
+    free(input_sums_);
+    input_sums_ = nullptr;
+  }
+  if (weight_bias_sums_ != nullptr) {
+    free(weight_bias_sums_);
+    weight_bias_sums_ = nullptr;
+  }
+  return;
+}
+
+void MatmulBaseInt8CPUKernel::TransferB() {
+  auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
+  for (int i = 0; i < param_->batch; i++) {
+    auto current_weight = weight_data + i * param_->deep_ * param_->col_;
+    auto current_b_pack = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_;
+    auto current_sums = weight_bias_sums_ + i * param_->col_align_;
+    if (param_->b_transpose_) {
+      RowMajor2Row16x4MajorInt8(current_weight, current_b_pack, param_->col_, param_->deep_);
+      CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
+                         current_sums, ColMajor, filter_per_channel_);
+    } else {
+      RowMajor2Col16x4MajorInt8(current_weight, param_->deep_, param_->col_, current_b_pack);
+      CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
+                         current_sums, RowMajor, false);
+    }
+  }
+  return;
+}
+
+int MatmulBaseInt8CPUKernel::InitTmpBuffer() {
+  pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_16_ * sizeof(int8_t)));
+  if (pack_a_ptr_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  pack_b_ptr_ =
+    reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t)));
+  if (pack_b_ptr_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
+  if (input_sums_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+  weight_bias_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int)));
+  if (weight_bias_sums_ == nullptr) {
+    FreeTmpBuffer();
+    return RET_ERROR;
+  }
+
+  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_16_ * sizeof(int8_t));
+  memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t));
+  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
+  memset(weight_bias_sums_, 0, param_->batch * param_->col_align_ * sizeof(int));
+
+  return RET_OK;
+}
+
+int MatmulBaseInt8CPUKernel::InitBias() {
+  if (in_tensors_.size() == 3) {
+    auto bias_tensor = in_tensors_[2];
+    int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C4NUM);
+    bias_ptr_ = reinterpret_cast<int *>(malloc(max_bias_data * sizeof(int)));
+    if (bias_ptr_ == nullptr) {
+      MS_LOG(ERROR) << "Memory allocation failed";
+      FreeTmpBuffer();
+      return RET_MEMORY_FAILED;
+    }
+    memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(int));
+  } else {
+    bias_ptr_ = nullptr;
+  }
+  return RET_OK;
+}
+
+int MatmulBaseInt8CPUKernel::Init() {
+  auto ret = MallocQuantParam();
+  if (ret != RET_OK) {
+    FreeQuantParam();
+    return ret;
+  }
+
+  InitQuantParam();
+
+  ret = InitBias();
+  if (ret != RET_OK) {
+    FreeQuantParam();
+    return ret;
+  }
+
+  return RET_OK;
+}
+
+int MatmulBaseInt8CPUKernel::ReSize() {
+  FreeTmpBuffer();
+
+  ResizeParameter();
+
+  auto ret = InitTmpBuffer();
+  if (ret != RET_OK) {
+    FreeQuantParam();
+    return ret;
+  }
+
+  if (param_->b_const_ == true) {
+    TransferB();
+  }
+  return RET_OK;
+}
+
+int MatmulBaseInt8CPUKernel::Run() {
+  if (param_->b_const_ == false) {
+    TransferB();
+  }
+
+  int8_t *a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
+  int8_t *c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
+  int32_t tmp_weight_zp = filter_per_channel_ ?
1 : quant_.filter_zp_[0]; + for (int i = 0; i < param_->batch; i++) { + auto current_src_a = a_ptr + i * param_->row_ * param_->deep_; + if (param_->a_transpose_) { + RowMajor2Col16x4MajorInt8(current_src_a, param_->deep_, param_->row_, pack_a_ptr_); + CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, ColMajor); + } else { + RowMajor2Row16x4MajorInt8(current_src_a, pack_a_ptr_, param_->row_, param_->deep_); + CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, RowMajor); + } + + batch_b_ptr_ = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_; + batch_sums_ = weight_bias_sums_ + i * param_->col_align_; + batch_c_ptr_ = c_ptr + i * param_->row_ * param_->col_; + + auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseInt8Run, this, thread_count_); + if (ret != RET_OK) { + MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]"; + return ret; + } + } + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h new file mode 100644 index 0000000000..d125f2ea9b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h @@ -0,0 +1,82 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
+
+#include <vector>
+#include "include/errorcode.h"
+#include "include/context.h"
+#include "src/lite_kernel.h"
+#include "nnacl/matmul_parameter.h"
+#include "nnacl/common_func.h"
+#include "nnacl/int8/quantize.h"
+#include "nnacl/int8/common_func_int8.h"
+#include "nnacl/int8/matmul_int8.h"
+
+namespace mindspore::kernel {
+class MatmulBaseInt8CPUKernel : public LiteKernel {
+ public:
+  MatmulBaseInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                          const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                          const mindspore::lite::PrimitiveC *primitive)
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
+    param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  }
+  ~MatmulBaseInt8CPUKernel() override;
+  int Init() override;
+  int ReSize() override;
+  int Run() override;
+
+ public:
+  int RunImpl(int task_id);
+
+ protected:
+  void InitParameter();
+
+ private:
+  void ResizeParameter();
+  int InitBias();
+
+ private:
+  int InitTmpBuffer();
+  void FreeTmpBuffer();
+  void TransferA();
+  void TransferB();
+
+ private:
+  int MallocQuantParam();
+  void FreeQuantParam();
+  void InitQuantParam();
+
+ protected:
+  MatMulParameter *param_ = nullptr;
+  MatmulQuantParameter quant_;
+  int thread_count_ = 1;
+  int thread_stride_ = 0;
+  int8_t *pack_a_ptr_ = nullptr;
+  int8_t *pack_b_ptr_ = nullptr;
+  int *input_sums_ = nullptr;
+  int *weight_bias_sums_ = nullptr;
+  int *bias_ptr_ = nullptr;
+  bool filter_per_channel_ = true;
+  int8_t *batch_b_ptr_ = nullptr;
+  int8_t *batch_c_ptr_ = nullptr;
+  int *batch_sums_ = nullptr;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
index 42d6ddce2f..627df29b58 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
@@ -22,46 +22,27 @@
 #include "src/kernel_registry.h"
 
 using mindspore::lite::KernelRegistrar;
-using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;
 
 namespace mindspore::kernel {
-MatmulInt8CPUKernel::~MatmulInt8CPUKernel() { FreeTmpBuffer(); }
+int MatmulInt8CPUKernel::Init() {
+  InitParameter();
 
-void MatmulInt8CPUKernel::FreeTmpBuffer() {
-  if (a_r4x16_ptr_ != nullptr) {
-    context_->allocator->Free(a_r4x16_ptr_);
-    a_r4x16_ptr_ = nullptr;
-  }
-  if (input_sums_ != nullptr) {
-    context_->allocator->Free(input_sums_);
-    input_sums_ = nullptr;
-  }
-  if (b_c16x4_batch_ != nullptr) {
-    context_->allocator->Free(b_c16x4_batch_);
-    b_c16x4_batch_ = nullptr;
-  }
-  if (weight_bias_sums_batch_ != nullptr) {
-    context_->allocator->Free(weight_bias_sums_batch_);
-    weight_bias_sums_batch_ = nullptr;
-  }
-  if (bias_ptr_ != nullptr) {
-    context_->allocator->Free(bias_ptr_);
-    bias_ptr_ = nullptr;
+  auto ret = MatmulBaseInt8CPUKernel::Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel::Init failed";
+    return ret;
   }
-  return;
-}
 
-int MatmulInt8CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+
   return ReSize();
 }
 
 int MatmulInt8CPUKernel::ReSize() {
-  FreeTmpBuffer();
   int batch = 1;
   auto x_shape = in_tensors_.at(0)->shape();
   auto o_shape = out_tensors_.at(0)->shape();
@@ -69,159 +50,19 @@ int MatmulInt8CPUKernel::ReSize() {
   for (size_t i = 0; i < x_shape.size() - 2; ++i) {
     batch *= x_shape[i];
   }
-  params_->batch = batch;
+  param_->batch = batch;
   MS_ASSERT(o_shape.size() >= 2);
-  params_->row_ = o_shape[o_shape.size() - 2];
-  params_->col_ = o_shape[o_shape.size() - 1];
-  params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
-  params_->row_4_ = UP_ROUND(params_->row_, 4);
-  params_->col_4_ = UP_ROUND(params_->col_, 4);
-  params_->deep_16_ = UP_ROUND(params_->deep_, 16);
-  a_r4x16_ptr_ =
-    reinterpret_cast<int8_t *>(context_->allocator->Malloc(params_->row_4_ * params_->deep_16_ * sizeof(int8_t)));
-  if (!a_r4x16_ptr_) return RET_MEMORY_FAILED;
-  memset(a_r4x16_ptr_, 0, params_->row_4_ * params_->deep_16_ * sizeof(int8_t));
-  input_sums_ = reinterpret_cast<int *>(context_->allocator->Malloc(params_->row_4_ * sizeof(int)));
-  if (!input_sums_) return RET_MEMORY_FAILED;
-  memset(input_sums_, 0, params_->row_4_ * sizeof(int));
-  b_c16x4_batch_ = reinterpret_cast<int8_t *>(
-    context_->allocator->Malloc(params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t)));
-  if (!b_c16x4_batch_) return RET_MEMORY_FAILED;
-  memset(b_c16x4_batch_, 0, params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t));
-  weight_bias_sums_batch_ =
-    reinterpret_cast<int *>(context_->allocator->Malloc(params_->batch * params_->col_4_ * sizeof(int)));
-  if (!weight_bias_sums_batch_) return RET_MEMORY_FAILED;
-  memset(weight_bias_sums_batch_, 0, params_->batch * params_->col_4_ * sizeof(int));
-  if (in_tensors_.size() == 3) {
-    auto bias_size = params_->col_4_ * sizeof(int);
-    bias_ptr_ = reinterpret_cast<int *>(context_->allocator->Malloc(bias_size));
-    if (!bias_ptr_) return RET_MEMORY_FAILED;
-    memcpy(bias_ptr_, in_tensors_[2]->data_c(), bias_size);
-  } else {
-    bias_ptr_ = NULL;
-  }
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_4_, 4));
-  thread_stride_ = UP_DIV(UP_DIV(params_->col_4_, 4), thread_count_);
-
-  auto input_tensor = in_tensors_.at(0);
-  auto params = input_tensor->quant_params();
-  MS_ASSERT(params.size() == 1);
-  quant_params_.input.zp_ = params.front().zeroPoint;
-  quant_params_.input.scale_ = params.front().scale;
-  auto weight_tensor = in_tensors_.at(1);
-  params = weight_tensor->quant_params();
-  MS_ASSERT(params.size() == 1);
-  quant_params_.weight.zp_ = params.front().zeroPoint;
-  quant_params_.weight.scale_ = params.front().scale;
-  auto output_tensor = out_tensors_.at(0);
-  params = output_tensor->quant_params();
-  MS_ASSERT(params.size() == 1);
-  quant_params_.output.zp_ = params.front().zeroPoint;
-  quant_params_.output.scale_ = params.front().scale;
-
-  params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
-  if (params_->b_const_) {
-    auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
-    for (int i = 0; i < params_->batch; ++i) {
-      auto cur_b = b_ptr + i * params_->deep_ * params_->col_;
-      auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
-      auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_;
-      if (params_->b_transpose_) {
-        RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_);
-        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
-                           bias_ptr_, cur_sums, ColMajor, false);
-      } else {
-        RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack);
-        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
-                           bias_ptr_, cur_sums, RowMajor, false);
-      }
-    }
-  }
-  double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_;
-  QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift,
-                                            &quant_params_.right_shift);
-  return RET_OK;
-}
-
-int MatmulInt8CPUKernel::RunImpl(int task_id) {
-  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_4_, 4) - task_id * thread_stride_);
-  if (cur_oc <= 0) {
-    return RET_OK;
-  }
-  int cur_oc_res = MSMIN(thread_stride_ * C4NUM, params_->col_ - task_id * thread_stride_ * C4NUM);
-  auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * 4 * params_->deep_16_;
-  auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * 4;
-  auto cur_c = c_ptr_ + task_id * thread_stride_ * 4;
-
-  auto &p = quant_params_;
-#ifdef ENABLE_ARM64
-  MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, params_->row_4_, cur_oc * C4NUM, params_->deep_16_, input_sums_,
-                   cur_bias, INT8_MIN, INT8_MAX, p.output.zp_, &p.quant_multiplier, &p.left_shift, &p.right_shift,
-                   params_->row_, cur_oc_res, params_->col_ * sizeof(int8_t), false);
-#else
-  MatMulInt8_16x4_r(a_r4x16_ptr_, cur_b, cur_c, params_->row_, cur_oc_res, params_->deep_16_, params_->col_,
-                    input_sums_, cur_bias, &p.left_shift, &p.right_shift, &p.quant_multiplier, p.output.zp_, INT8_MIN,
-                    INT8_MAX, false);
-#endif
-
-  return RET_OK;
-}
+  param_->row_ = o_shape[o_shape.size() - 2];
+  param_->col_ = o_shape[o_shape.size() - 1];
+  param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
 
-int MatmulInt8Run(void *cdata, int task_id) {
-  auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata);
-  auto ret = op->RunImpl(task_id);
+  auto ret = MatmulBaseInt8CPUKernel::ReSize();
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
+    MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel::ReSize failed";
     return ret;
   }
   return RET_OK;
 }
 
-int MatmulInt8CPUKernel::Run() {
-  auto a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
-  auto c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
-  auto a_stride = params_->row_ * params_->deep_;
-  auto b_stride = params_->deep_ * params_->col_;
-  auto c_stride = params_->row_ * params_->col_;
-
-  if (!params_->b_const_) {
-    auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
-    for (int i = 0; i < params_->batch; ++i) {
-      auto cur_b = b_ptr + i * b_stride;
-      auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
-      auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_;
-      if (params_->b_transpose_) {
-        RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_);
-        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
-                           bias_ptr_, cur_sums, ColMajor, false);
-      } else {
-        RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack);
-        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
-                           bias_ptr_, cur_sums, RowMajor, false);
-      }
-    }
-  }
-
-  for (int i = 0; i < params_->batch; ++i) {
-    auto cur_a_ptr = a_ptr + i * a_stride;
-    if (params_->a_transpose_) {
-      RowMajor2Col16x4MajorInt8(cur_a_ptr, params_->deep_, params_->row_, a_r4x16_ptr_);
-      CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, ColMajor);
-    } else {
-      RowMajor2Row16x4MajorInt8(cur_a_ptr, a_r4x16_ptr_, params_->row_, params_->deep_);
-      CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor);
-    }
-    b_c16x4_ptr_ = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
-    weight_bias_sums_ = weight_bias_sums_batch_ + i * params_->col_4_;
-    c_ptr_ = c_ptr + i * c_stride;
-    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulInt8Run, this, thread_count_);
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]";
-      return ret;
-    }
-  }
-  return RET_OK;
-}
-
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MatMul, LiteKernelCreator<MatmulInt8CPUKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
index 601a2ac209..71f6d205b2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
@@ -22,39 +22,18 @@
 #include "nnacl/matmul_parameter.h"
 #include "mindspore/lite/nnacl/int8/quantize.h"
 #include "src/lite_kernel.h"
+#include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
 
-using mindspore::lite::InnerContext;
 namespace mindspore::kernel {
-class MatmulInt8CPUKernel : public LiteKernel {
+class MatmulInt8CPUKernel : public MatmulBaseInt8CPUKernel {
 public:
   MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                      const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
+                      const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                       const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
-    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  }
-  ~MatmulInt8CPUKernel() override;
+      : MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+  ~MatmulInt8CPUKernel() override = default;
   int Init() override;
   int ReSize() override;
-  int Run() override;
-  int RunImpl(int task_id);
-
- private:
-  void FreeTmpBuffer();
-
- private:
-  MatMulParameter *params_ = nullptr;
-  MatmulQuantArg quant_params_;
-  int8_t *a_r4x16_ptr_ = nullptr;
-  int8_t *b_c16x4_ptr_ = nullptr;
-  int8_t *c_ptr_ = nullptr;
-  int8_t *b_c16x4_batch_ = nullptr;
-  int *bias_ptr_ = nullptr;
-  int *input_sums_ = nullptr;
-  int *weight_bias_sums_ = nullptr;
-  int *weight_bias_sums_batch_ = nullptr;
-  int thread_stride_ = 0;
-  int thread_count_ = 0;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh
index 209a5029d6..79d07f75cd 100755
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@@ -599,9 +599,9 @@ function Run_x86() {
         echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile='${ms_models_path}'/'${model_name}'.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --accuracyThreshold=${accuracy_limit}' >> "${run_x86_log_file}"
         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile=${ms_models_path}/${model_name}_weightquant.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}"
         if [ $? = 0 ]; then
-            run_result='x86: '${model_name}'[weight quant] pass'; echo ${run_result} >> ${run_benchmark_result_file}
+            run_result='x86: '${model_name}'[weight_quant] pass'; echo ${run_result} >> ${run_benchmark_result_file}
         else
-            run_result='x86: '${model_name}'[weight quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
+            run_result='x86: '${model_name}'[weight_quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
         fi
     done < ${models_mindspore_weightquant_config}
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
index e9b57c3975..4130948a5f 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
@@ -79,58 +79,6 @@ void MMInt8TestInit(std::vector<lite::Tensor *> *inputs, std::vector
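
Note (not part of the patch): every int8 matmul kernel touched above reduces to the same requantization scheme, visible in the removed scalar fallback MatMulInt8_16x4_r: accumulate raw int8 products, fold the zero-point cross terms in through the precomputed input_sums_ / weight_bias_sums_ buffers, then rescale with a fixed-point multiplier. The standalone C sketch below illustrates that math only; requantize() merely approximates nnacl's MultiplyByQuantizedMultiplier (round-half-up shifts instead of gemmlowp's saturating doubling high-mul), it clamps to the full int8 range rather than the activation range, and all names in it are illustrative.

/* Sketch of the zero-point/input-sum trick used by the kernels in this patch.
 * Assumes per-tensor quantization and demo-sized accumulators. */
#include <stdint.h>
#include <stdio.h>

static int8_t requantize(int64_t acc, int32_t multiplier, int left_shift, int right_shift,
                         int32_t out_zp) {
  /* acc * multiplier / 2^31, then >> right_shift; approximates
   * MultiplyByQuantizedMultiplier with plain round-half-up shifts */
  int64_t v = (acc << left_shift) * multiplier;
  v = (v + ((int64_t)1 << 30)) >> 31;
  if (right_shift > 0) v = (v + ((int64_t)1 << (right_shift - 1))) >> right_shift;
  v += out_zp;
  if (v < -128) v = -128;  /* real kernels clamp to out_act_min_/out_act_max_ */
  if (v > 127) v = 127;
  return (int8_t)v;
}

/* C = requant((A - za)(B - zb) + bias): the za/zb cross terms are folded in via
 * per-row and per-column sums, exactly what input_sums_ (zb * row_sum) and
 * weight_bias_sums_ (bias - za * col_sum + deep * za * zb) precompute. */
static void matmul_int8_ref(const int8_t *a, const int8_t *b, const int32_t *bias, int8_t *c,
                            int row, int col, int deep, int32_t za, int32_t zb, int32_t zc,
                            int32_t mult, int ls, int rs) {
  for (int r = 0; r < row; r++) {
    int32_t row_sum = 0;
    for (int d = 0; d < deep; d++) row_sum += a[r * deep + d];
    for (int cc = 0; cc < col; cc++) {
      int32_t col_sum = 0, dot = 0;
      for (int d = 0; d < deep; d++) {
        col_sum += b[d * col + cc];
        dot += a[r * deep + d] * b[d * col + cc];
      }
      int64_t acc = (int64_t)dot - (int64_t)zb * row_sum - (int64_t)za * col_sum +
                    (int64_t)deep * za * zb + bias[cc];
      c[r * col + cc] = requantize(acc, mult, ls, rs, zc);
    }
  }
}

int main(void) {
  const int8_t a[4] = {1, 2, 3, 4};   /* 2x2, row-major */
  const int8_t b[4] = {5, 6, 7, 8};   /* 2x2, row-major */
  const int32_t bias[2] = {0, 0};
  int8_t c[4];
  /* za = 1, zb = 2, multiplier ~= 0.5 in Q31: prints "3 3 11 13" */
  matmul_int8_ref(a, b, bias, c, 2, 2, 2, 1, 2, 0, 1 << 30, 0, 0);
  printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);
  return 0;
}

The batching in MatmulBaseInt8CPUKernel::Run simply re-points batch_b_ptr_/batch_c_ptr_ at per-batch offsets of these same buffers before each ParallelLaunch, so the per-thread RunImpl only ever sees one batch worth of this arithmetic.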