From: @ling_qiao_min
Reviewed-by:
Signed-off-by:
Tag: tags/v1.2.0-rc1
| @@ -182,40 +182,6 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int | |||||
| return; | return; | ||||
| } | } | ||||
| void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||||
| bool peroc) { | |||||
| /* support per-layer && weight per-channel */ | |||||
| /* row4x16-major * row16x4-major => (int8)row-major*/ | |||||
| for (int r = 0; r < row; r++) { | |||||
| for (int c = 0; c < col; c++) { | |||||
| int r4div = r / C4NUM, r4mod = r % C4NUM; | |||||
| int c4div = c / C4NUM, c4mod = c % C4NUM; | |||||
| size_t ci = r * stride + c; | |||||
| int32_t value = 0; | |||||
| for (int d = 0; d < deep_16; d++) { | |||||
| int d16div = d / C16NUM, d16mod = d % C16NUM; | |||||
| size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; | |||||
| size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; | |||||
| value = value + a[ai] * b[bi]; | |||||
| } | |||||
| int32_t cur_input_sum = | |||||
| peroc ? input_sum[c4div * UP_ROUND(row, C4NUM) * C4NUM + r * C4NUM + c4mod] : input_sum[r]; | |||||
| value -= cur_input_sum; | |||||
| value += bias[c]; | |||||
| int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0]; | |||||
| int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0]; | |||||
| int32_t cur_multiplier = peroc ? multiplier[c] : multiplier[0]; | |||||
| value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; | |||||
| value = MSMIN(maxi, value); | |||||
| value = MSMAX(mini, value); | |||||
| dst[ci] = (int8_t)value; | |||||
| } | |||||
| } | |||||
| return; | |||||
| } | |||||
| void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | ||||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | ||||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | ||||
| @@ -353,6 +319,105 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row | |||||
| return; | return; | ||||
| } | } | ||||
#ifdef ENABLE_ARM64
/*
 * AArch64 kernel: packs four source rows into 4x4 int8 tiles and produces the
 * four per-row input sums scaled by the filter zero point.
 *
 * src_ic       first of four source rows; consecutive rows are `src_stride`
 *              bytes apart.
 * pack_ic      destination; receives one 16-byte tile (4 rows x 4 channels,
 *              row after row) per group of four channels, plus one
 *              zero-padded tile when ic_4res != 0.
 * input_sum_r  receives four int32 values: the sum of all channels of each
 *              row, multiplied by `filter_zp`.
 * ic_4div      number of channels covered by complete groups of four (the
 *              loop counter advances by 4 while it is below this value).
 * ic_4res      remaining channels after the complete groups; must be 0..3
 *              (any other value falls through into the full-tile path).
 *
 * The asm statement stores through pack_ic / input_sum_r, loads through
 * src_ic, and uses `cmp`, so it declares "memory" and "cc" clobbers.
 * `volatile` alone does not tell the compiler that memory or the condition
 * flags change, and without these clobbers it may keep stale values cached
 * across the statement or reorder surrounding memory accesses.
 */
void PackInput4x4AndInputSumPert_arm64(const int8_t *src_ic, int8_t *pack_ic, int32_t *input_sum_r, size_t src_stride,
                                       size_t ic_4div, size_t ic_4res, int32_t filter_zp) {
  asm volatile(
    "dup v2.4s, wzr \n" /* v2: running per-row sums */
    "mov x14, %[input_sum_r] \n"
    "dup v3.4s, %w[filter_zp] \n"
    "mov x10, %[src_ic] \n"
    "mov x11, %[pack_ic] \n"
    "mov x15, #0 \n" /* x15: channel counter */

    "1: \n"
    "cmp x15, %[ic_4div] \n"
    "add x15, x15, #4\n"
    "mov x12, x10 \n"
    "add x10, x10, #4\n"
    "blt 2f \n" /* flags still come from the cmp above */
    "cmp %[ic_4res], #0\n"
    "beq 6f \n"
    "cmp %[ic_4res], #1\n"
    "beq 3f \n"
    "cmp %[ic_4res], #2\n"
    "beq 4f \n"
    "cmp %[ic_4res], #3\n"
    "beq 5f \n"

    "2: \n" /* full group: 4 channels from each of the 4 rows */
    "ld1 {v0.s}[0], [x12], %[src_stride]\n"
    "ld1 {v0.s}[1], [x12], %[src_stride]\n"
    "ld1 {v0.s}[2], [x12], %[src_stride]\n"
    "ld1 {v0.s}[3], [x12], %[src_stride]\n"

    "st1 {v0.16b}, [x11], #16\n"

    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 1b \n"

    "3: \n" /* ic res 1 */
    "dup v0.4s, wzr \n"

    "ld1 {v0.b}[0], [x12], %[src_stride]\n"
    "ld1 {v0.b}[4], [x12], %[src_stride]\n"
    "ld1 {v0.b}[8], [x12], %[src_stride]\n"
    "ld1 {v0.b}[12], [x12], %[src_stride]\n"

    "st1 {v0.16b}, [x11], #16\n"
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 6f \n"

    "4: \n" /* ic res 2 */
    "dup v0.4s, wzr \n"

    "ld1 {v0.h}[0], [x12], %[src_stride]\n"
    "ld1 {v0.h}[2], [x12], %[src_stride]\n"
    "ld1 {v0.h}[4], [x12], %[src_stride]\n"
    "ld1 {v0.h}[6], [x12], %[src_stride]\n"

    "st1 {v0.16b}, [x11], #16\n"
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 6f \n"

    "5: \n" /* ic res 3 */
    "dup v0.4s, wzr \n"
    "add x13, x12, #2 \n" /* x13 addresses the third channel of each row */

    "ld1 {v0.h}[0], [x12], %[src_stride]\n"
    "ld1 {v0.b}[2], [x13], %[src_stride]\n"
    "ld1 {v0.h}[2], [x12], %[src_stride]\n"
    "ld1 {v0.b}[6], [x13], %[src_stride]\n"
    "ld1 {v0.h}[4], [x12], %[src_stride]\n"
    "ld1 {v0.b}[10], [x13], %[src_stride]\n"
    "ld1 {v0.h}[6], [x12], %[src_stride]\n"
    "ld1 {v0.b}[14], [x13], %[src_stride]\n"

    "st1 {v0.16b}, [x11], #16\n"
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 6f \n"

    "6: \n" /* scale the row sums by filter_zp and store them */
    "mul v2.4s, v2.4s, v3.4s \n"

    "st1 {v2.4s}, [x14], #16 \n"

    :
    : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r),
      [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp)
    : "cc", "memory", "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3");
  return;
}
#endif
| void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, | void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, | ||||
| size_t input_channel, size_t plane_size, int32_t filter_zp) { | size_t input_channel, size_t plane_size, int32_t filter_zp) { | ||||
| int ic4 = UP_ROUND(input_channel, C4NUM); | int ic4 = UP_ROUND(input_channel, C4NUM); | ||||
| @@ -370,99 +435,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, | |||||
| #ifdef ENABLE_ARM64 | #ifdef ENABLE_ARM64 | ||||
| size_t src_stride = input_channel; | size_t src_stride = input_channel; | ||||
| size_t ic_4res = input_channel - ic_4div; | size_t ic_4res = input_channel - ic_4div; | ||||
| asm volatile( | |||||
| "dup v2.4s, wzr \n" | |||||
| "mov x14, %[input_sum_r] \n" | |||||
| "dup v3.4s, %w[filter_zp] \n" | |||||
| "mov x10, %[src_ic] \n" | |||||
| "mov x11, %[pack_ic] \n" | |||||
| "mov x15, #0 \n" | |||||
| "1: \n" | |||||
| "cmp x15, %[ic_4div] \n" | |||||
| "add x15, x15, #4\n" | |||||
| "mov x12, x10 \n" | |||||
| "add x10, x10, #4\n" | |||||
| "blt 2f \n" | |||||
| "cmp %[ic_4res], #0\n" | |||||
| "beq 6f \n" | |||||
| "cmp %[ic_4res], #1\n" | |||||
| "beq 3f \n" | |||||
| "cmp %[ic_4res], #2\n" | |||||
| "beq 4f \n" | |||||
| "cmp %[ic_4res], #3\n" | |||||
| "beq 5f \n" | |||||
| "2: \n" | |||||
| "ld1 {v0.s}[0], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.s}[1], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.s}[2], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.s}[3], [x12], %[src_stride]\n" | |||||
| "st1 {v0.16b}, [x11], #16\n" | |||||
| "saddlp v1.8h, v0.16b \n" | |||||
| "saddlp v0.4s, v1.8h \n" | |||||
| "add v2.4s, v2.4s, v0.4s \n" | |||||
| "b 1b \n" | |||||
| "3: \n" /* ic res 1 */ | |||||
| "dup v0.4s, wzr \n" | |||||
| "ld1 {v0.b}[0], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[4], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[8], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[12], [x12], %[src_stride]\n" | |||||
| "st1 {v0.16b}, [x11], #16\n" | |||||
| "saddlp v1.8h, v0.16b \n" | |||||
| "saddlp v0.4s, v1.8h \n" | |||||
| "add v2.4s, v2.4s, v0.4s \n" | |||||
| "b 6f \n" | |||||
| "4: \n" /* ic res 2 */ | |||||
| "dup v0.4s, wzr \n" | |||||
| "ld1 {v0.h}[0], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.h}[2], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.h}[4], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.h}[6], [x12], %[src_stride]\n" | |||||
| "st1 {v0.16b}, [x11], #16\n" | |||||
| "saddlp v1.8h, v0.16b \n" | |||||
| "saddlp v0.4s, v1.8h \n" | |||||
| "add v2.4s, v2.4s, v0.4s \n" | |||||
| "b 6f \n" | |||||
| "5: \n" /* ic res 3 */ | |||||
| "dup v0.4s, wzr \n" | |||||
| "add x13, x12, #2 \n" | |||||
| "ld1 {v0.h}[0], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[2], [x13], %[src_stride]\n" | |||||
| "ld1 {v0.h}[2], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[6], [x13], %[src_stride]\n" | |||||
| "ld1 {v0.h}[4], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[10], [x13], %[src_stride]\n" | |||||
| "ld1 {v0.h}[6], [x12], %[src_stride]\n" | |||||
| "ld1 {v0.b}[14], [x13], %[src_stride]\n" | |||||
| "st1 {v0.16b}, [x11], #16\n" | |||||
| "saddlp v1.8h, v0.16b \n" | |||||
| "saddlp v0.4s, v1.8h \n" | |||||
| "add v2.4s, v2.4s, v0.4s \n" | |||||
| "b 6f \n" | |||||
| "6: \n" | |||||
| "mul v2.4s, v2.4s, v3.4s \n" | |||||
| "st1 {v2.4s}, [x14], #16 \n" | |||||
| : | |||||
| : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r), | |||||
| [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp) | |||||
| : "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3"); | |||||
| PackInput4x4AndInputSumPert_arm64(src_ic, pack_ic, input_sum_r, src_stride, ic_4div, ic_4res, filter_zp); | |||||
| #else | #else | ||||
| int32_t tmp_sum_value[4] = {0}; | int32_t tmp_sum_value[4] = {0}; | ||||
| for (int ici = 0; ici < ic_4div; ici += C4NUM) { | for (int ici = 0; ici < ic_4div; ici += C4NUM) { | ||||
| @@ -25,12 +25,9 @@ | |||||
| extern "C" { | extern "C" { | ||||
| #endif | #endif | ||||
| /* 4x16 16x4 -> 4x4 */ | /* 4x16 16x4 -> 4x4 */ | ||||
| /* matmul */ | |||||
| void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | ||||
| const int *input_sum, const int *bias); | const int *input_sum, const int *bias); | ||||
| void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||||
| bool per_channel); | |||||
| void RowMajor2Row16x4MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | void RowMajor2Row16x4MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | ||||
| void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst); | void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst); | ||||
| void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order); | void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order); | ||||
| @@ -41,6 +38,7 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c | |||||
| int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp); | int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp); | ||||
| /* 8x4 4x8 -> 8x8 */ | /* 8x4 4x8 -> 8x8 */ | ||||
| /* optimize conv */ | |||||
| void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | ||||
| void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, | void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, | ||||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | ||||
| @@ -48,6 +46,7 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, | |||||
| size_t per_channel); | size_t per_channel); | ||||
| /* 4x16 16x2 -> 4x2 */ | /* 4x16 16x2 -> 4x2 */ | ||||
| /* arm32 conv1x1 */ | |||||
| void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | ||||
| void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | ||||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | ||||
| @@ -55,6 +54,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, | |||||
| bool peroc); | bool peroc); | ||||
| /* 4x4 4x16 -> 4x16 */ | /* 4x4 4x16 -> 4x16 */ | ||||
| /* optimize conv1x1 */ | |||||
| void RowMajor2Row4x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | void RowMajor2Row4x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | ||||
| void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, | void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, | ||||
| size_t input_channel, size_t plane_size, int32_t filter_zp); | size_t input_channel, size_t plane_size, int32_t filter_zp); | ||||
| @@ -66,17 +66,6 @@ typedef struct PreluQuantArg { | |||||
| QuantArg out_quant_args_; | QuantArg out_quant_args_; | ||||
| } PreluQuantArg; | } PreluQuantArg; | ||||
| typedef struct MatmulQuantArg { /* Matmul quantization arguments held as plain scalars (no per-channel arrays). */ | |||||
| QuantArg input; /* QuantArg of the input operand (per field name; confirm against users) */ | |||||
| QuantArg weight; /* QuantArg of the weight operand (per field name; confirm against users) */ | |||||
| QuantArg output; /* QuantArg of the output (per field name; confirm against users) */ | |||||
| int32_t out_act_min; /* presumably the lower clamp bound of the quantized output - TODO confirm */ | |||||
| int32_t out_act_max; /* presumably the upper clamp bound of the quantized output - TODO confirm */ | |||||
| int32_t left_shift; /* single left-shift value; exact requantization semantics are defined by the users */ | |||||
| int32_t right_shift; /* single right-shift value; exact requantization semantics are defined by the users */ | |||||
| int32_t quant_multiplier; /* single fixed-point multiplier value; semantics defined by the users */ | |||||
| } MatmulQuantArg; | |||||
| typedef struct CropQuantArg { | typedef struct CropQuantArg { | ||||
| QuantArg in_args_; | QuantArg in_args_; | ||||
| QuantArg out_args_; | QuantArg out_args_; | ||||
| @@ -73,4 +73,15 @@ typedef struct MatmulQuantParameter { | |||||
| int32_t *quant_multiplier_; | int32_t *quant_multiplier_; | ||||
| } MatmulQuantParameter; | } MatmulQuantParameter; | ||||
| typedef struct MatmulQuantArg { /* Matmul quantization arguments held as plain scalars (no per-channel arrays). */ | |||||
| QuantArg input; /* QuantArg of the input operand (per field name; confirm against users) */ | |||||
| QuantArg weight; /* QuantArg of the weight operand (per field name; confirm against users) */ | |||||
| QuantArg output; /* QuantArg of the output (per field name; confirm against users) */ | |||||
| int32_t out_act_min; /* presumably the lower clamp bound of the quantized output - TODO confirm */ | |||||
| int32_t out_act_max; /* presumably the upper clamp bound of the quantized output - TODO confirm */ | |||||
| int32_t left_shift; /* single left-shift value; exact requantization semantics are defined by the users */ | |||||
| int32_t right_shift; /* single right-shift value; exact requantization semantics are defined by the users */ | |||||
| int32_t quant_multiplier; /* single fixed-point multiplier value; semantics defined by the users */ | |||||
| } MatmulQuantArg; | |||||
| #endif // MINDSPORE_LITE_NNACL_MATMUL_H_ | #endif // MINDSPORE_LITE_NNACL_MATMUL_H_ | ||||
| @@ -67,10 +67,5 @@ int FullconnectionCPUKernel::ReSize() { | |||||
| return MatmulFp32BaseCPUKernel::ReSize(); | return MatmulFp32BaseCPUKernel::ReSize(); | ||||
| } | } | ||||
| int FullconnectionCPUKernel::Run() { | |||||
| MatmulFp32BaseCPUKernel::Run(); | |||||
| return RET_OK; | |||||
| } | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionCPUKernel>) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionCPUKernel>) | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -33,7 +33,6 @@ class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel { | |||||
| ~FullconnectionCPUKernel() = default; | ~FullconnectionCPUKernel() = default; | ||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ | #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_ | ||||
| @@ -15,258 +15,44 @@ | |||||
| */ | */ | ||||
| #include "src/runtime/kernel/arm/int8/fullconnection_int8.h" | #include "src/runtime/kernel/arm/int8/fullconnection_int8.h" | ||||
| #include "src/runtime/runtime_api.h" | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_MEMORY_FAILED; | |||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::schema::PrimitiveType_FullConnection; | using mindspore::schema::PrimitiveType_FullConnection; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| void FullconnectionInt8CPUKernel::FreeQuantParam() { | |||||
| if (quant_.filter_scale_ != nullptr) { | |||||
| free(quant_.filter_scale_); | |||||
| quant_.filter_scale_ = nullptr; | |||||
| } | |||||
| if (quant_.filter_zp_ != nullptr) { | |||||
| free(quant_.filter_zp_); | |||||
| quant_.filter_zp_ = nullptr; | |||||
| } | |||||
| if (quant_.left_shift_ != nullptr) { | |||||
| free(quant_.left_shift_); | |||||
| quant_.left_shift_ = nullptr; | |||||
| } | |||||
| if (quant_.right_shift_ != nullptr) { | |||||
| free(quant_.right_shift_); | |||||
| quant_.right_shift_ = nullptr; | |||||
| } | |||||
| if (quant_.quant_multiplier_ != nullptr) { | |||||
| free(quant_.quant_multiplier_); | |||||
| quant_.quant_multiplier_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| void FullconnectionInt8CPUKernel::FreeTmpBuffer() { | |||||
| if (pack_a_ptr_ != nullptr) { | |||||
| free(pack_a_ptr_); | |||||
| pack_a_ptr_ = nullptr; | |||||
| } | |||||
| if (pack_b_ptr_ != nullptr) { | |||||
| free(pack_b_ptr_); | |||||
| pack_b_ptr_ = nullptr; | |||||
| } | |||||
| if (input_sums_ != nullptr) { | |||||
| free(input_sums_); | |||||
| input_sums_ = nullptr; | |||||
| } | |||||
| if (weight_bias_sums_ != nullptr) { | |||||
| free(weight_bias_sums_); | |||||
| weight_bias_sums_ = nullptr; | |||||
| } | |||||
| if (bias_ptr_ != nullptr) { | |||||
| free(bias_ptr_); | |||||
| bias_ptr_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int FullconnectionInt8CPUKernel::MallocQuantParam() { | |||||
| auto weight_tensor = in_tensors_.at(1); | |||||
| auto weight_quant_params = weight_tensor->quant_params(); | |||||
| int col = weight_tensor->shape().front(); | |||||
| filter_per_channel_ = (weight_quant_params.size() > 1); | |||||
| int init_size = filter_per_channel_ ? col : 1; | |||||
| int FullconnectionInt8CPUKernel::Init() { | |||||
| param_->batch = 1; | |||||
| param_->a_transpose_ = false; | |||||
| param_->b_transpose_ = true; | |||||
| quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float))); | |||||
| if (quant_.filter_scale_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.filter_zp_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.left_shift_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.right_shift_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.quant_multiplier_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| InitParameter(); | |||||
| int FullconnectionInt8CPUKernel::Init() { | |||||
| auto ret = MallocQuantParam(); | |||||
| auto ret = MatmulBaseInt8CPUKernel::Init(); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| FreeQuantParam(); | |||||
| MS_LOG(ERROR) << "ParallelLaunch failed"; | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| auto in_quant_params = in_tensors_.at(0)->quant_params(); | |||||
| quant_.input_.zp_ = in_quant_params.front().zeroPoint; | |||||
| quant_.input_.scale_ = in_quant_params.front().scale; | |||||
| auto out_quant_params = out_tensors_.at(0)->quant_params(); | |||||
| quant_.output_.zp_ = out_quant_params.front().zeroPoint; | |||||
| quant_.output_.scale_ = out_quant_params.front().scale; | |||||
| auto weight_tensor = in_tensors_.at(1); | |||||
| fc_param_->b_const_ = (weight_tensor->data_c() != nullptr); | |||||
| int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1; | |||||
| auto weight_quant_params = weight_tensor->quant_params(); | |||||
| for (int i = 0; i < weight_quant_num; i++) { | |||||
| quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint; | |||||
| quant_.filter_scale_[i] = weight_quant_params[i].scale; | |||||
| } | |||||
| for (int i = 0; i < weight_quant_num; ++i) { | |||||
| const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]); | |||||
| double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_); | |||||
| QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i], | |||||
| &quant_.right_shift_[i]); | |||||
| } | |||||
| CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6, | |||||
| quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_, | |||||
| &quant_.out_act_max_); | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| void FullconnectionInt8CPUKernel::InitParam() { | |||||
| int FullconnectionInt8CPUKernel::ReSize() { | |||||
| int row = 1; | int row = 1; | ||||
| for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) { | for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) { | ||||
| row *= (out_tensors_.at(0)->shape()).at(i); | row *= (out_tensors_.at(0)->shape()).at(i); | ||||
| } | } | ||||
| fc_param_->row_ = row; | |||||
| fc_param_->col_ = out_tensors_.at(0)->shape().back(); | |||||
| fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1); | |||||
| param_->row_ = row; | |||||
| param_->col_ = out_tensors_.at(0)->shape().back(); | |||||
| param_->deep_ = (in_tensors_.at(1)->shape()).at(1); | |||||
| fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM); | |||||
| fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM); | |||||
| fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM); | |||||
| fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM); | |||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM)); | |||||
| thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_4_, C4NUM), thread_count_); | |||||
| return; | |||||
| } | |||||
| int FullconnectionInt8CPUKernel::ReSize() { | |||||
| FreeTmpBuffer(); | |||||
| InitParam(); | |||||
| pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t))); | |||||
| if (pack_a_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| pack_b_ptr_ = reinterpret_cast<int8_t *>(malloc(fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t))); | |||||
| if (pack_b_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| input_sums_ = reinterpret_cast<int *>(malloc(fc_param_->row_4_ * sizeof(int))); | |||||
| if (input_sums_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| weight_bias_sums_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int))); | |||||
| if (weight_bias_sums_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(pack_a_ptr_, 0, fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t)); | |||||
| memset(pack_b_ptr_, 0, fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t)); | |||||
| memset(input_sums_, 0, fc_param_->row_4_ * sizeof(int)); | |||||
| memset(weight_bias_sums_, 0, fc_param_->col_4_ * sizeof(int)); | |||||
| if (in_tensors_.size() == 3) { | |||||
| bias_ptr_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int))); | |||||
| if (bias_ptr_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Memory allocation failed"; | |||||
| FreeTmpBuffer(); | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| memcpy(bias_ptr_, in_tensors_.at(2)->data_c(), fc_param_->col_ * sizeof(int)); | |||||
| } else { | |||||
| bias_ptr_ = nullptr; | |||||
| } | |||||
| if (fc_param_->b_const_) { | |||||
| auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()); | |||||
| RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_); | |||||
| CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_, | |||||
| weight_bias_sums_, ColMajor, filter_per_channel_); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int FullconnectionInt8CPUKernel::RunImpl(int task_id) { | |||||
| int stride = thread_stride_ * C4NUM; | |||||
| int cur_stride = task_id * stride; | |||||
| int res_stride = fc_param_->col_ - cur_stride; | |||||
| int cur_oc = MSMIN(stride, res_stride); | |||||
| if (cur_oc <= 0) { | |||||
| return RET_OK; | |||||
| } | |||||
| int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_; | |||||
| int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_; | |||||
| int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_; | |||||
| int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_; | |||||
| MatmulInt8Opt(pack_a_ptr_, pack_b_ptr_ + cur_stride * fc_param_->deep_16_, c_ptr_ + cur_stride, fc_param_->row_, | |||||
| cur_oc, fc_param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_, | |||||
| quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, fc_param_->col_, | |||||
| filter_per_channel_, cur_zp); | |||||
| return RET_OK; | |||||
| } | |||||
| int FcInt8Run(void *cdata, int task_id) { | |||||
| auto fc = reinterpret_cast<FullconnectionInt8CPUKernel *>(cdata); | |||||
| auto ret = fc->RunImpl(task_id); | |||||
| auto ret = MatmulBaseInt8CPUKernel::ReSize(); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "FcInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int FullconnectionInt8CPUKernel::Run() { | |||||
| auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c()); | |||||
| RowMajor2Row16x4MajorInt8(input_ptr, pack_a_ptr_, fc_param_->row_, fc_param_->deep_); | |||||
| int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0]; | |||||
| CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, tmp_weight_zp, input_sums_, RowMajor); | |||||
| if (!fc_param_->b_const_) { | |||||
| auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()); | |||||
| RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_); | |||||
| CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_, | |||||
| weight_bias_sums_, ColMajor, filter_per_channel_); | |||||
| } | |||||
| c_ptr_ = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c()); | |||||
| auto ret = ParallelLaunch(this->context_->thread_pool_, FcInt8Run, this, thread_count_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "ParallelLaunch failed"; | |||||
| MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel failed"; | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -18,52 +18,19 @@ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_FULLCONNECTION_INT8_H_ | #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_FULLCONNECTION_INT8_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include "src/lite_kernel.h" | |||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| #include "mindspore/lite/nnacl/int8/quantize.h" | |||||
| #include "nnacl/common_func.h" | |||||
| #include "nnacl/int8/common_func_int8.h" | |||||
| #include "nnacl/int8/matmul_int8.h" | |||||
| #include "src/runtime/kernel/arm/int8/matmul_base_int8.h" | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| class FullconnectionInt8CPUKernel : public LiteKernel { | |||||
| class FullconnectionInt8CPUKernel : public MatmulBaseInt8CPUKernel { | |||||
| public: | public: | ||||
| FullconnectionInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | FullconnectionInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx, | const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx, | ||||
| const mindspore::lite::PrimitiveC *primitive) | const mindspore::lite::PrimitiveC *primitive) | ||||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||||
| fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||||
| } | |||||
| ~FullconnectionInt8CPUKernel() override { | |||||
| FreeTmpBuffer(); | |||||
| FreeQuantParam(); | |||||
| } | |||||
| : MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~FullconnectionInt8CPUKernel() override = default; | |||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | |||||
| public: | |||||
| int RunImpl(int task_id); | |||||
| private: | |||||
| void InitParam(); | |||||
| void FreeTmpBuffer(); | |||||
| void FreeQuantParam(); | |||||
| int MallocQuantParam(); | |||||
| private: | |||||
| MatMulParameter *fc_param_ = nullptr; | |||||
| MatmulQuantParameter quant_; | |||||
| int thread_count_ = 1; | |||||
| int thread_stride_ = 0; | |||||
| int8_t *pack_a_ptr_ = nullptr; | |||||
| int8_t *pack_b_ptr_ = nullptr; | |||||
| int8_t *c_ptr_ = nullptr; | |||||
| int *input_sums_ = nullptr; | |||||
| int *weight_bias_sums_ = nullptr; | |||||
| int *bias_ptr_ = nullptr; | |||||
| bool filter_per_channel_ = true; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -0,0 +1,323 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/arm/int8/matmul_base_int8.h" | |||||
| #include "src/runtime/runtime_api.h" | |||||
| using mindspore::lite::RET_ERROR; | |||||
| using mindspore::lite::RET_MEMORY_FAILED; | |||||
| using mindspore::lite::RET_OK; | |||||
| namespace mindspore::kernel { | |||||
| int MatmulBaseInt8Run(void *cdata, int task_id) { | |||||
| auto op = reinterpret_cast<MatmulBaseInt8CPUKernel *>(cdata); | |||||
| auto ret = op->RunImpl(task_id); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::RunImpl(int task_id) { | |||||
| int stride = thread_stride_ * C4NUM; | |||||
| int cur_stride = task_id * stride; | |||||
| int res_stride = param_->col_ - cur_stride; | |||||
| int cur_oc = MSMIN(stride, res_stride); | |||||
| if (cur_oc <= 0) { | |||||
| return RET_OK; | |||||
| } | |||||
| int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_; | |||||
| int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_; | |||||
| int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_; | |||||
| int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_; | |||||
| MatmulInt8Opt(pack_a_ptr_, batch_b_ptr_ + cur_stride * param_->deep_16_, batch_c_ptr_ + cur_stride, param_->row_, | |||||
| cur_oc, param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_, | |||||
| quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, param_->col_, | |||||
| filter_per_channel_, cur_zp); | |||||
| return RET_OK; | |||||
| } | |||||
| MatmulBaseInt8CPUKernel::~MatmulBaseInt8CPUKernel() { | |||||
| FreeQuantParam(); | |||||
| FreeTmpBuffer(); | |||||
| if (bias_ptr_ != nullptr) { | |||||
| free(bias_ptr_); | |||||
| bias_ptr_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| void MatmulBaseInt8CPUKernel::FreeQuantParam() { | |||||
| if (quant_.filter_scale_ != nullptr) { | |||||
| free(quant_.filter_scale_); | |||||
| quant_.filter_scale_ = nullptr; | |||||
| } | |||||
| if (quant_.filter_zp_ != nullptr) { | |||||
| free(quant_.filter_zp_); | |||||
| quant_.filter_zp_ = nullptr; | |||||
| } | |||||
| if (quant_.left_shift_ != nullptr) { | |||||
| free(quant_.left_shift_); | |||||
| quant_.left_shift_ = nullptr; | |||||
| } | |||||
| if (quant_.right_shift_ != nullptr) { | |||||
| free(quant_.right_shift_); | |||||
| quant_.right_shift_ = nullptr; | |||||
| } | |||||
| if (quant_.quant_multiplier_ != nullptr) { | |||||
| free(quant_.quant_multiplier_); | |||||
| quant_.quant_multiplier_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::MallocQuantParam() { | |||||
| auto weight_tensor = in_tensors_.at(1); | |||||
| auto weight_quant_params = weight_tensor->quant_params(); | |||||
| int col = weight_tensor->shape().front(); | |||||
| filter_per_channel_ = (weight_quant_params.size() > 1); | |||||
| int init_size = filter_per_channel_ ? col : 1; | |||||
| quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float))); | |||||
| if (quant_.filter_scale_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.filter_zp_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.left_shift_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.right_shift_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t))); | |||||
| if (quant_.quant_multiplier_ == nullptr) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void MatmulBaseInt8CPUKernel::InitQuantParam() { | |||||
| auto in_quant_params = in_tensors_.at(0)->quant_params(); | |||||
| quant_.input_.zp_ = in_quant_params.front().zeroPoint; | |||||
| quant_.input_.scale_ = in_quant_params.front().scale; | |||||
| auto out_quant_params = out_tensors_.at(0)->quant_params(); | |||||
| quant_.output_.zp_ = out_quant_params.front().zeroPoint; | |||||
| quant_.output_.scale_ = out_quant_params.front().scale; | |||||
| auto weight_tensor = in_tensors_.at(1); | |||||
| int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1; | |||||
| auto weight_quant_params = weight_tensor->quant_params(); | |||||
| for (int i = 0; i < weight_quant_num; i++) { | |||||
| quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint; | |||||
| quant_.filter_scale_[i] = weight_quant_params[i].scale; | |||||
| } | |||||
| for (int i = 0; i < weight_quant_num; ++i) { | |||||
| const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]); | |||||
| double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_); | |||||
| QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i], | |||||
| &quant_.right_shift_[i]); | |||||
| } | |||||
| CalculateActivationRangeQuantized(param_->act_type_ == ActType_Relu, param_->act_type_ == ActType_Relu6, | |||||
| quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_, | |||||
| &quant_.out_act_max_); | |||||
| } | |||||
| void MatmulBaseInt8CPUKernel::InitParameter() { | |||||
| param_->a_const_ = (in_tensors_[0]->data_c() != nullptr); | |||||
| param_->b_const_ = (in_tensors_[1]->data_c() != nullptr); | |||||
| return; | |||||
| } | |||||
| void MatmulBaseInt8CPUKernel::ResizeParameter() { | |||||
| param_->row_align_ = UP_ROUND(param_->row_, C4NUM); | |||||
| param_->col_align_ = UP_ROUND(param_->col_, C4NUM); | |||||
| param_->deep_16_ = UP_ROUND(param_->deep_, C16NUM); | |||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(param_->col_align_, C4NUM)); | |||||
| thread_stride_ = UP_DIV(UP_DIV(param_->col_align_, C4NUM), thread_count_); | |||||
| return; | |||||
| } | |||||
| void MatmulBaseInt8CPUKernel::FreeTmpBuffer() { | |||||
| if (pack_a_ptr_ != nullptr) { | |||||
| free(pack_a_ptr_); | |||||
| pack_a_ptr_ = nullptr; | |||||
| } | |||||
| if (pack_b_ptr_ != nullptr) { | |||||
| free(pack_b_ptr_); | |||||
| pack_b_ptr_ = nullptr; | |||||
| } | |||||
| if (input_sums_ != nullptr) { | |||||
| free(input_sums_); | |||||
| input_sums_ = nullptr; | |||||
| } | |||||
| if (weight_bias_sums_ != nullptr) { | |||||
| free(weight_bias_sums_); | |||||
| weight_bias_sums_ = nullptr; | |||||
| } | |||||
| return; | |||||
| } | |||||
| void MatmulBaseInt8CPUKernel::TransferB() { | |||||
| auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()); | |||||
| for (int i = 0; i < param_->batch; i++) { | |||||
| auto current_weight = weight_data + i * param_->deep_ * param_->col_; | |||||
| auto current_b_pack = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_; | |||||
| auto current_sums = weight_bias_sums_ + i * param_->col_align_; | |||||
| if (param_->b_transpose_) { | |||||
| RowMajor2Row16x4MajorInt8(current_weight, current_b_pack, param_->col_, param_->deep_); | |||||
| CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_, | |||||
| current_sums, ColMajor, filter_per_channel_); | |||||
| } else { | |||||
| RowMajor2Col16x4MajorInt8(current_weight, param_->deep_, param_->col_, current_b_pack); | |||||
| CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_, | |||||
| current_sums, RowMajor, false); | |||||
| } | |||||
| } | |||||
| return; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::InitTmpBuffer() { | |||||
| pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_16_ * sizeof(int8_t))); | |||||
| if (pack_a_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| pack_b_ptr_ = | |||||
| reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t))); | |||||
| if (pack_b_ptr_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int))); | |||||
| if (input_sums_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| weight_bias_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int))); | |||||
| if (weight_bias_sums_ == nullptr) { | |||||
| FreeTmpBuffer(); | |||||
| return RET_ERROR; | |||||
| } | |||||
| memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_16_ * sizeof(int8_t)); | |||||
| memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t)); | |||||
| memset(input_sums_, 0, param_->row_align_ * sizeof(int)); | |||||
| memset(weight_bias_sums_, 0, param_->batch * param_->col_align_ * sizeof(int)); | |||||
| return RET_OK; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::InitBias() { | |||||
| if (in_tensors_.size() == 3) { | |||||
| auto bias_tensor = in_tensors_[2]; | |||||
| int max_bias_data = UP_ROUND(bias_tensor->ElementsNum(), C4NUM); | |||||
| bias_ptr_ = reinterpret_cast<int *>(malloc(max_bias_data * sizeof(int))); | |||||
| if (bias_ptr_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Memory allocation failed"; | |||||
| FreeTmpBuffer(); | |||||
| return RET_MEMORY_FAILED; | |||||
| } | |||||
| memcpy(bias_ptr_, bias_tensor->data_c(), bias_tensor->ElementsNum() * sizeof(int)); | |||||
| } else { | |||||
| bias_ptr_ = nullptr; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::Init() { | |||||
| auto ret = MallocQuantParam(); | |||||
| if (ret != RET_OK) { | |||||
| FreeQuantParam(); | |||||
| return ret; | |||||
| } | |||||
| InitQuantParam(); | |||||
| ret = InitBias(); | |||||
| if (ret != RET_OK) { | |||||
| FreeQuantParam(); | |||||
| return ret; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::ReSize() { | |||||
| FreeTmpBuffer(); | |||||
| ResizeParameter(); | |||||
| auto ret = InitTmpBuffer(); | |||||
| if (ret != RET_OK) { | |||||
| FreeQuantParam(); | |||||
| return ret; | |||||
| } | |||||
| if (param_->b_const_ == true) { | |||||
| TransferB(); | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int MatmulBaseInt8CPUKernel::Run() { | |||||
| if (param_->b_const_ == false) { | |||||
| TransferB(); | |||||
| } | |||||
| int8_t *a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c()); | |||||
| int8_t *c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c()); | |||||
| int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0]; | |||||
| for (int i = 0; i < param_->batch; i++) { | |||||
| auto current_src_a = a_ptr + i * param_->row_ * param_->deep_; | |||||
| if (param_->a_transpose_) { | |||||
| RowMajor2Col16x4MajorInt8(current_src_a, param_->deep_, param_->row_, pack_a_ptr_); | |||||
| CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, ColMajor); | |||||
| } else { | |||||
| RowMajor2Row16x4MajorInt8(current_src_a, pack_a_ptr_, param_->row_, param_->deep_); | |||||
| CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, RowMajor); | |||||
| } | |||||
| batch_b_ptr_ = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_; | |||||
| batch_sums_ = weight_bias_sums_ + i * param_->col_align_; | |||||
| batch_c_ptr_ = c_ptr + i * param_->row_ * param_->col_; | |||||
| auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseInt8Run, this, thread_count_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]"; | |||||
| return ret; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| } // namespace mindspore::kernel | |||||
| @@ -0,0 +1,82 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_ | |||||
| #include <vector> | |||||
| #include "include/errorcode.h" | |||||
| #include "include/context.h" | |||||
| #include "src/lite_kernel.h" | |||||
| #include "nnacl/matmul_parameter.h" | |||||
| #include "nnacl/common_func.h" | |||||
| #include "nnacl/int8/quantize.h" | |||||
| #include "nnacl/int8/common_func_int8.h" | |||||
| #include "nnacl/int8/matmul_int8.h" | |||||
| namespace mindspore::kernel { | |||||
| class MatmulBaseInt8CPUKernel : public LiteKernel { | |||||
| public: | |||||
| MatmulBaseInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||||
| const mindspore::lite::PrimitiveC *primitive) | |||||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||||
| param_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||||
| } | |||||
| ~MatmulBaseInt8CPUKernel() override; | |||||
| int Init() override; | |||||
| int ReSize() override; | |||||
| int Run() override; | |||||
| public: | |||||
| int RunImpl(int task_id); | |||||
| protected: | |||||
| void InitParameter(); | |||||
| private: | |||||
| void ResizeParameter(); | |||||
| int InitBias(); | |||||
| private: | |||||
| int InitTmpBuffer(); | |||||
| void FreeTmpBuffer(); | |||||
| void TransferA(); | |||||
| void TransferB(); | |||||
| private: | |||||
| int MallocQuantParam(); | |||||
| void FreeQuantParam(); | |||||
| void InitQuantParam(); | |||||
| protected: | |||||
| MatMulParameter *param_ = nullptr; | |||||
| MatmulQuantParameter quant_; | |||||
| int thread_count_ = 1; | |||||
| int thread_stride_ = 0; | |||||
| int8_t *pack_a_ptr_ = nullptr; | |||||
| int8_t *pack_b_ptr_ = nullptr; | |||||
| int *input_sums_ = nullptr; | |||||
| int *weight_bias_sums_ = nullptr; | |||||
| int *bias_ptr_ = nullptr; | |||||
| bool filter_per_channel_ = true; | |||||
| int8_t *batch_b_ptr_ = nullptr; | |||||
| int8_t *batch_c_ptr_ = nullptr; | |||||
| int *batch_sums_ = nullptr; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_ | |||||
| @@ -22,46 +22,27 @@ | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::RET_MEMORY_FAILED; | |||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::schema::PrimitiveType_MatMul; | using mindspore::schema::PrimitiveType_MatMul; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| MatmulInt8CPUKernel::~MatmulInt8CPUKernel() { FreeTmpBuffer(); } | |||||
| int MatmulInt8CPUKernel::Init() { | |||||
| InitParameter(); | |||||
| void MatmulInt8CPUKernel::FreeTmpBuffer() { | |||||
| if (a_r4x16_ptr_ != nullptr) { | |||||
| context_->allocator->Free(a_r4x16_ptr_); | |||||
| a_r4x16_ptr_ = nullptr; | |||||
| } | |||||
| if (input_sums_ != nullptr) { | |||||
| context_->allocator->Free(input_sums_); | |||||
| input_sums_ = nullptr; | |||||
| } | |||||
| if (b_c16x4_batch_ != nullptr) { | |||||
| context_->allocator->Free(b_c16x4_batch_); | |||||
| b_c16x4_batch_ = nullptr; | |||||
| } | |||||
| if (weight_bias_sums_batch_ != nullptr) { | |||||
| context_->allocator->Free(weight_bias_sums_batch_); | |||||
| weight_bias_sums_batch_ = nullptr; | |||||
| } | |||||
| if (bias_ptr_ != nullptr) { | |||||
| context_->allocator->Free(bias_ptr_); | |||||
| bias_ptr_ = nullptr; | |||||
| auto ret = MatmulBaseInt8CPUKernel::Init(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "ParallelLaunch failed"; | |||||
| return ret; | |||||
| } | } | ||||
| return; | |||||
| } | |||||
| int MatmulInt8CPUKernel::Init() { | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| return ReSize(); | return ReSize(); | ||||
| } | } | ||||
| int MatmulInt8CPUKernel::ReSize() { | int MatmulInt8CPUKernel::ReSize() { | ||||
| FreeTmpBuffer(); | |||||
| int batch = 1; | int batch = 1; | ||||
| auto x_shape = in_tensors_.at(0)->shape(); | auto x_shape = in_tensors_.at(0)->shape(); | ||||
| auto o_shape = out_tensors_.at(0)->shape(); | auto o_shape = out_tensors_.at(0)->shape(); | ||||
| @@ -69,159 +50,19 @@ int MatmulInt8CPUKernel::ReSize() { | |||||
| for (size_t i = 0; i < x_shape.size() - 2; ++i) { | for (size_t i = 0; i < x_shape.size() - 2; ++i) { | ||||
| batch *= x_shape[i]; | batch *= x_shape[i]; | ||||
| } | } | ||||
| params_->batch = batch; | |||||
| param_->batch = batch; | |||||
| MS_ASSERT(o_shape.size() >= 2); | MS_ASSERT(o_shape.size() >= 2); | ||||
| params_->row_ = o_shape[o_shape.size() - 2]; | |||||
| params_->col_ = o_shape[o_shape.size() - 1]; | |||||
| params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1]; | |||||
| params_->row_4_ = UP_ROUND(params_->row_, 4); | |||||
| params_->col_4_ = UP_ROUND(params_->col_, 4); | |||||
| params_->deep_16_ = UP_ROUND(params_->deep_, 16); | |||||
| a_r4x16_ptr_ = | |||||
| reinterpret_cast<int8_t *>(context_->allocator->Malloc(params_->row_4_ * params_->deep_16_ * sizeof(int8_t))); | |||||
| if (!a_r4x16_ptr_) return RET_MEMORY_FAILED; | |||||
| memset(a_r4x16_ptr_, 0, params_->row_4_ * params_->deep_16_ * sizeof(int8_t)); | |||||
| input_sums_ = reinterpret_cast<int *>(context_->allocator->Malloc(params_->row_4_ * sizeof(int))); | |||||
| if (!input_sums_) return RET_MEMORY_FAILED; | |||||
| memset(input_sums_, 0, params_->row_4_ * sizeof(int)); | |||||
| b_c16x4_batch_ = reinterpret_cast<int8_t *>( | |||||
| context_->allocator->Malloc(params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t))); | |||||
| if (!b_c16x4_batch_) return RET_MEMORY_FAILED; | |||||
| memset(b_c16x4_batch_, 0, params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t)); | |||||
| weight_bias_sums_batch_ = | |||||
| reinterpret_cast<int *>(context_->allocator->Malloc(params_->batch * params_->col_4_ * sizeof(int))); | |||||
| if (!weight_bias_sums_batch_) return RET_MEMORY_FAILED; | |||||
| memset(weight_bias_sums_batch_, 0, params_->batch * params_->col_4_ * sizeof(int)); | |||||
| if (in_tensors_.size() == 3) { | |||||
| auto bias_size = params_->col_4_ * sizeof(int); | |||||
| bias_ptr_ = reinterpret_cast<int *>(context_->allocator->Malloc(bias_size)); | |||||
| if (!bias_ptr_) return RET_MEMORY_FAILED; | |||||
| memcpy(bias_ptr_, in_tensors_[2]->data_c(), bias_size); | |||||
| } else { | |||||
| bias_ptr_ = NULL; | |||||
| } | |||||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_4_, 4)); | |||||
| thread_stride_ = UP_DIV(UP_DIV(params_->col_4_, 4), thread_count_); | |||||
| auto input_tensor = in_tensors_.at(0); | |||||
| auto params = input_tensor->quant_params(); | |||||
| MS_ASSERT(params.size() == 1); | |||||
| quant_params_.input.zp_ = params.front().zeroPoint; | |||||
| quant_params_.input.scale_ = params.front().scale; | |||||
| auto weight_tensor = in_tensors_.at(1); | |||||
| params = weight_tensor->quant_params(); | |||||
| MS_ASSERT(params.size() == 1); | |||||
| quant_params_.weight.zp_ = params.front().zeroPoint; | |||||
| quant_params_.weight.scale_ = params.front().scale; | |||||
| auto output_tensor = out_tensors_.at(0); | |||||
| params = output_tensor->quant_params(); | |||||
| MS_ASSERT(params.size() == 1); | |||||
| quant_params_.output.zp_ = params.front().zeroPoint; | |||||
| quant_params_.output.scale_ = params.front().scale; | |||||
| params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr); | |||||
| if (params_->b_const_) { | |||||
| auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()); | |||||
| for (int i = 0; i < params_->batch; ++i) { | |||||
| auto cur_b = b_ptr + i * params_->deep_ * params_->col_; | |||||
| auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_; | |||||
| auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_; | |||||
| if (params_->b_transpose_) { | |||||
| RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_); | |||||
| CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_, | |||||
| bias_ptr_, cur_sums, ColMajor, false); | |||||
| } else { | |||||
| RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack); | |||||
| CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_, | |||||
| bias_ptr_, cur_sums, RowMajor, false); | |||||
| } | |||||
| } | |||||
| } | |||||
| double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_; | |||||
| QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift, | |||||
| &quant_params_.right_shift); | |||||
| return RET_OK; | |||||
| } | |||||
| int MatmulInt8CPUKernel::RunImpl(int task_id) { | |||||
| int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_4_, 4) - task_id * thread_stride_); | |||||
| if (cur_oc <= 0) { | |||||
| return RET_OK; | |||||
| } | |||||
| int cur_oc_res = MSMIN(thread_stride_ * C4NUM, params_->col_ - task_id * thread_stride_ * C4NUM); | |||||
| auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * 4 * params_->deep_16_; | |||||
| auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * 4; | |||||
| auto cur_c = c_ptr_ + task_id * thread_stride_ * 4; | |||||
| auto &p = quant_params_; | |||||
| #ifdef ENABLE_ARM64 | |||||
| MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, params_->row_4_, cur_oc * C4NUM, params_->deep_16_, input_sums_, | |||||
| cur_bias, INT8_MIN, INT8_MAX, p.output.zp_, &p.quant_multiplier, &p.left_shift, &p.right_shift, | |||||
| params_->row_, cur_oc_res, params_->col_ * sizeof(int8_t), false); | |||||
| #else | |||||
| MatMulInt8_16x4_r(a_r4x16_ptr_, cur_b, cur_c, params_->row_, cur_oc_res, params_->deep_16_, params_->col_, | |||||
| input_sums_, cur_bias, &p.left_shift, &p.right_shift, &p.quant_multiplier, p.output.zp_, INT8_MIN, | |||||
| INT8_MAX, false); | |||||
| #endif | |||||
| return RET_OK; | |||||
| } | |||||
| param_->row_ = o_shape[o_shape.size() - 2]; | |||||
| param_->col_ = o_shape[o_shape.size() - 1]; | |||||
| param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1]; | |||||
| int MatmulInt8Run(void *cdata, int task_id) { | |||||
| auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata); | |||||
| auto ret = op->RunImpl(task_id); | |||||
| auto ret = MatmulBaseInt8CPUKernel::ReSize(); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]"; | |||||
| MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel failed"; | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int MatmulInt8CPUKernel::Run() { | |||||
| auto a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c()); | |||||
| auto c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c()); | |||||
| auto a_stride = params_->row_ * params_->deep_; | |||||
| auto b_stride = params_->deep_ * params_->col_; | |||||
| auto c_stride = params_->row_ * params_->col_; | |||||
| if (!params_->b_const_) { | |||||
| auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c()); | |||||
| for (int i = 0; i < params_->batch; ++i) { | |||||
| auto cur_b = b_ptr + i * b_stride; | |||||
| auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_; | |||||
| auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_; | |||||
| if (params_->b_transpose_) { | |||||
| RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_); | |||||
| CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_, | |||||
| bias_ptr_, cur_sums, ColMajor, false); | |||||
| } else { | |||||
| RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack); | |||||
| CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_, | |||||
| bias_ptr_, cur_sums, RowMajor, false); | |||||
| } | |||||
| } | |||||
| } | |||||
| for (int i = 0; i < params_->batch; ++i) { | |||||
| auto cur_a_ptr = a_ptr + i * a_stride; | |||||
| if (params_->a_transpose_) { | |||||
| RowMajor2Col16x4MajorInt8(cur_a_ptr, params_->deep_, params_->row_, a_r4x16_ptr_); | |||||
| CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, ColMajor); | |||||
| } else { | |||||
| RowMajor2Row16x4MajorInt8(cur_a_ptr, a_r4x16_ptr_, params_->row_, params_->deep_); | |||||
| CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor); | |||||
| } | |||||
| b_c16x4_ptr_ = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_; | |||||
| weight_bias_sums_ = weight_bias_sums_batch_ + i * params_->col_4_; | |||||
| c_ptr_ = c_ptr + i * c_stride; | |||||
| auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulInt8Run, this, thread_count_); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]"; | |||||
| return ret; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MatMul, LiteKernelCreator<MatmulInt8CPUKernel>) | REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MatMul, LiteKernelCreator<MatmulInt8CPUKernel>) | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -22,39 +22,18 @@ | |||||
| #include "nnacl/matmul_parameter.h" | #include "nnacl/matmul_parameter.h" | ||||
| #include "mindspore/lite/nnacl/int8/quantize.h" | #include "mindspore/lite/nnacl/int8/quantize.h" | ||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| #include "src/runtime/kernel/arm/int8/matmul_base_int8.h" | |||||
| using mindspore::lite::InnerContext; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| class MatmulInt8CPUKernel : public LiteKernel { | |||||
| class MatmulInt8CPUKernel : public MatmulBaseInt8CPUKernel { | |||||
| public: | public: | ||||
| MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, | |||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||||
| const mindspore::lite::PrimitiveC *primitive) | const mindspore::lite::PrimitiveC *primitive) | ||||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||||
| params_ = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||||
| } | |||||
| ~MatmulInt8CPUKernel() override; | |||||
| : MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||||
| ~MatmulInt8CPUKernel() override = default; | |||||
| int Init() override; | int Init() override; | ||||
| int ReSize() override; | int ReSize() override; | ||||
| int Run() override; | |||||
| int RunImpl(int task_id); | |||||
| private: | |||||
| void FreeTmpBuffer(); | |||||
| private: | |||||
| MatMulParameter *params_ = nullptr; | |||||
| MatmulQuantArg quant_params_; | |||||
| int8_t *a_r4x16_ptr_ = nullptr; | |||||
| int8_t *b_c16x4_ptr_ = nullptr; | |||||
| int8_t *c_ptr_ = nullptr; | |||||
| int8_t *b_c16x4_batch_ = nullptr; | |||||
| int *bias_ptr_ = nullptr; | |||||
| int *input_sums_ = nullptr; | |||||
| int *weight_bias_sums_ = nullptr; | |||||
| int *weight_bias_sums_batch_ = nullptr; | |||||
| int thread_stride_ = 0; | |||||
| int thread_count_ = 0; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -599,9 +599,9 @@ function Run_x86() { | |||||
| echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile='${ms_models_path}'/'${model_name}'.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --accuracyThreshold=${accuracy_limit}' >> "${run_x86_log_file}" | echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile='${ms_models_path}'/'${model_name}'.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --accuracyThreshold=${accuracy_limit}' >> "${run_x86_log_file}" | ||||
| export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile=${ms_models_path}/${model_name}_weightquant.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}" | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile=${ms_models_path}/${model_name}_weightquant.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}" | ||||
| if [ $? = 0 ]; then | if [ $? = 0 ]; then | ||||
| run_result='x86: '${model_name}'[weight quant] pass'; echo ${run_result} >> ${run_benchmark_result_file} | |||||
| run_result='x86: '${model_name}'[weight_quant] pass'; echo ${run_result} >> ${run_benchmark_result_file} | |||||
| else | else | ||||
| run_result='x86: '${model_name}'[weight quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1 | |||||
| run_result='x86: '${model_name}'[weight_quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1 | |||||
| fi | fi | ||||
| done < ${models_mindspore_weightquant_config} | done < ${models_mindspore_weightquant_config} | ||||
| @@ -79,58 +79,6 @@ void MMInt8TestInit(std::vector<lite::Tensor *> *inputs, std::vector<lite::Tenso | |||||
| delete[] weight_data; | delete[] weight_data; | ||||
| } | } | ||||
| TEST_F(TestMatmulInt8, simple) { | |||||
| #define ROW 10 | |||||
| #define COL 15 | |||||
| #define DEPTH 10 | |||||
| #define ROW4 UP_ROUND(ROW, 4) | |||||
| #define COL4 UP_ROUND(COL, 4) | |||||
| #define DEPTH16 UP_ROUND(DEPTH, 16) | |||||
| int8_t a[ROW * DEPTH] = {-3, -3, 0, -2, -4, -2, 1, 0, -1, 0, 5, 1, 3, 4, 4, -3, -5, 2, -2, 4, | |||||
| 4, 5, 1, -1, 5, 5, 2, -1, 0, 4, -4, 2, 5, -2, 5, 3, -1, 2, -4, 5, | |||||
| -5, 4, 5, 3, 5, 4, -2, 5, 5, -5, -5, -5, 2, -4, -3, 3, -3, -5, 5, 0, | |||||
| 2, -4, 4, 2, -5, 3, -1, 3, -3, 2, -5, -4, 0, -5, 2, 4, 0, -5, -1, 4, | |||||
| 3, 5, 5, 2, -5, -5, -4, -5, 3, 3, 3, 0, -2, 0, -2, -3, -2, 3, 5, -5}; | |||||
| int8_t b[DEPTH * COL] = {1, 2, -2, -5, -4, 2, 3, 2, -5, 4, -5, 4, 1, -2, 1, 5, 5, 5, 2, 5, -3, -3, | |||||
| -1, -3, -1, 0, -4, 0, 1, -2, -2, -3, -5, 1, 1, 0, 4, 5, -3, -1, 4, 3, 5, 4, | |||||
| 2, 4, -3, -4, 1, 4, -4, 5, -1, -2, 3, 5, 5, 2, 1, -4, 1, 2, -3, 0, -2, 4, | |||||
| -3, -3, 1, 3, 4, -1, 3, 1, -5, -1, 2, 0, 0, 5, -1, -5, 5, -5, 0, 3, -3, 4, | |||||
| 3, 1, -3, -3, 2, -2, -3, -3, 3, 4, 2, -1, 2, 0, -2, 4, 5, 3, -1, -3, -2, -1, | |||||
| 4, 3, -5, 1, 0, 0, -1, -4, -3, -2, 5, 3, 2, 1, -4, 1, 4, 5, -1, 2, -2, 2, | |||||
| 1, -2, 5, 2, -4, -4, 1, 1, 2, -1, -5, -4, 4, 1, -3, 4, -1, -4}; | |||||
| int8_t correct[ROW * COL] = { | |||||
| -36, -33, 11, 4, -12, -7, 11, 0, 37, -30, -13, -2, -30, -3, 29, 46, -13, -84, -8, 6, 39, 26, | |||||
| -67, -48, 57, 12, 32, 44, -24, -85, 22, 32, -8, -8, 20, 10, -45, 12, -69, 36, 22, -37, 58, 27, | |||||
| -24, -11, -22, -50, 26, 50, 28, -56, -42, -23, -1, 70, -58, 54, 35, -61, 54, 40, -11, 35, 43, 3, | |||||
| 7, 30, -7, -13, 73, -3, 26, 26, -11, -37, 0, 19, 34, -4, 0, -22, 71, 8, -25, -6, -5, 31, | |||||
| 8, 63, -25, -55, -62, -17, 23, 1, 36, 12, -38, 2, 11, 27, 18, 5, 4, -59, -17, 1, 25, 9, | |||||
| 13, -77, 13, 9, -11, 26, -52, 42, 28, 6, 44, 4, 2, 26, 19, -31, 46, 23, -57, 15, -31, 39, | |||||
| 40, -9, 8, 38, 40, 27, -19, -47, 14, 50, 14, 18, 0, -59, 39, -48, -47, 35}; | |||||
| int8_t output[ROW * COL] = {0}; | |||||
| int8_t *a_r4x16 = new int8_t[ROW4 * DEPTH16]; | |||||
| memset(a_r4x16, 0, ROW4 * DEPTH16); | |||||
| int8_t *b_c16x4 = new int8_t[COL4 * DEPTH16]; | |||||
| memset(b_c16x4, 0, COL4 * DEPTH16); | |||||
| RowMajor2Row16x4MajorInt8(a, a_r4x16, ROW, DEPTH); | |||||
| RowMajor2Col16x4MajorInt8(b, DEPTH, COL, b_c16x4); | |||||
| int a_sums[ROW4] = {0}; | |||||
| int bias[COL4] = {0}; | |||||
| int multiplier, ls, rs; | |||||
| QuantizeRoundParameterWithDoublePrecision(1.0f, &multiplier, &ls, &rs); | |||||
| #ifdef ENABLE_ARM64 | |||||
| MatmulInt8Neon64(a_r4x16, b_c16x4, output, ROW4, COL4, DEPTH16, a_sums, bias, INT8_MIN, INT8_MAX, 0, &multiplier, &ls, | |||||
| &rs, ROW, COL, COL, false); | |||||
| #else | |||||
| MatMulInt8_16x4_r(a_r4x16, b_c16x4, output, ROW, COL, DEPTH16, COL, a_sums, bias, &ls, &rs, &multiplier, 0, INT8_MIN, | |||||
| INT8_MAX, false); | |||||
| #endif | |||||
| ASSERT_EQ(0, CompareOutputData(output, correct, ROW * COL, 0.1)); | |||||
| delete[] a_r4x16; | |||||
| delete[] b_c16x4; | |||||
| } | |||||
| TEST_F(TestMatmulInt8, mmtest1) { | TEST_F(TestMatmulInt8, mmtest1) { | ||||
| float in[] = {6.583835634764597, 11.337275140963907, -4.125256949459629, 10.994337291530833, | float in[] = {6.583835634764597, 11.337275140963907, -4.125256949459629, 10.994337291530833, | ||||
| 19.086065139532636, 3.620842999158455, 13.167624585590346, -18.326739299407755, | 19.086065139532636, 3.620842999158455, 13.167624585590346, -18.326739299407755, | ||||