| @@ -367,6 +367,26 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight | |||
| } | |||
| } | |||
| void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, | |||
| const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param, | |||
| MATMUL_OPT_R_FUNC matmul_func) { | |||
| if (matmul_func != NULL) { | |||
| matmul_func(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, | |||
| conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| (conv_param->conv_quant_arg_.filter_arg_num_ > 1)); | |||
| } else { | |||
| MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, | |||
| conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| (conv_param->conv_quant_arg_.filter_arg_num_ > 1)); | |||
| } | |||
| return; | |||
| } | |||
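| /* A minimal sketch of the per-output requantization the reference path applies, assuming the | |||
|  * gemmlowp-style convention (saturating rounding doubling high multiply, then a rounding right | |||
|  * shift); the real nnacl MultiplyByQuantizedMultiplier may differ in rounding details, shift sign | |||
|  * conventions, and saturation handling. | |||
| static int32_t RequantizeSketch(int32_t acc, int32_t multiplier, int32_t left_shift, int32_t right_shift) { | |||
|   int64_t prod = (int64_t)(acc * (1 << left_shift)) * multiplier;  // pre-shift, then fixed-point multiply | |||
|   int32_t high = (int32_t)((prod + (1ll << 30)) >> 31);            // rounding doubling high half | |||
|   int32_t mask = (1 << right_shift) - 1;                           // rounding divide by 2^right_shift | |||
|   int32_t rem = high & mask; | |||
|   return (high >> right_shift) + (rem > (mask >> 1) ? 1 : 0); | |||
| } | |||
|  */ | |||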
| // int8 convolution 3x3 | |||
| void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, | |||
| int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, | |||
| @@ -25,6 +25,8 @@ | |||
| #include "nnacl/conv_parameter.h" | |||
| #include "nnacl/winograd_utils.h" | |||
| #include "nnacl/quantization/quantize.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| @@ -51,6 +53,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight | |||
| int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param, GEMM_FUNC gemm_func); | |||
| // int8 convolution 1x1 | |||
| void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, | |||
| const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param, | |||
| MATMUL_OPT_R_FUNC matmul_func); | |||
| // int8 convolution 3x3 | |||
| void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, | |||
| int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, | |||
| @@ -172,73 +172,7 @@ void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp, | |||
| void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16, | |||
| bool suppport_opt) { | |||
| /* optimize normal -> same layout */ | |||
| #ifdef ENABLE_ARM64 | |||
| asm volatile( | |||
| "mov x10, %[src] \n" | |||
| "mov x11, %[dst] \n" | |||
| "dup v15.4s, %w[filter_zp] \n" | |||
| "mov x0, #0 \n" | |||
| "1: \n" | |||
| "cmp x0, %[row4] \n" | |||
| "beq 4f \n" | |||
| "add x0, x0, #4\n" | |||
| "dup v10.4s, wzr \n" | |||
| "mov x2, #0 \n" | |||
| "2: \n" | |||
| "cmp x2, %[col16] \n" | |||
| "beq 3f \n" | |||
| "add x2, x2, #16\n" | |||
| "ld1 {v0.16b}, [x10], #16\n" | |||
| "ld1 {v1.16b}, [x10], #16\n" | |||
| "ld1 {v2.16b}, [x10], #16\n" | |||
| "ld1 {v3.16b}, [x10], #16\n" | |||
| "saddlp v4.8h, v0.16b \n" | |||
| "saddlp v5.8h, v1.16b \n" | |||
| "saddlp v6.8h, v2.16b \n" | |||
| "saddlp v7.8h, v3.16b \n" | |||
| "saddlp v0.4S, v4.8h \n" | |||
| "saddlp v1.4S, v5.8h \n" | |||
| "saddlp v2.4S, v6.8h \n" | |||
| "saddlp v3.4S, v7.8h \n" | |||
| "addv s4, v0.4S \n" | |||
| "addv s5, v1.4S \n" | |||
| "addv s6, v2.4S \n" | |||
| "addv s7, v3.4S \n" | |||
| "mov v0.s[0], v4.s[0] \n" | |||
| "mov v0.s[1], v5.s[0] \n" | |||
| "mov v0.s[2], v6.s[0] \n" | |||
| "mov v0.s[3], v7.s[0] \n" | |||
| "add v10.4s, v10.4s, v0.4s \n" | |||
| "b 2b\n" | |||
| "3: \n" | |||
| "mul v10.4s, v10.4s, v15.4s \n" | |||
| "st1 {v10.4s}, [x11], #16 \n" | |||
| "beq 1b \n" | |||
| "4: \n" | |||
| : | |||
| : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) | |||
| : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); | |||
| #else | |||
| for (int r = 0; r < row4; r++) { | |||
| int32_t tmp_value = 0; | |||
| for (int c = 0; c < col16; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; | |||
| int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; | |||
| tmp_value += src[src_index]; | |||
| } | |||
| dst[r] = tmp_value * filter_zp; | |||
| } | |||
| #endif | |||
| PackInputSum16x4PerLater(src, dst, filter_zp, row4, col16); | |||
| return; | |||
| } | |||
| @@ -28,6 +28,19 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) | |||
| } | |||
| } | |||
| void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { | |||
| int col16 = UP_ROUND(col, C16NUM); | |||
| for (int r = 0; r < row; r++) { | |||
| int rd4 = r / C4NUM; | |||
| int rm4 = r % C4NUM; | |||
| for (int c = 0; c < col; c++) { | |||
| int cd16 = c / C16NUM; | |||
| int cm16 = c % C16NUM; | |||
| dst_ptr[rd4 * col16 * C4NUM + cd16 * C16NUM * C4NUM + rm4 * C16NUM + cm16] = src_ptr[r * col + c]; | |||
| } | |||
| } | |||
| } | |||
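| /* Layout sketch for RowMajor2Row4x16MajorInt8: rows are tiled in blocks of C4NUM = 4 and columns | |||
|  * in blocks of C16NUM = 16, so source element (r, c) lands at | |||
|  *   dst[(r / 4) * col16 * 4 + (c / 16) * 64 + (r % 4) * 16 + (c % 16)]. | |||
|  * Worked example with col = 20 (so col16 = 32): (r, c) = (5, 17) maps to | |||
|  *   1 * 32 * 4 + 1 * 64 + 1 * 16 + 1 = 209. | |||
|  * Columns in [col, col16) are never written, which is why callers memset the buffer first. */ | |||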
| void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) { | |||
| for (int r = 0; r < row; r++) { | |||
| int8_t *src_r = src + r * stride; | |||
| @@ -145,7 +158,38 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int | |||
| return; | |||
| } | |||
| #ifdef ENABLE_ARM64 | |||
| void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||
| bool per_channel) { | |||
| /* row4x16-major * row16x4-major => (int8)row-major : per-tensor or per-channel requantization */ | |||
| for (int r = 0; r < row; r++) { | |||
| for (int c = 0; c < col; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM; | |||
| int c4div = c / C4NUM, c4mod = c % C4NUM; | |||
| size_t ci = r * stride + c; | |||
| int32_t value = 0; | |||
| for (int d = 0; d < deep_16; d++) { | |||
| int d16div = d / C16NUM, d16mod = d % C16NUM; | |||
| size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; | |||
| size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; | |||
| value = value + a[ai] * b[bi]; | |||
| } | |||
| int32_t cur_input_sum = per_channel ? input_sum[c4div * C4NUM * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod] : input_sum[r]; | |||
| value -= cur_input_sum; | |||
| value += bias[c]; | |||
| int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0]; | |||
| int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0]; | |||
| int32_t cur_multiplier = per_channel ? multiplier[c] : multiplier[0]; | |||
| value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; | |||
| value = MSMIN(maxi, value); | |||
| value = MSMAX(mini, value); | |||
| dst[ci] = (int8_t)value; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
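| /* Operand layout notes (as used by Conv1x1Int8): `a` is the activation packed by | |||
|  * RowMajor2Row16x4MajorInt8, `b` is the weight packed by RowMajor2Row4x16MajorInt8, dst is plain | |||
|  * row-major with leading dimension `stride`, and `input_sum` must use the layout written by | |||
|  * PackInputSum16x4Int8: input_sum[r] in the per-tensor case, or the oc4-blocked index | |||
|  * c4div * C4NUM * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod in the per-channel case. */ | |||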
| void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) { | |||
| int stride = sizeof(int8_t) * 16 * 4; | |||
| for (int r = 0; r < row; ++r) { | |||
| @@ -201,4 +245,3 @@ void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow) | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| @@ -28,17 +28,22 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int *c, const int row8, const | |||
| const int a_zp, const int b_zp); | |||
| void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | |||
| const int *input_sum, const int *bias); | |||
| void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||
| bool per_channel); | |||
| void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col); | |||
| #ifdef ENABLE_ARM64 | |||
| void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16); | |||
| void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16); | |||
| void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst); | |||
| void RowMajor2Bbias(int8_t *b, int row, int col, int a_zp, int b_zp, int *bias, int *dst); | |||
| void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow); | |||
| #ifdef ENABLE_ARM64 | |||
| // bias = bias + depth * a_zp * b_zp - a_zp * b_sums | |||
| void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums, | |||
| const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift, | |||
| @@ -22,6 +22,11 @@ | |||
| typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | |||
| const int *input_sum, const int *bias); | |||
| typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, | |||
| int32_t maxi, bool per_channel); | |||
| typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col); | |||
| typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType; | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include <stdlib.h> | |||
| #include <stdbool.h> | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| @@ -45,4 +46,11 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i | |||
| const int *input_sum, const int *bias) { | |||
| return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias); | |||
| } | |||
| void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, | |||
| int32_t maxi, bool per_channel) { | |||
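|   /* The optimized assembly kernel is not implemented yet: this symbol is an empty stub, which is | |||
|    * why Convolution1x1Int8CPUKernel::CheckSupportOptimize still forces the portable C kernel | |||
|    * MatMulInt8_16x4_r. */ | |||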
| return; | |||
| } | |||
| #endif | |||
| @@ -153,22 +153,24 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p | |||
| } // kernel plane loop | |||
| } | |||
| void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param) { | |||
| void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size) { | |||
| /* support nhwc */ | |||
| char *src = (char *)src_ptr; | |||
| char *dst = (char *)dst_ptr; | |||
| for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { | |||
| int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; | |||
| if (src_h < 0 || src_h >= conv_param->input_h_) { | |||
| continue; | |||
| } | |||
| const float *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_; | |||
| float *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; | |||
| const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size; | |||
| char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size; | |||
| for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { | |||
| int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; | |||
| if (src_w < 0 || src_w >= conv_param->input_w_) { | |||
| continue; | |||
| } | |||
| memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_, | |||
| conv_param->input_channel_ * sizeof(float)); | |||
| memcpy(dst_h_ptr + dst_w * conv_param->input_channel_ * data_size, | |||
| src_h_ptr + src_w * conv_param->input_channel_ * data_size, conv_param->input_channel_ * data_size); | |||
| } | |||
| } | |||
| return; | |||
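| /* Conv1x1InputPack is element-type agnostic: it moves input_channel_ * data_size bytes per output | |||
|  * pixel, so the one routine serves both fp32 and int8 callers, e.g. | |||
|  *   Conv1x1InputPack(float_src, float_dst, conv_param, sizeof(float)); | |||
|  *   Conv1x1InputPack(int8_src, int8_dst, conv_param, sizeof(int8_t)); | |||
|  * Pixels whose source coordinate falls in the padding are skipped rather than written, so dst | |||
|  * must be zero-filled beforehand (the unit tests initialize their out[] buffers to 0). */ | |||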
| @@ -188,6 +190,105 @@ void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParam | |||
| return; | |||
| } | |||
| void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16) { | |||
| /* optimize normal -> same layout */ | |||
| #ifdef ENABLE_ARM64 | |||
| asm volatile( | |||
| "mov x10, %[src] \n" | |||
| "mov x11, %[dst] \n" | |||
| "dup v15.4s, %w[filter_zp] \n" | |||
| "mov x0, #0 \n" | |||
| "1: \n" | |||
| "cmp x0, %[row4] \n" | |||
| "beq 4f \n" | |||
| "add x0, x0, #4\n" | |||
| "dup v10.4s, wzr \n" | |||
| "mov x2, #0 \n" | |||
| "2: \n" | |||
| "cmp x2, %[col16] \n" | |||
| "beq 3f \n" | |||
| "add x2, x2, #16\n" | |||
| "ld1 {v0.16b}, [x10], #16\n" | |||
| "ld1 {v1.16b}, [x10], #16\n" | |||
| "ld1 {v2.16b}, [x10], #16\n" | |||
| "ld1 {v3.16b}, [x10], #16\n" | |||
| "saddlp v4.8h, v0.16b \n" | |||
| "saddlp v5.8h, v1.16b \n" | |||
| "saddlp v6.8h, v2.16b \n" | |||
| "saddlp v7.8h, v3.16b \n" | |||
| "saddlp v0.4S, v4.8h \n" | |||
| "saddlp v1.4S, v5.8h \n" | |||
| "saddlp v2.4S, v6.8h \n" | |||
| "saddlp v3.4S, v7.8h \n" | |||
| "addv s4, v0.4S \n" | |||
| "addv s5, v1.4S \n" | |||
| "addv s6, v2.4S \n" | |||
| "addv s7, v3.4S \n" | |||
| "mov v0.s[0], v4.s[0] \n" | |||
| "mov v0.s[1], v5.s[0] \n" | |||
| "mov v0.s[2], v6.s[0] \n" | |||
| "mov v0.s[3], v7.s[0] \n" | |||
| "add v10.4s, v10.4s, v0.4s \n" | |||
| "b 2b\n" | |||
| "3: \n" | |||
| "mul v10.4s, v10.4s, v15.4s \n" | |||
| "st1 {v10.4s}, [x11], #16 \n" | |||
| "beq 1b \n" | |||
| "4: \n" | |||
| : | |||
| : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) | |||
| : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); | |||
| #else | |||
| for (int r = 0; r < row4; r++) { | |||
| int32_t tmp_value = 0; | |||
| for (int c = 0; c < col16; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; | |||
| int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; | |||
| tmp_value += src[src_index]; | |||
| } | |||
| dst[r] = tmp_value * filter_zp; | |||
| } | |||
| #endif | |||
| return; | |||
| } | |||
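| /* Why the row sums are scaled by filter_zp: expanding the quantized dot product gives | |||
|  *   sum_d (a_d - za) * (w_d - zw) = sum_d a_d * w_d - zw * sum_d a_d - za * sum_d w_d + D * za * zw, | |||
|  * and PackInputSum16x4PerLater precomputes the zw * sum_d a_d term (filter_zp times each row's | |||
|  * activation sum). The two za terms are folded into the bias by | |||
|  * Convolution1x1Int8CPUKernel::InitWeightBias. */ | |||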
| void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, | |||
| size_t plane_size, ConvParameter *conv_param) { | |||
| size_t hw4 = UP_ROUND(plane_size, C4NUM); | |||
| size_t ic16 = UP_ROUND(input_channel, C16NUM); | |||
| if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { | |||
| PackInputSum16x4PerLater(input_value, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16); | |||
| } else { | |||
| for (int ri = 0; ri < plane_size; ri++) { | |||
| int ri4div = ri / C4NUM, ri4mod = ri % C4NUM; | |||
| for (int ci = 0; ci < output_channel; ci++) { | |||
| int32_t tmp_sum_value = 0; | |||
| int ci4div = ci / C4NUM, ci4mod = ci % C4NUM; | |||
| int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_; | |||
| for (int di = 0; di < input_channel; di++) { | |||
| size_t di16div = di / C16NUM, di16mod = di % C16NUM; | |||
| int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod; | |||
| tmp_sum_value += input_value[src_index]; | |||
| } | |||
| int dst_index = ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod; | |||
| input_sum[dst_index] = tmp_sum_value * filter_zp; | |||
| } | |||
| } | |||
| } | |||
| return; | |||
| } | |||
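| /* Index sketch for the per-channel branch: with plane_size = 6 (hw4 = 8) and output_channel = 5, | |||
|  * the sum for pixel ri = 2, channel ci = 4 (ci4div = 1, ci4mod = 0) is stored at | |||
|  *   1 * 4 * 8 + 2 * 4 + 0 = 40, | |||
|  * i.e. oc4-block-major, then pixel, then channel-within-block, which is exactly the index | |||
|  * MatMulInt8_16x4_r reads back in its per_channel path. */ | |||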
| void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num, | |||
| int block_index) { | |||
| // input format : nhwc | |||
| @@ -35,10 +35,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real | |||
| void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index, | |||
| int32_t *input_sum, ConvParameter *conv_param); | |||
| void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param); | |||
| void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16); | |||
| void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size); | |||
| void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParameter *conv_param); | |||
| void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, | |||
| size_t plane_size, ConvParameter *conv_param); | |||
| void MatrixPack(const float *src, float *dst, int row, int ic4, int stride); | |||
| void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param); | |||
| @@ -118,10 +118,13 @@ int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) { | |||
| } | |||
| int ConvolutionBaseCPUKernel::SetIfPerChannel() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| uint8_t per_channel = 0b0; | |||
| if (conv_quant_arg_->input_arg_num_ != kPerTensor) { | |||
| int in_channel = conv_param_->input_channel_; | |||
| if (static_cast<int>(conv_quant_arg_->input_arg_num_) != in_channel) { | |||
| if (static_cast<int>(conv_quant_arg_->input_arg_num_) != input_channel) { | |||
| MS_LOG(ERROR) << "input per channel quant param length is not equal to input channel."; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -129,8 +132,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { | |||
| } | |||
| if (conv_quant_arg_->filter_arg_num_ != kPerTensor) { | |||
| int filter_num = conv_param_->output_channel_; | |||
| if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != filter_num) { | |||
| if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != output_channel) { | |||
| MS_LOG(ERROR) << "weight per channel quant param length is not equal to filter num."; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -138,8 +140,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { | |||
| } | |||
| if (conv_quant_arg_->output_arg_num_ != kPerTensor) { | |||
| int out_channel = conv_param_->output_channel_; | |||
| if (static_cast<int>(conv_quant_arg_->output_arg_num_) != out_channel) { | |||
| if (static_cast<int>(conv_quant_arg_->output_arg_num_) != output_channel) { | |||
| MS_LOG(ERROR) << "output per channel quant param length is not equal to output channel."; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -113,7 +113,7 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) { | |||
| output_ptr_ = src_output; | |||
| if (pre_trans_input_) { | |||
| Conv1x1InputPackFp32(src_input, input_ptr_, conv_param_); | |||
| Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float)); | |||
| } else { | |||
| input_ptr_ = src_input; | |||
| } | |||
| @@ -0,0 +1,270 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| namespace mindspore::kernel { | |||
| Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() { | |||
| if (matmul_param_ != nullptr) { | |||
| delete matmul_param_; | |||
| matmul_param_ = nullptr; | |||
| } | |||
| if (packed_weight_ != nullptr) { | |||
| delete packed_weight_; | |||
| packed_weight_ = nullptr; | |||
| } | |||
| FreeResizeBuf(); | |||
| FreeQuantParam(); | |||
| } | |||
| void Convolution1x1Int8CPUKernel::FreeResizeBuf() { | |||
| if (packed_input_ != nullptr) { | |||
| free(packed_input_); | |||
| packed_input_ = nullptr; | |||
| } | |||
| if (input_sum_ != nullptr) { | |||
| free(input_sum_); | |||
| input_sum_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| void Convolution1x1Int8CPUKernel::CheckSupportOptimize() { | |||
| support_optimize_ = false; | |||
| matmul_func_ = MatMulInt8_16x4_r; | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| dlerror(); | |||
| *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); | |||
| auto dlopen_error = dlerror(); | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| } else { | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } | |||
| #endif | |||
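|   /* MatMulRInt8_optimize_handler is currently an empty stub in opt_op_handler.c, so force the | |||
|    * portable C kernel until the assembly implementation lands. */ | |||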
| matmul_func_ = MatMulInt8_16x4_r; | |||
| return; | |||
| } | |||
| int Convolution1x1Int8CPUKernel::InitWeightBias() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| /* weight */ | |||
| size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t); | |||
| packed_weight_ = reinterpret_cast<int8_t *>(malloc(size)); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, size); | |||
| RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->Data()), packed_weight_, output_channel, | |||
| input_channel); | |||
| /* bias = bias - v2 x zp1 + zp1 x zp2 */ | |||
| int col4 = UP_ROUND(output_channel, C4NUM); | |||
| bias_data_ = malloc(col4 * sizeof(int32_t)); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, col4 * sizeof(int32_t)); | |||
| if (in_tensors_.size() == 3) { | |||
| memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(int32_t)); | |||
| } | |||
| int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_); | |||
| int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->Data()); | |||
| int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| for (int oc = 0; oc < output_channel; oc++) { | |||
| int32_t weight_sum_value = 0; | |||
| int32_t filter_zp = (conv_param_->conv_quant_arg_.filter_arg_num_ == 1) | |||
| ? conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ | |||
| : conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_; | |||
| for (int ic = 0; ic < input_channel; ic++) { | |||
| weight_sum_value += weight[oc * input_channel + ic]; | |||
| } | |||
| bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp; | |||
| } | |||
| return RET_OK; | |||
| } | |||
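| /* Worked check of InitWeightBias's bias fold with hypothetical numbers: D = 2, za = 3, zw = 1, | |||
|  * a = {5, 7}, w = {2, 4}. True accumulation: (5-3)(2-1) + (7-3)(4-1) = 14. The kernel computes | |||
|  * the raw sum 5*2 + 7*4 = 38, subtracts input_sum = zw * (5+7) = 12, and adds the folded term | |||
|  * zw*za*D - za*(2+4) = 6 - 18 = -12, giving 38 - 12 - 12 = 14, as required. */ | |||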
| int Convolution1x1Int8CPUKernel::Init() { | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| matmul_param_ = new (std::nothrow) MatMulParameter(); | |||
| if (matmul_param_ == nullptr) { | |||
| MS_LOG(ERROR) << "Init matmul_param_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| CheckSupportOptimize(); | |||
| auto ret = SetQuantParam(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set quant param failed."; | |||
| return ret; | |||
| } | |||
| ret = InitWeightBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||
| return ret; | |||
| } | |||
| return ReSize(); | |||
| } | |||
| int Convolution1x1Int8CPUKernel::InitParam() { | |||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | |||
| conv_param_->stride_w_ != 1); | |||
| matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; | |||
| matmul_param_->deep_ = conv_param_->input_channel_; | |||
| matmul_param_->col_ = conv_param_->output_channel_; | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C4NUM)); | |||
| thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C4NUM), thread_count_); | |||
| size_t size = UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM); | |||
| packed_input_ = reinterpret_cast<int8_t *>(malloc(size * sizeof(int8_t))); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_input_, 0, size * sizeof(int8_t)); | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| size = UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(matmul_param_->row_, C4NUM); | |||
| } else { | |||
| size = UP_ROUND(matmul_param_->row_, C4NUM); | |||
| } | |||
| input_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t))); | |||
| if (input_sum_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(input_sum_, 0, size * sizeof(int32_t)); | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1Int8CPUKernel::ReSize() { | |||
| FreeResizeBuf(); | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| int error_code = InitParam(); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "Convolution base init failed."; | |||
| return error_code; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_output) { | |||
| output_ptr_ = src_output; | |||
| if (pre_trans_input_) { | |||
| Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t)); | |||
| } else { | |||
| input_ptr_ = src_input; | |||
| } | |||
| RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_); | |||
| return; | |||
| } | |||
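| /* Input preparation is two-stage: Conv1x1InputPack first gathers the strided/padded NHWC pixels | |||
|  * into a dense row-major row_ x deep_ buffer (skipped when stride is 1 and padding is 0), then | |||
|  * RowMajor2Row16x4MajorInt8 re-tiles that buffer into the 16x4-blocked layout the matmul expects. */ | |||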
| int Convolution1x1Int8CPUKernel::RunImpl(int task_id) { | |||
| int cur_oc = MSMIN(thread_stride_ * C4NUM, matmul_param_->col_ - task_id * thread_stride_ * C4NUM); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| int32_t *bias = reinterpret_cast<int32_t *>(bias_data_) + thread_stride_ * C4NUM * task_id; | |||
| Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C4NUM * UP_ROUND(matmul_param_->deep_, C16NUM), | |||
| output_ptr_ + task_id * thread_stride_ * C4NUM, input_sum_, bias, | |||
| matmul_param_->row_, cur_oc, UP_ROUND(matmul_param_->deep_, C16NUM), conv_param_, matmul_func_); | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | |||
| auto conv = reinterpret_cast<Convolution1x1Int8CPUKernel *>(cdata); | |||
| auto error_code = conv->RunImpl(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "conv1x1 Int8 Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1Int8CPUKernel::Run() { | |||
| auto ret = Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return RET_ERROR; | |||
| } | |||
| if (pre_trans_input_) { | |||
| input_ptr_ = | |||
| reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t))); | |||
| if (input_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 Malloc input_ptr_ error!"; | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| } | |||
| int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data()); | |||
| int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data()); | |||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | |||
| Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, | |||
| src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); | |||
| PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_, | |||
| conv_param_); | |||
| int error_code = LiteBackendParallelLaunch(Convolution1x1Int8Impl, this, thread_count_); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (pre_trans_input_ && input_ptr_ != nullptr) { | |||
| ctx_->allocator->Free(input_ptr_); | |||
| input_ptr_ = nullptr; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,68 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "include/errorcode.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/int8/conv_int8.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| namespace mindspore::kernel { | |||
| class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| Convolution1x1Int8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~Convolution1x1Int8CPUKernel() override; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| public: | |||
| int RunImpl(int task_id); | |||
| private: | |||
| void FreeResizeBuf(); | |||
| int InitParam(); | |||
| int InitWeightBias(); | |||
| void Pre1x1Trans(int8_t *src_input, int8_t *src_output); | |||
| void CheckSupportOptimize(); | |||
| private: | |||
| int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */ | |||
| int8_t *packed_weight_ = nullptr; | |||
| int8_t *packed_input_ = nullptr; | |||
| int8_t *input_ptr_ = nullptr; | |||
| int8_t *output_ptr_ = nullptr; | |||
| size_t thread_count_ = 1; | |||
| size_t thread_stride_ = 0; | |||
| bool pre_trans_input_ = false; | |||
| MatMulParameter *matmul_param_ = nullptr; | |||
| MATMUL_OPT_R_FUNC matmul_func_ = nullptr; | |||
| bool support_optimize_ = false; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/runtime/kernel/arm/int8/convolution_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| #include "nnacl/int8/conv_int8.h" | |||
| #include "src/runtime/kernel/arm/base/layout_transform.h" | |||
| #include "schema/model_generated.h" | |||
| @@ -400,6 +401,9 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::tensor::Ten | |||
| kernel::LiteKernel *kernel; | |||
| if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else if (kernel_h == 1 && kernel_w == 1) { | |||
| /* Convolution1x1Int8CPUKernel is not enabled yet; fall back to the generic int8 kernel for 1x1 */ | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| @@ -54,7 +54,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) { | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 2; | |||
| float out[20] = {0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| EXPECT_EQ(0, lite::CompareOutputData(out, correct, 20)); | |||
| delete conv_param; | |||
| } | |||
| @@ -95,7 +95,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) { | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| float out[28] = {0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| CompareOutputData(out, correct, 28, 0.0001); | |||
| delete conv_param; | |||
| } | |||
| @@ -114,7 +114,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) { | |||
| float correct[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.025112, | |||
| -5.052577, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| EXPECT_EQ(0, lite::CompareOutputData(out, correct, 18)); | |||
| delete conv_param; | |||
| } | |||
| @@ -136,7 +136,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) { | |||
| -1.770, 41.903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, | |||
| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; | |||
| float out[54] = {0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| EXPECT_EQ(0, lite::CompareOutputData(out, correct, 54)); | |||
| delete conv_param; | |||
| } | |||
| @@ -0,0 +1,281 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/lite_kernel.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "nnacl/quantization/quantize.h" | |||
| #include "nnacl/common_func.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| namespace mindspore { | |||
| using lite::tensor::Tensor; | |||
| class TestConv1x1Int8 : public mindspore::CommonTest { | |||
| public: | |||
| TestConv1x1Int8() {} | |||
| }; | |||
| TEST_F(TestConv1x1Int8, Input1x1PrePack1) { | |||
| auto conv_param = new ConvParameter(); | |||
| conv_param->input_channel_ = 6; | |||
| conv_param->input_h_ = conv_param->input_w_ = 3; | |||
| conv_param->output_h_ = conv_param->output_w_ = 3; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||
| int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4, | |||
| -0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9, | |||
| 14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2}; | |||
| int8_t correct[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 13, 11, | |||
| 1, -1, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | |||
| int8_t out[54] = {0}; | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); | |||
| CompareOutputData(out, correct, 54, 0); | |||
| delete conv_param; | |||
| } | |||
| TEST_F(TestConv1x1Int8, Input1x1PrePack2) { | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t in[] = {-0, -0, -7, -0, -6, 4, 9, 9, 12, -0, 6, 2, 13, 15, 16, -7, 9, 1, 10, 13, 17, 17, 4, 13, | |||
| -6, 5, 7, -7, 15, 0, 1, -5, -7, 18, 15, 19, -7, 13, 7, -0, 16, -5, 16, -7, 6, 10, -5, 10, | |||
| 9, 12, -9, -8, -4, 18, -5, 0, 7, 12, 13, 16, -9, -4, 18, -0, 8, 6, 2, 10, 16, 1, -1, 2, | |||
| 9, 8, 9, 13, 7, -0, 15, -7, 0, -0, 17, 19, 9, 17, -6, -2, 7, -0, 10, -6, -6, 18, -0, 9, | |||
| 9, 6, 3, -1, -8, 10, 17, -9, 17, 6, -3, 7, -2, -0, -9, 1, -3, 15, 13, 4, 18}; | |||
| int8_t correct[] = {0, 0, 0, 0, 0, 0, 15, -7, -7, 0, 0, 0, 9, 7, 0, 0, 0, 0, 0, 0}; | |||
| conv_param->input_h_ = 9; | |||
| conv_param->input_w_ = 13; | |||
| conv_param->input_channel_ = 1; | |||
| conv_param->output_h_ = 4; | |||
| conv_param->output_w_ = 5; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 4; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 2; | |||
| int8_t out[20] = {0}; | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); | |||
| CompareOutputData(out, correct, 20, 0); | |||
| delete conv_param; | |||
| } | |||
| int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_, | |||
| std::vector<lite::tensor::Tensor *> *outputs_, ConvParameter *conv_param, | |||
| int8_t **correct) { | |||
| Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; | |||
| in_t->AddQuantParam(*in_quant_arg); | |||
| in_t->MallocData(); | |||
| int8_t in[] = {62, -14, 88, 2, -35, 43, 83, -111, 75, 26, 14, -121, | |||
| -78, 56, 37, -31, 15, -75, -10, -115, -71, 74, -65, -15}; | |||
| memcpy(in_t->Data(), in, in_t->ElementsNum() * sizeof(int8_t)); | |||
| inputs_->push_back(in_t); | |||
| Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| weight_t->MallocData(); | |||
| auto weight_quant_arg1 = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg1->zeroPoint = 66, weight_quant_arg1->scale = 0.96439215686275; | |||
| auto weight_quant_arg2 = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg2->zeroPoint = 33, weight_quant_arg2->scale = 0.76439215686275; | |||
| auto weight_quant_arg3 = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg3->zeroPoint = -20, weight_quant_arg3->scale = 0.99117647; | |||
| weight_t->AddQuantParam(*weight_quant_arg1); | |||
| weight_t->AddQuantParam(*weight_quant_arg2); | |||
| weight_t->AddQuantParam(*weight_quant_arg3); | |||
| int8_t weight[] = {65, 67, 65, 65, 32, 33, 34, 33, -19, -20, -19, -20}; | |||
| memcpy(weight_t->Data(), weight, weight_t->ElementsNum() * sizeof(int8_t)); | |||
| inputs_->push_back(weight_t); | |||
| Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| out_t->MallocData(); | |||
| auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.294321233; | |||
| out_t->AddQuantParam(*output_quant_arg); | |||
| outputs_->push_back(out_t); | |||
| *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t))); | |||
| int8_t nchw_co[] = {-83, 34, 100, 10, 113, 55, 3, 16, 63, 6, 93, 20, 5, 6, 42, 35, 28, -24}; | |||
| memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(int8_t)); | |||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | |||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||
| return out_t->ElementsNum(); | |||
| } | |||
| TEST_F(TestConv1x1Int8, Conv1x1TestPerChannel) { | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t *correct; | |||
| auto ctx = new lite::Context; | |||
| ctx->thread_num_ = 1; | |||
| int total_size = Conv1x1Int8TestInit1_perchannel(&inputs_, &outputs_, conv_param, &correct); | |||
| kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( | |||
| reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr); | |||
| conv1x1->Init(); | |||
| conv1x1->Run(); | |||
| CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 70); | |||
| delete conv1x1; | |||
| for (auto t : inputs_) delete t; | |||
| for (auto t : outputs_) delete t; | |||
| free(correct); | |||
| } | |||
| int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, | |||
| ConvParameter *conv_param, int8_t **correct) { | |||
| Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; | |||
| in_t->AddQuantParam(*in_quant_arg); | |||
| in_t->MallocData(); | |||
| float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, | |||
| 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, | |||
| 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; | |||
| Quantize(in, in_t->ElementsNum(), in_quant_arg->scale, in_quant_arg->zeroPoint, | |||
| reinterpret_cast<int8_t *>(in_t->Data())); | |||
| inputs_->push_back(in_t); | |||
| Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; | |||
| weight_t->AddQuantParam(*weight_quant_arg); | |||
| weight_t->MallocData(); | |||
| float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695, | |||
| 1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; | |||
| Quantize(weight, weight_t->ElementsNum(), weight_quant_arg->scale, weight_quant_arg->zeroPoint, | |||
| reinterpret_cast<int8_t *>(weight_t->Data())); | |||
| inputs_->push_back(weight_t); | |||
| Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| out_t->MallocData(); | |||
| auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; | |||
| out_t->AddQuantParam(*output_quant_arg); | |||
| outputs_->push_back(out_t); | |||
| *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t))); | |||
| float nchw_co[] = {-26.51016327, 7.92113757, 27.25741343, 0.785643655, 31.3307619, 14.05927672, | |||
| -1.178490666, 2.5676252, 16.39408946, -0.394793726, 25.2866881, 3.827249175, | |||
| -0.626854507, -0.3122176, 10.42769169, 8.362184085, 6.04617807, -9.252362384}; | |||
| Quantize(nchw_co, out_t->ElementsNum(), output_quant_arg->scale, output_quant_arg->zeroPoint, *correct); | |||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | |||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||
| return out_t->ElementsNum(); | |||
| } | |||
| TEST_F(TestConv1x1Int8, Conv1x1Int8Test1) { | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t *correct; | |||
| auto ctx = new lite::Context; | |||
| ctx->thread_num_ = 1; | |||
| int total_size = Conv1x1Int8TestInit1(&inputs_, &outputs_, conv_param, &correct); | |||
| kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( | |||
| reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr); | |||
| conv1x1->Init(); | |||
| conv1x1->Run(); | |||
| CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2); | |||
| delete conv1x1; | |||
| for (auto t : inputs_) delete t; | |||
| for (auto t : outputs_) delete t; | |||
| free(correct); | |||
| } | |||
| int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, | |||
| ConvParameter *conv_param, int8_t **correct) { | |||
| size_t buffer_size; | |||
| Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; | |||
| in_t->AddQuantParam(*in_quant_arg); | |||
| in_t->MallocData(); | |||
| std::string input_path = "./input"; | |||
| auto input = mindspore::lite::ReadFile(input_path.c_str(), &buffer_size); | |||
| memcpy(in_t->Data(), input, buffer_size); | |||
| inputs_->push_back(in_t); | |||
| delete[] input; | |||
| Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; | |||
| weight_t->AddQuantParam(*weight_quant_arg); | |||
| weight_t->MallocData(); | |||
| std::string weight_path = "./weight"; | |||
| auto weight = mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size); | |||
| memcpy(weight_t->Data(), weight, buffer_size); | |||
| inputs_->push_back(weight_t); | |||
| delete[] weight; | |||
| Tensor *bias_t = new Tensor(kNumberTypeInt32, {4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| bias_t->MallocData(); | |||
| std::string bias_path = "./bias"; | |||
| auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size); | |||
| memcpy(bias_t->Data(), bias, buffer_size); | |||
| inputs_->push_back(bias_t); | |||
| delete[] bias; | |||
| Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| out_t->MallocData(); | |||
| auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; | |||
| out_t->AddQuantParam(*output_quant_arg); | |||
| outputs_->push_back(out_t); | |||
| *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t))); | |||
| std::string output_path = "./output"; | |||
| auto output = mindspore::lite::ReadFile(output_path.c_str(), &buffer_size); | |||
| memcpy(*correct, output, buffer_size); | |||
| delete[] output; | |||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | |||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||
| return out_t->ElementsNum(); | |||
| } | |||
| TEST_F(TestConv1x1Int8, Conv1x1Int8Test2) { | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t *correct; | |||
| auto ctx = new lite::Context; | |||
| ctx->thread_num_ = 1; | |||
| int total_size = Conv1x1Int8TestInit2(&inputs_, &outputs_, conv_param, &correct); | |||
| kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( | |||
| reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr); | |||
| conv1x1->Init(); | |||
| conv1x1->Run(); | |||
| CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2); | |||
| delete conv1x1; | |||
| for (auto t : inputs_) delete t; | |||
| for (auto t : outputs_) delete t; | |||
| free(correct); | |||
| } | |||
| } // namespace mindspore | |||