@@ -20,98 +20,144 @@
 #endif
 #include "nnacl/quantization/fixed_point.h"
-#ifdef ENABLE_NEON
-int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset) {
-  int8x8_t input_s8 = vld1_s8(data + index);
-  int16x8_t input_s16 = vmovl_s8(input_s8);
-  return vaddq_s16(input_s16, vdupq_n_s16(offset));
-}
-int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
-                          int32x4_t right_shift_vec) {
-  int32x4_t shifted_input = vmulq_s32(input, left_shift_result_vec);
-  shifted_input = vqrdmulhq_s32(shifted_input, input_multiplier_vec);
-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(shifted_input, right_shift_vec), 31);
-  return vrshlq_s32(vqaddq_s32(shifted_input, fixup), right_shift_vec);
-}
-int16x4_t AddClacSumHalfWord(int32x4_t scaled_input0, int32x4_t scaled_input1, int32x4_t left_shift_out_vec,
-                             int32x4_t output_multiplier_vec, AddQuantParameter *para) {
-  int32x4_t raw_sum = vaddq_s32(scaled_input0, scaled_input1);
-  raw_sum = RoundingDivideByPOTInt32x4(vqrdmulhq_s32(vmulq_s32(raw_sum, left_shift_out_vec), output_multiplier_vec),
-                                       para->right_shift_out_);
-  raw_sum = vaddq_s32(raw_sum, vdupq_n_s32(para->output_offset_));
-  raw_sum = vmaxq_s32(raw_sum, vdupq_n_s32(para->output_activation_min_));
-  raw_sum = vminq_s32(raw_sum, vdupq_n_s32(para->output_activation_max_));
-  return vqmovn_s32(raw_sum);
-}
-void AddInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
-                 AddQuantParameter *para, int *index) {
-  int32x4_t left_shift_result0_vec = vdupq_n_s32(para->left_shift_result0_);
-  int32x4_t left_shift_result1_vec = vdupq_n_s32(para->left_shift_result1_);
-  int32x4_t input0_multiplier_vec = vdupq_n_s32(para->input0_multiplier_);
-  int32x4_t input1_multiplier_vec = vdupq_n_s32(para->input1_multiplier_);
-  int32x4_t output_multiplier_vec = vdupq_n_s32(para->output_multiplier_);
-  int32x4_t left_shift_out_vec = vdupq_n_s32((1 << para->left_shift_out_));
-  int32x4_t right_shift0_vec = vdupq_n_s32(-para->right_shift0_);
-  int32x4_t right_shift1_vec = vdupq_n_s32(-para->right_shift1_);
-  for (; (*index) <= real_dst_count - 8; (*index) += 8) {
-    int16x8_t input0_val = LoadAndAddOffset(input0_data, *index, para->input0_offset_);
-    int16x8_t input1_val = LoadAndAddOffset(input1_data, *index, para->input1_offset_);
-    int32x4_t input0_low = vmovl_s16(vget_low_s16(input0_val));
-    int32x4_t input0_high = vmovl_s16(vget_high_s16(input0_val));
-    int32x4_t input1_low = vmovl_s16(vget_low_s16(input1_val));
-    int32x4_t input1_high = vmovl_s16(vget_high_s16(input1_val));
-    int32x4_t scaled_input0_low =
-      ClacScaledInput(input0_low, left_shift_result0_vec, input0_multiplier_vec, right_shift0_vec);
-    int32x4_t scaled_input0_high =
-      ClacScaledInput(input0_high, left_shift_result0_vec, input0_multiplier_vec, right_shift0_vec);
-    int32x4_t scaled_input1_low =
-      ClacScaledInput(input1_low, left_shift_result1_vec, input1_multiplier_vec, right_shift1_vec);
-    int32x4_t scaled_input1_high =
-      ClacScaledInput(input1_high, left_shift_result1_vec, input1_multiplier_vec, right_shift1_vec);
-    int16x4_t sum_low =
-      AddClacSumHalfWord(scaled_input0_low, scaled_input1_low, left_shift_out_vec, output_multiplier_vec, para);
-    int16x4_t sum_high =
-      AddClacSumHalfWord(scaled_input0_high, scaled_input1_high, left_shift_out_vec, output_multiplier_vec, para);
-    int16x8_t res_s16 = vcombine_s16(sum_low, sum_high);
-    int8x8_t res_u8_n0 = vqmovn_s16(res_s16);
-    vst1_s8(output_data + *index, res_u8_n0);
+void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params) {
+  int in0_left_shift = (1 << params->left_shift_) * (1 << params->in0_left_shift_);
+  int in1_left_shift = (1 << params->left_shift_) * (1 << params->in1_left_shift_);
+  int index = 0;
+#ifdef ENABLE_ARM
+  const int8x16_t min_vec = vdupq_n_s8(params->min_);
+  const int8x16_t max_vac = vdupq_n_s8(params->max_);
+  const int16x8_t in0_zp_vec = vdupq_n_s16(params->in0_zp_);
+  const int16x8_t in1_zp_vec = vdupq_n_s16(params->in1_zp_);
+  const int16x8_t out_zp_vec = vdupq_n_s16(params->out_zp_);
+  const int32x4_t in0_left_vec = vdupq_n_s32(in0_left_shift);
+  const int32x4_t in1_left_vec = vdupq_n_s32(in1_left_shift);
+  const int32x4_t in0_right_vec = vdupq_n_s32(-params->in0_right_shift_);
+  const int32x4_t in1_right_vec = vdupq_n_s32(-params->in1_right_shift_);
+  const int32x4_t out_left_vec = vdupq_n_s32(params->out_left_shift_);
+  const int32x4_t out_right_vec = vdupq_n_s32(-params->out_right_shift_);
+  for (; index <= size - 16; index += 16) {
+    const int8x16_t in0_src = vld1q_s8(input0 + index);
+    const int8x16_t in1_src = vld1q_s8(input1 + index);
+    const int16x8_t in0_s16_low = vmovl_s8(vget_low_s8(in0_src));
+    const int16x8_t in0_s16_high = vmovl_s8(vget_high_s8(in0_src));
+    const int16x8_t in1_s16_low = vmovl_s8(vget_low_s8(in1_src));
+    const int16x8_t in1_s16_high = vmovl_s8(vget_high_s8(in1_src));
+    const int16x8_t in0_zp_low = vaddq_s16(in0_s16_low, in0_zp_vec);
+    const int16x8_t in0_zp_high = vaddq_s16(in0_s16_high, in0_zp_vec);
+    const int16x8_t in1_zp_low = vaddq_s16(in1_s16_low, in1_zp_vec);
+    const int16x8_t in1_zp_high = vaddq_s16(in1_s16_high, in1_zp_vec);
+    int32x4_t in0_1 = vmovl_s16(vget_low_s16(in0_zp_low));
+    int32x4_t in0_2 = vmovl_s16(vget_high_s16(in0_zp_low));
+    int32x4_t in0_3 = vmovl_s16(vget_low_s16(in0_zp_high));
+    int32x4_t in0_4 = vmovl_s16(vget_high_s16(in0_zp_high));
+    int32x4_t in1_1 = vmovl_s16(vget_low_s16(in1_zp_low));
+    int32x4_t in1_2 = vmovl_s16(vget_high_s16(in1_zp_low));
+    int32x4_t in1_3 = vmovl_s16(vget_low_s16(in1_zp_high));
+    int32x4_t in1_4 = vmovl_s16(vget_high_s16(in1_zp_high));
+    // Apply left shift
+    in0_1 = vmulq_s32(in0_1, in0_left_vec);
+    in0_2 = vmulq_s32(in0_2, in0_left_vec);
+    in0_3 = vmulq_s32(in0_3, in0_left_vec);
+    in0_4 = vmulq_s32(in0_4, in0_left_vec);
+    in1_1 = vmulq_s32(in1_1, in1_left_vec);
+    in1_2 = vmulq_s32(in1_2, in1_left_vec);
+    in1_3 = vmulq_s32(in1_3, in1_left_vec);
+    in1_4 = vmulq_s32(in1_4, in1_left_vec);
+    // Apply the fixed-point part of the multiplier.
+    in0_1 = vqrdmulhq_n_s32(in0_1, params->in0_multiplier_);
+    in0_2 = vqrdmulhq_n_s32(in0_2, params->in0_multiplier_);
+    in0_3 = vqrdmulhq_n_s32(in0_3, params->in0_multiplier_);
+    in0_4 = vqrdmulhq_n_s32(in0_4, params->in0_multiplier_);
+    in1_1 = vqrdmulhq_n_s32(in1_1, params->in1_multiplier_);
+    in1_2 = vqrdmulhq_n_s32(in1_2, params->in1_multiplier_);
+    in1_3 = vqrdmulhq_n_s32(in1_3, params->in1_multiplier_);
+    in1_4 = vqrdmulhq_n_s32(in1_4, params->in1_multiplier_);
+    // Apply right shift
+    in0_1 = vqaddq_s32(in0_1, vshrq_n_s32(vandq_s32(in0_1, in0_right_vec), 31));
+    in0_2 = vqaddq_s32(in0_2, vshrq_n_s32(vandq_s32(in0_2, in0_right_vec), 31));
+    in0_3 = vqaddq_s32(in0_3, vshrq_n_s32(vandq_s32(in0_3, in0_right_vec), 31));
+    in0_4 = vqaddq_s32(in0_4, vshrq_n_s32(vandq_s32(in0_4, in0_right_vec), 31));
+    in1_1 = vqaddq_s32(in1_1, vshrq_n_s32(vandq_s32(in1_1, in1_right_vec), 31));
+    in1_2 = vqaddq_s32(in1_2, vshrq_n_s32(vandq_s32(in1_2, in1_right_vec), 31));
+    in1_3 = vqaddq_s32(in1_3, vshrq_n_s32(vandq_s32(in1_3, in1_right_vec), 31));
+    in1_4 = vqaddq_s32(in1_4, vshrq_n_s32(vandq_s32(in1_4, in1_right_vec), 31));
+    in0_1 = vrshlq_s32(in0_1, in0_right_vec);
+    in0_2 = vrshlq_s32(in0_2, in0_right_vec);
+    in0_3 = vrshlq_s32(in0_3, in0_right_vec);
+    in0_4 = vrshlq_s32(in0_4, in0_right_vec);
+    in1_1 = vrshlq_s32(in1_1, in1_right_vec);
+    in1_2 = vrshlq_s32(in1_2, in1_right_vec);
+    in1_3 = vrshlq_s32(in1_3, in1_right_vec);
+    in1_4 = vrshlq_s32(in1_4, in1_right_vec);
+    /* calculate output */
+    int32x4_t out1 = vaddq_s32(in0_1, in1_1);
+    int32x4_t out2 = vaddq_s32(in0_2, in1_2);
+    int32x4_t out3 = vaddq_s32(in0_3, in1_3);
+    int32x4_t out4 = vaddq_s32(in0_4, in1_4);
+    // Apply left shift
+    out1 = vshlq_s32(out1, out_left_vec);
+    out2 = vshlq_s32(out2, out_left_vec);
+    out3 = vshlq_s32(out3, out_left_vec);
+    out4 = vshlq_s32(out4, out_left_vec);
+    // Apply the fixed-point part of the multiplier.
+    out1 = vqrdmulhq_n_s32(out1, params->out_multiplier_);
+    out2 = vqrdmulhq_n_s32(out2, params->out_multiplier_);
+    out3 = vqrdmulhq_n_s32(out3, params->out_multiplier_);
+    out4 = vqrdmulhq_n_s32(out4, params->out_multiplier_);
+    // Apply right shift
+    out1 = vqaddq_s32(out1, vshrq_n_s32(vandq_s32(out1, out_right_vec), 31));
+    out2 = vqaddq_s32(out2, vshrq_n_s32(vandq_s32(out2, out_right_vec), 31));
+    out3 = vqaddq_s32(out3, vshrq_n_s32(vandq_s32(out3, out_right_vec), 31));
+    out4 = vqaddq_s32(out4, vshrq_n_s32(vandq_s32(out4, out_right_vec), 31));
+    out1 = vrshlq_s32(out1, out_right_vec);
+    out2 = vrshlq_s32(out2, out_right_vec);
+    out3 = vrshlq_s32(out3, out_right_vec);
+    out4 = vrshlq_s32(out4, out_right_vec);
+    const int16x4_t out1_s16 = vmovn_s32(out1);
+    const int16x4_t out2_s16 = vmovn_s32(out2);
+    const int16x4_t out3_s16 = vmovn_s32(out3);
+    const int16x4_t out4_s16 = vmovn_s32(out4);
+    const int16x8_t out_s16_1 = vaddq_s16(vcombine_s16(out1_s16, out2_s16), out_zp_vec);
+    const int16x8_t out_s16_2 = vaddq_s16(vcombine_s16(out3_s16, out4_s16), out_zp_vec);
+    const int8x16_t out = vcombine_s8(vqmovn_s16(out_s16_1), vqmovn_s16(out_s16_2));
+    const int8x16_t int8_out = vmaxq_s8(min_vec, vminq_s8(max_vac, out));
+    vst1q_s8(output + index, int8_out);
   }
-}
 #endif
-void AddInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
-             AddQuantParameter *para) {
-  int index = 0;
-#ifdef ENABLE_NEON
-  AddInt8NEON(input0_data, input1_data, output_data, real_dst_count, para, &index);
-#endif
-  for (; index < real_dst_count; ++index) {
-    const int32_t input0_val = para->input0_offset_ + input0_data[index];
-    const int32_t input1_val = para->input1_offset_ + input1_data[index];
-    const int32_t shifted_input0_val = input0_val * para->left_shift_result0_;
-    const int32_t shifted_input1_val = input1_val * para->left_shift_result1_;
-    const int32_t scaled_input0_val = RoundingDivideByPOT(
-      SaturatingRoundingDoublingHighMul(shifted_input0_val, para->input0_multiplier_), para->right_shift0_);
-    const int32_t scaled_input1_val = RoundingDivideByPOT(
-      SaturatingRoundingDoublingHighMul(shifted_input1_val, para->input1_multiplier_), para->right_shift1_);
-    const int32_t raw_sum = scaled_input0_val + scaled_input1_val;
-    const int32_t raw_output =
-      RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(raw_sum * (1 << (unsigned int)para->left_shift_out_),
-                                                            para->output_multiplier_),
-                          para->right_shift_out_) +
-      para->output_offset_;
-    output_data[index] = (int8_t)MSMAX(para->output_activation_min_, MSMIN(raw_output, para->output_activation_max_));
+  for (; index < size; index++) {
+    const int32_t in0_left = (input0[index] + params->in0_zp_) * in0_left_shift;
+    const int32_t in1_left = (input1[index] + params->in1_zp_) * in1_left_shift;
+    const int32_t in0 = MultiplyByMultiplierAndRightShift(in0_left, params->in0_multiplier_, params->in0_right_shift_);
+    const int32_t in1 = MultiplyByMultiplierAndRightShift(in1_left, params->in1_multiplier_, params->in1_right_shift_);
+    int32_t out = MultiplyByQuantizedMultiplier(in0 + in1, params->out_multiplier_, params->out_left_shift_,
+                                                -params->out_right_shift_);
+    out += params->out_zp_;
+    output[index] = (int8_t)MSMAX(params->min_, MSMIN(out, params->max_));
   }
   return;
 }
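Note (editorial aid, not part of the change): the scalar tail loop in the new AddInt8 relies on the gemmlowp-style helpers declared in nnacl/quantization/fixed_point.h. The following is a minimal, self-contained C sketch of the rounding semantics those helpers conventionally implement; the names mirror the nnacl functions referenced above, but this is an illustrative sketch, not the nnacl source.

#include <stdint.h>

/* Saturating, rounding, doubling high multiply: the high 32 bits of (2 * a * b),
 * i.e. a Q31 fixed-point multiply. Saturates only for a == b == INT32_MIN. */
static int32_t SaturatingRoundingDoublingHighMulSketch(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) {
    return INT32_MAX;
  }
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Rounding arithmetic right shift by 'exponent' (rounds half away from zero).
 * Assumes the usual arithmetic-shift behaviour for negative operands. */
static int32_t RoundingDivideByPOTSketch(int32_t x, int exponent) {
  int32_t mask = (int32_t)((1ll << exponent) - 1);
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* Requantize one already-left-shifted term, as the new scalar loop does per input. */
static int32_t MultiplyByMultiplierAndRightShiftSketch(int32_t value, int32_t multiplier, int right_shift) {
  return RoundingDivideByPOTSketch(SaturatingRoundingDoublingHighMulSketch(value, multiplier), right_shift);
}

With this convention, each input element is zero-point-corrected, left-shifted into high precision, rescaled by its per-input multiplier, summed with the other input, rescaled once more by the output multiplier, and only then offset by the output zero point and clamped to [min_, max_].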
@@ -20,43 +20,35 @@
 #include "nnacl/op_base.h"
 typedef struct AddQuantParameter {
-  int input0_offset_;
-  int input1_offset_;
-  int output_offset_;
-  float input0_scale_;
-  float input1_scale_;
-  float output_scale_;
-  int input0_multiplier_;
-  int input1_multiplier_;
-  int output_multiplier_;
-  int input0_shift_;
-  int input1_shift_;
-  int output_shift_;
-  int output_activation_min_;
-  int output_activation_max_;
-  int left_shift_result0_;
-  int left_shift_result1_;
-  int right_shift0_;
-  int right_shift1_;
-  int left_shift_out_;
-  int right_shift_out_;
+  int left_shift_;
+  int32_t min_;
+  int32_t max_;
+  int32_t in0_zp_;
+  int32_t in1_zp_;
+  int32_t out_zp_;
+  int32_t in0_left_shift_;
+  int32_t in0_right_shift_;
+  int32_t in0_multiplier_;
+  int32_t in1_left_shift_;
+  int32_t in1_right_shift_;
+  int32_t in1_multiplier_;
+  int32_t out_left_shift_;
+  int32_t out_right_shift_;
+  int32_t out_multiplier_;
 } AddQuantParameter;
 #ifdef __cplusplus
 extern "C" {
 #endif
-void AddInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
-             AddQuantParameter *para);
+void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params);
 #ifdef __cplusplus
 }
 #endif
-#ifdef ENABLE_NEON
-#include <arm_neon.h>
-int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset);
-int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
-                          int32x4_t right_shift_vec);
-#endif
 #endif  // MINDSPORE_LITE_NNACL_ADD_INT8_H_
@@ -18,7 +18,7 @@
 #include "nnacl/int8/arithmetic_self_int8.h"
 #ifdef ENABLE_NEON
 #include <arm_neon.h>
-#include "nnacl/int8/add_int8.h"
+#include "nnacl/int8/common_func_int8.h"
 #endif
 #include "nnacl/quantization/fixed_point.h"
@@ -56,3 +56,19 @@ void PostFuncInt8C4(const int32_t *in, const int32_t *bias, int8_t *out, size_t
 #endif
   return;
 }
+
+#ifdef ENABLE_ARM
+int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset) {
+  int8x8_t input_s8 = vld1_s8(data + index);
+  int16x8_t input_s16 = vmovl_s8(input_s8);
+  return vaddq_s16(input_s16, vdupq_n_s16(offset));
+}
+
+int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
+                          int32x4_t right_shift_vec) {
+  int32x4_t shifted_input = vmulq_s32(input, left_shift_result_vec);
+  shifted_input = vqrdmulhq_s32(shifted_input, input_multiplier_vec);
+  const int32x4_t fixup = vshrq_n_s32(vandq_s32(shifted_input, right_shift_vec), 31);
+  return vrshlq_s32(vqaddq_s32(shifted_input, fixup), right_shift_vec);
+}
+#endif
@@ -20,6 +20,9 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
 #include "nnacl/op_base.h"
 #include "nnacl/conv_parameter.h"
@@ -50,6 +53,9 @@ void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, con
 void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
                       int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
                       int32_t acc_max);
+int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset);
+int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
+                          int32x4_t right_shift_vec);
 #endif

 #ifdef ENABLE_ARM32
@@ -18,7 +18,7 @@
 #include "nnacl/mul_parameter.h"
 #ifdef ENABLE_NEON
 #include <arm_neon.h>
-#include "nnacl/int8/add_int8.h"
+#include "nnacl/int8/common_func_int8.h"
 #endif
 #include "nnacl/quantization/fixed_point.h"
@@ -17,7 +17,7 @@
 #include "nnacl/int8/sub_int8.h"
 #ifdef ENABLE_NEON
 #include <arm_neon.h>
-#include "nnacl/int8/add_int8.h"
+#include "nnacl/int8/common_func_int8.h"
 #endif
 #include "nnacl/quantization/fixed_point.h"
@@ -54,6 +54,10 @@ int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t lef
   return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift);
 }

+int MultiplyByMultiplierAndRightShift(int32_t value, int32_t multiplier, int32_t right_shift) {
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value, multiplier), right_shift);
+}
+
 int FractionsBits(int integer_bits) { return 8 * sizeof(int32_t) - 1 - integer_bits; }

 int FixedPoint_One(int integer_bits, int fractions_bits) {
@@ -42,6 +42,8 @@ int RoundingDivideByPOT(int x, int exponent);
 int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift);
+
+int MultiplyByMultiplierAndRightShift(int32_t value, int32_t multiplier, int32_t right_shift);

 int SaturatingRoundingMultiplyByPOT(int32_t x, int exponent);

 int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst);
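Note (editorial aid, not part of the change): the (multiplier, shift) pairs consumed by MultiplyByQuantizedMultiplier and the new MultiplyByMultiplierAndRightShift come from decomposing a real-valued scale ratio, which the kernel Init() further below does via QuantizeMultiplierSmallerThanOne. The sketch here shows the standard gemmlowp/TFLite decomposition under that assumption; QuantizeMultiplierSmallerThanOneSketch is a hypothetical name, and the actual nnacl routine may use a different sign convention for the returned shift.

#include <math.h>
#include <stdint.h>

/* Decompose a real multiplier in (0, 1) into a Q31 fixed-point multiplier plus a
 * non-negative right shift, so that real_multiplier ~= qm / 2^31 / 2^right_shift. */
static void QuantizeMultiplierSmallerThanOneSketch(double real_multiplier, int32_t *quantized_multiplier,
                                                   int *right_shift) {
  int shift = 0;
  const double q = frexp(real_multiplier, &shift);      /* real_multiplier = q * 2^shift, q in [0.5, 1) */
  int64_t q_fixed = (int64_t)llround(q * (1ll << 31));  /* Q31 representation of q */
  if (q_fixed == (1ll << 31)) {                         /* rounding pushed q up to 1.0: renormalize */
    q_fixed /= 2;
    ++shift;
  }
  *quantized_multiplier = (int32_t)q_fixed;
  *right_shift = -shift;                                /* shift <= 0 for multipliers below one */
}

Init() computes in0_multiplier and in1_multiplier as scale ratios strictly below one, so in this sketch the decomposed pair always carries a non-negative right shift; the out_multiplier path additionally folds in the common left_shift_ of 20.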
@@ -22,6 +22,7 @@
 #include "src/runtime/runtime_api.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
+#include "src/common/file_utils.h"

 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
@@ -30,111 +31,108 @@ using mindspore::schema::PrimitiveType_Add;
 namespace mindspore::kernel {
 int QuantizedAddCPUKernel::Init() {
-  lite::Tensor *input0 = in_tensors_.at(0);
-  lite::Tensor *input1 = in_tensors_.at(1);
-  lite::Tensor *output = out_tensors_.at(0);
-  MS_ASSERT(input0);
-  MS_ASSERT(input1);
-  MS_ASSERT(output);
-  para_.input0_scale_ = input0->GetQuantParams().front().scale;
-  para_.input0_offset_ = input0->GetQuantParams().front().zeroPoint * -1;
-  para_.input1_scale_ = input1->GetQuantParams().front().scale;
-  para_.input1_offset_ = input1->GetQuantParams().front().zeroPoint * -1;
-  para_.output_scale_ = output->GetQuantParams().front().scale;
-  para_.output_offset_ = output->GetQuantParams().front().zeroPoint;
-  const int left_shift = 20;  // 1 << 20, 2/20
-  const double twice_max_input_scale = 2 * std::max(para_.input0_scale_, para_.input1_scale_);
-  const double real_input0_multiplier = para_.input0_scale_ / twice_max_input_scale;
-  const double real_input1_multiplier = para_.input1_scale_ / twice_max_input_scale;
-  const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * para_.output_scale_);
-  QuantizeMultiplierSmallerThanOne(real_input0_multiplier, &para_.input0_multiplier_, &para_.input0_shift_);
-  QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &para_.input1_multiplier_, &para_.input1_shift_);
-  QuantizeMultiplierSmallerThanOne(real_output_multiplier, &para_.output_multiplier_, &para_.output_shift_);
-  switch (arith_para_->activation_type_) {
-    case schema::ActivationType_RELU:
-      para_.output_activation_min_ = 0;
-      para_.output_activation_max_ = std::numeric_limits<int8_t>::max();
-      break;
-    case schema::ActivationType_RELU6:
-      para_.output_activation_min_ = 0;
-      para_.output_activation_max_ = 6;
-      break;
-    case schema::ActivationType_NO_ACTIVATION:
-      para_.output_activation_min_ = std::numeric_limits<int8_t>::min();
-      para_.output_activation_max_ = std::numeric_limits<int8_t>::max();
-      break;
-    default:
-      MS_LOG(ERROR) << "Add does not support activation type " << arith_para_->activation_type_;
-      return RET_ERROR;
-  }
-  int left_shift0 = -para_.input0_shift_ > 0 ? -para_.input0_shift_ : 0;
-  para_.right_shift0_ = -para_.input0_shift_ > 0 ? 0 : para_.input0_shift_;
-  int left_shift1 = -para_.input1_shift_ > 0 ? -para_.input1_shift_ : 0;
-  para_.right_shift1_ = -para_.input1_shift_ > 0 ? 0 : para_.input1_shift_;
-  para_.left_shift_out_ = -para_.output_shift_ > 0 ? -para_.output_shift_ : 0;
-  para_.right_shift_out_ = -para_.output_shift_ > 0 ? 0 : para_.output_shift_;
-  para_.left_shift_result0_ = (1 << left_shift) * ((1 << left_shift0));
-  para_.left_shift_result1_ = (1 << left_shift) * ((1 << left_shift1));
-  MS_ASSERT(left_shift + left_shift0 == left_shift);
-  MS_ASSERT(left_shift + left_shift1 == left_shift);
-  return 0;
-}
-int QuantizedAddCPUKernel::ReSize() { return 0; }
-int QuantizedAddCPUKernel::Run() {
-  input0_data_ = static_cast<int8_t *>(in_tensors_.at(0)->MutableData());
-  input1_data_ = static_cast<int8_t *>(in_tensors_.at(1)->MutableData());
-  output_data_ = static_cast<int8_t *>(out_tensors_.at(0)->MutableData());
-  elements_num_ = out_tensors_.at(0)->ElementsNum();
-  count_unit_ = thread_count_ > 1 ? UP_DIV(elements_num_, thread_count_) : elements_num_;
-  if (in_tensors_.at(0)->ElementsNum() != in_tensors_.at(1)->ElementsNum()) {
-    input0_data_ = static_cast<int8_t *>(ctx_->allocator->Malloc(out_tensors_.at(0)->Size()));
-    input1_data_ = static_cast<int8_t *>(ctx_->allocator->Malloc(out_tensors_.at(0)->Size()));
-    if (!input0_data_ || !input1_data_) {
-      MS_LOG(ERROR) << "malloc input0_data_ || input1_data_ failed.";
-      return RET_ERROR;
-    }
-    TileDimensionsUint8(static_cast<uint8_t *>(in_tensors_.at(0)->MutableData()),
-                        static_cast<uint8_t *>(in_tensors_.at(1)->MutableData()),
-                        reinterpret_cast<uint8_t *>(input0_data_), reinterpret_cast<uint8_t *>(input1_data_),
-                        arith_para_);
-    auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_);
-    ctx_->allocator->Free(input0_data_);
-    ctx_->allocator->Free(input1_data_);
-    return ret;
+  auto *input0 = in_tensors_.at(0);
+  auto *input1 = in_tensors_.at(1);
+  auto *output = out_tensors_.at(0);
+  auto act = arith_para_->activation_type_;
+  para_.in0_zp_ = input0->GetQuantParams().front().zeroPoint * -1;
+  para_.in1_zp_ = input1->GetQuantParams().front().zeroPoint * -1;
+  para_.out_zp_ = output->GetQuantParams().front().zeroPoint;
+  const double in0_scale = input0->GetQuantParams().front().scale;
+  const double in1_scale = input1->GetQuantParams().front().scale;
+  const double out_scale = output->GetQuantParams().front().scale;
+  para_.left_shift_ = 20;
+  const double twice_max_input_scale = 2 * std::max(in0_scale, in1_scale);
+  const double in0_multiplier = in0_scale / twice_max_input_scale;
+  const double in1_multiplier = in1_scale / twice_max_input_scale;
+  const double out_multiplier = twice_max_input_scale / ((1 << para_.left_shift_) * out_scale);
+  QuantizeMultiplierSmallerThanOne(in0_multiplier, &para_.in0_multiplier_, &para_.in0_left_shift_);
+  QuantizeMultiplierSmallerThanOne(in1_multiplier, &para_.in1_multiplier_, &para_.in1_left_shift_);
+  QuantizeMultiplierSmallerThanOne(out_multiplier, &para_.out_multiplier_, &para_.out_left_shift_);
+  para_.in0_right_shift_ = -para_.in0_left_shift_ > 0 ? 0 : para_.in0_left_shift_;
+  para_.in1_right_shift_ = -para_.in1_left_shift_ > 0 ? 0 : para_.in1_left_shift_;
+  para_.out_right_shift_ = -para_.out_left_shift_ > 0 ? 0 : para_.out_left_shift_;
+  para_.in0_left_shift_ = -para_.in0_left_shift_ > 0 ? -para_.in0_left_shift_ : 0;
+  para_.in1_left_shift_ = -para_.in1_left_shift_ > 0 ? -para_.in1_left_shift_ : 0;
+  para_.out_left_shift_ = -para_.out_left_shift_ > 0 ? -para_.out_left_shift_ : 0;
+  CalculateActivationRangeQuantized(act == ActType_Relu, act == ActType_Relu6, 0, 1, &para_.min_, &para_.max_);
+  if (!InferShapeDone()) {
+    return RET_OK;
   }
-  auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_);
-  return ret;
+  return ReSize();
+}
+int QuantizedAddCPUKernel::ReSize() {
+  elements_num_ = out_tensors_.at(0)->ElementsNum();
+  arith_para_->broadcasting_ = in_tensors_.at(0)->ElementsNum() != in_tensors_.at(1)->ElementsNum();
+  thread_count_ = MSMIN(elements_num_, op_parameter_->thread_num_);
+  thread_stride_ = UP_DIV(elements_num_, thread_count_);
+  return RET_OK;
 }
 int AddInt8Run(void *cdata, int task_id) {
   auto add = reinterpret_cast<QuantizedAddCPUKernel *>(cdata);
   add->DoExecute(task_id);
-  return lite::RET_OK;
+  return RET_OK;
 }
-int QuantizedAddCPUKernel::DoExecute(int tId) {
-  int64_t real_dst_count = MSMIN(elements_num_ - tId * count_unit_, count_unit_);
-  int8_t *cur_input0_data = input0_data_ + tId * count_unit_;
-  int8_t *cur_input1_data = input1_data_ + tId * count_unit_;
-  int8_t *cur_output_data = output_data_ + tId * count_unit_;
-  AddInt8(cur_input0_data, cur_input1_data, cur_output_data, real_dst_count, &para_);
-  return lite::RET_OK;
+int QuantizedAddCPUKernel::DoExecute(int task_id) {
+  int rest_count = elements_num_ - task_id * thread_stride_;
+  int real_count = MSMIN(thread_stride_, rest_count);
+  if (real_count <= 0) {
+    return RET_OK;
+  }
+  int8_t *cur_input0_data = input0_data_ + task_id * thread_stride_;
+  int8_t *cur_input1_data = input1_data_ + task_id * thread_stride_;
+  int8_t *cur_output_data = output_data_ + task_id * thread_stride_;
+  AddInt8(cur_input0_data, cur_input1_data, cur_output_data, real_count, &para_);
+  return RET_OK;
+}
+int QuantizedAddCPUKernel::Run() {
+  int8_t *src_in0 = static_cast<int8_t *>(in_tensors_.at(0)->data_c());
+  int8_t *src_in1 = static_cast<int8_t *>(in_tensors_.at(1)->data_c());
+  output_data_ = static_cast<int8_t *>(out_tensors_.at(0)->data_c());
+  if (arith_para_->broadcasting_) {
+    input0_data_ = static_cast<int8_t *>(context_->allocator->Malloc(elements_num_ * sizeof(int8_t)));
+    if (input0_data_ == nullptr) {
+      MS_LOG(ERROR) << "malloc input0_data_ failed.";
+      return RET_ERROR;
+    }
+    input1_data_ = static_cast<int8_t *>(context_->allocator->Malloc(elements_num_ * sizeof(int8_t)));
+    if (input1_data_ == nullptr) {
+      context_->allocator->Free(input0_data_);
+      input0_data_ = nullptr;
+      MS_LOG(ERROR) << "malloc input1_data_ failed.";
+      return RET_ERROR;
+    }
+    TileDimensionsInt8(src_in0, src_in1, input0_data_, input1_data_, arith_para_);
+    auto ret = ParallelLaunch(context_->thread_pool_, AddInt8Run, this, thread_count_);
+    context_->allocator->Free(input0_data_);
+    context_->allocator->Free(input1_data_);
+    input0_data_ = nullptr;
+    input1_data_ = nullptr;
+    return ret;
+  }
+  input0_data_ = src_in0;
+  input1_data_ = src_in1;
+  auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_);
+  return ret;
 }
 kernel::LiteKernel *CpuAddInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
@@ -28,7 +28,7 @@ class QuantizedAddCPUKernel : public LiteKernel {
   explicit QuantizedAddCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                  const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                                  const mindspore::lite::PrimitiveC *primitive)
-      : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx_->thread_num_) {
+      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
     arith_para_ = reinterpret_cast<ArithmeticParameter *>(parameter);
   }
   ~QuantizedAddCPUKernel() override {}
@@ -39,12 +39,11 @@
   int DoExecute(int tId);

  private:
-  const lite::InnerContext *ctx_;
   AddQuantParameter para_;
   ArithmeticParameter *arith_para_;
   int thread_count_;
-  int64_t elements_num_;
-  int64_t count_unit_;
+  int thread_stride_;
+  int elements_num_;
   int8_t *input0_data_ = nullptr;
   int8_t *input1_data_ = nullptr;
   int8_t *output_data_ = nullptr;
@@ -132,11 +132,14 @@ int TransposeInt8CPUKernel::Run() {
   auto in_tensor = in_tensors_.front();
   auto out_tensor = out_tensors_.front();
+  auto in_dims = in_tensor->shape();
+  auto out_dims = out_tensor->shape();

   in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c());
   out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c());
-  in_shape_ = in_tensor->shape().data();
-  out_shape_ = out_tensor->shape().data();
+  in_shape_ = in_dims.data();
+  out_shape_ = out_dims.data();

   int ret = MallocTmpBuf();
   if (ret != RET_OK) {
@@ -75,6 +75,7 @@ static const std::vector<schema::PrimitiveType> int8OpList = {schema::PrimitiveT
     schema::PrimitiveType_Conv2D,
     schema::PrimitiveType_DepthwiseConv2D,
     schema::PrimitiveType_Add,
+    schema::PrimitiveType_Transpose,
     schema::PrimitiveType_Pooling,
     schema::PrimitiveType_Concat,
     schema::PrimitiveType_SoftMax,
@@ -108,6 +108,7 @@ bool QuantStrategy::CanOpPostQuantized(AnfNodePtr &node) const {
     schema::PrimitiveType_DeDepthwiseConv2D,
     schema::PrimitiveType_DeConv2D,
     schema::PrimitiveType_Activation,
+    schema::PrimitiveType_Transpose,
     schema::PrimitiveType_Eltwise,
   };
   bool contain = IsContain(int8OpList, type);