diff --git a/mindspore/lite/nnacl/int8/add_int8.c b/mindspore/lite/nnacl/int8/add_int8.c index 19bd1f2dd9..e828ec6d29 100644 --- a/mindspore/lite/nnacl/int8/add_int8.c +++ b/mindspore/lite/nnacl/int8/add_int8.c @@ -20,98 +20,144 @@ #endif #include "nnacl/quantization/fixed_point.h" -#ifdef ENABLE_NEON -int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset) { - int8x8_t input_s8 = vld1_s8(data + index); - int16x8_t input_s16 = vmovl_s8(input_s8); - return vaddq_s16(input_s16, vdupq_n_s16(offset)); -} +void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params) { + int in0_left_shift = (1 << params->left_shift_) * (1 << params->in0_left_shift_); + int in1_left_shift = (1 << params->left_shift_) * (1 << params->in1_left_shift_); + int index = 0; -int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec, - int32x4_t right_shift_vec) { - int32x4_t shifted_input = vmulq_s32(input, left_shift_result_vec); - shifted_input = vqrdmulhq_s32(shifted_input, input_multiplier_vec); - const int32x4_t fixup = vshrq_n_s32(vandq_s32(shifted_input, right_shift_vec), 31); - return vrshlq_s32(vqaddq_s32(shifted_input, fixup), right_shift_vec); -} +#ifdef ENABLE_ARM + const int8x16_t min_vec = vdupq_n_s8(params->min_); + const int8x16_t max_vac = vdupq_n_s8(params->max_); -int16x4_t AddClacSumHalfWord(int32x4_t scaled_input0, int32x4_t scaled_input1, int32x4_t left_shift_out_vec, - int32x4_t output_multiplier_vec, AddQuantParameter *para) { - int32x4_t raw_sum = vaddq_s32(scaled_input0, scaled_input1); + const int16x8_t in0_zp_vec = vdupq_n_s16(params->in0_zp_); + const int16x8_t in1_zp_vec = vdupq_n_s16(params->in1_zp_); + const int16x8_t out_zp_vec = vdupq_n_s16(params->out_zp_); - raw_sum = RoundingDivideByPOTInt32x4(vqrdmulhq_s32(vmulq_s32(raw_sum, left_shift_out_vec), output_multiplier_vec), - para->right_shift_out_); - raw_sum = vaddq_s32(raw_sum, vdupq_n_s32(para->output_offset_)); - raw_sum = vmaxq_s32(raw_sum, vdupq_n_s32(para->output_activation_min_)); - raw_sum = vminq_s32(raw_sum, vdupq_n_s32(para->output_activation_max_)); - return vqmovn_s32(raw_sum); -} + const int32x4_t in0_left_vec = vdupq_n_s32(in0_left_shift); + const int32x4_t in1_left_vec = vdupq_n_s32(in1_left_shift); + + const int32x4_t in0_right_vec = vdupq_n_s32(-params->in0_right_shift_); + const int32x4_t in1_right_vec = vdupq_n_s32(-params->in1_right_shift_); + + const int32x4_t out_left_vec = vdupq_n_s32(params->out_left_shift_); + const int32x4_t out_right_vec = vdupq_n_s32(-params->out_right_shift_); + + for (; index <= size - 16; index += 16) { + const int8x16_t in0_src = vld1q_s8(input0 + index); + const int8x16_t in1_src = vld1q_s8(input1 + index); + + const int16x8_t in0_s16_low = vmovl_s8(vget_low_s8(in0_src)); + const int16x8_t in0_s16_high = vmovl_s8(vget_high_s8(in0_src)); + const int16x8_t in1_s16_low = vmovl_s8(vget_low_s8(in1_src)); + const int16x8_t in1_s16_high = vmovl_s8(vget_high_s8(in1_src)); + + const int16x8_t in0_zp_low = vaddq_s16(in0_s16_low, in0_zp_vec); + const int16x8_t in0_zp_high = vaddq_s16(in0_s16_high, in0_zp_vec); + const int16x8_t in1_zp_low = vaddq_s16(in1_s16_low, in1_zp_vec); + const int16x8_t in1_zp_high = vaddq_s16(in1_s16_high, in1_zp_vec); + + int32x4_t in0_1 = vmovl_s16(vget_low_s16(in0_zp_low)); + int32x4_t in0_2 = vmovl_s16(vget_high_s16(in0_zp_low)); + int32x4_t in0_3 = vmovl_s16(vget_low_s16(in0_zp_high)); + int32x4_t in0_4 = vmovl_s16(vget_high_s16(in0_zp_high)); + int32x4_t in1_1 = vmovl_s16(vget_low_s16(in1_zp_low)); + int32x4_t in1_2 = vmovl_s16(vget_high_s16(in1_zp_low)); + int32x4_t in1_3 = vmovl_s16(vget_low_s16(in1_zp_high)); + int32x4_t in1_4 = vmovl_s16(vget_high_s16(in1_zp_high)); + + // Apply left shift + in0_1 = vmulq_s32(in0_1, in0_left_vec); + in0_2 = vmulq_s32(in0_2, in0_left_vec); + in0_3 = vmulq_s32(in0_3, in0_left_vec); + in0_4 = vmulq_s32(in0_4, in0_left_vec); + in1_1 = vmulq_s32(in1_1, in1_left_vec); + in1_2 = vmulq_s32(in1_2, in1_left_vec); + in1_3 = vmulq_s32(in1_3, in1_left_vec); + in1_4 = vmulq_s32(in1_4, in1_left_vec); + + // Apply the fixed-point part of the multiplier. + in0_1 = vqrdmulhq_n_s32(in0_1, params->in0_multiplier_); + in0_2 = vqrdmulhq_n_s32(in0_2, params->in0_multiplier_); + in0_3 = vqrdmulhq_n_s32(in0_3, params->in0_multiplier_); + in0_4 = vqrdmulhq_n_s32(in0_4, params->in0_multiplier_); + in1_1 = vqrdmulhq_n_s32(in1_1, params->in1_multiplier_); + in1_2 = vqrdmulhq_n_s32(in1_2, params->in1_multiplier_); + in1_3 = vqrdmulhq_n_s32(in1_3, params->in1_multiplier_); + in1_4 = vqrdmulhq_n_s32(in1_4, params->in1_multiplier_); + + // Apply right shift + in0_1 = vqaddq_s32(in0_1, vshrq_n_s32(vandq_s32(in0_1, in0_right_vec), 31)); + in0_2 = vqaddq_s32(in0_2, vshrq_n_s32(vandq_s32(in0_2, in0_right_vec), 31)); + in0_3 = vqaddq_s32(in0_3, vshrq_n_s32(vandq_s32(in0_3, in0_right_vec), 31)); + in0_4 = vqaddq_s32(in0_4, vshrq_n_s32(vandq_s32(in0_4, in0_right_vec), 31)); + in1_1 = vqaddq_s32(in1_1, vshrq_n_s32(vandq_s32(in1_1, in1_right_vec), 31)); + in1_2 = vqaddq_s32(in1_2, vshrq_n_s32(vandq_s32(in1_2, in1_right_vec), 31)); + in1_3 = vqaddq_s32(in1_3, vshrq_n_s32(vandq_s32(in1_3, in1_right_vec), 31)); + in1_4 = vqaddq_s32(in1_4, vshrq_n_s32(vandq_s32(in1_4, in1_right_vec), 31)); -void AddInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, - AddQuantParameter *para, int *index) { - int32x4_t left_shift_result0_vec = vdupq_n_s32(para->left_shift_result0_); - int32x4_t left_shift_result1_vec = vdupq_n_s32(para->left_shift_result1_); - int32x4_t input0_multiplier_vec = vdupq_n_s32(para->input0_multiplier_); - int32x4_t input1_multiplier_vec = vdupq_n_s32(para->input1_multiplier_); - int32x4_t output_multiplier_vec = vdupq_n_s32(para->output_multiplier_); - int32x4_t left_shift_out_vec = vdupq_n_s32((1 << para->left_shift_out_)); - int32x4_t right_shift0_vec = vdupq_n_s32(-para->right_shift0_); - int32x4_t right_shift1_vec = vdupq_n_s32(-para->right_shift1_); - - for (; (*index) <= real_dst_count - 8; (*index) += 8) { - int16x8_t input0_val = LoadAndAddOffset(input0_data, *index, para->input0_offset_); - int16x8_t input1_val = LoadAndAddOffset(input1_data, *index, para->input1_offset_); - - int32x4_t input0_low = vmovl_s16(vget_low_s16(input0_val)); - int32x4_t input0_high = vmovl_s16(vget_high_s16(input0_val)); - int32x4_t input1_low = vmovl_s16(vget_low_s16(input1_val)); - int32x4_t input1_high = vmovl_s16(vget_high_s16(input1_val)); - - int32x4_t scaled_input0_low = - ClacScaledInput(input0_low, left_shift_result0_vec, input0_multiplier_vec, right_shift0_vec); - int32x4_t scaled_input0_high = - ClacScaledInput(input0_high, left_shift_result0_vec, input0_multiplier_vec, right_shift0_vec); - int32x4_t scaled_input1_low = - ClacScaledInput(input1_low, left_shift_result1_vec, input1_multiplier_vec, right_shift1_vec); - int32x4_t scaled_input1_high = - ClacScaledInput(input1_high, left_shift_result1_vec, input1_multiplier_vec, right_shift1_vec); - - int16x4_t sum_low = - AddClacSumHalfWord(scaled_input0_low, scaled_input1_low, left_shift_out_vec, output_multiplier_vec, para); - int16x4_t sum_high = - AddClacSumHalfWord(scaled_input0_high, scaled_input1_high, left_shift_out_vec, output_multiplier_vec, para); - - int16x8_t res_s16 = vcombine_s16(sum_low, sum_high); - int8x8_t res_u8_n0 = vqmovn_s16(res_s16); - vst1_s8(output_data + *index, res_u8_n0); + in0_1 = vrshlq_s32(in0_1, in0_right_vec); + in0_2 = vrshlq_s32(in0_2, in0_right_vec); + in0_3 = vrshlq_s32(in0_3, in0_right_vec); + in0_4 = vrshlq_s32(in0_4, in0_right_vec); + in1_1 = vrshlq_s32(in1_1, in1_right_vec); + in1_2 = vrshlq_s32(in1_2, in1_right_vec); + in1_3 = vrshlq_s32(in1_3, in1_right_vec); + in1_4 = vrshlq_s32(in1_4, in1_right_vec); + + /* calculate output */ + int32x4_t out1 = vaddq_s32(in0_1, in1_1); + int32x4_t out2 = vaddq_s32(in0_2, in1_2); + int32x4_t out3 = vaddq_s32(in0_3, in1_3); + int32x4_t out4 = vaddq_s32(in0_4, in1_4); + + // Apply left shift + out1 = vshlq_s32(out1, out_left_vec); + out2 = vshlq_s32(out2, out_left_vec); + out3 = vshlq_s32(out3, out_left_vec); + out4 = vshlq_s32(out4, out_left_vec); + + // Apply the fixed-point part of the multiplier. + out1 = vqrdmulhq_n_s32(out1, params->out_multiplier_); + out2 = vqrdmulhq_n_s32(out2, params->out_multiplier_); + out3 = vqrdmulhq_n_s32(out3, params->out_multiplier_); + out4 = vqrdmulhq_n_s32(out4, params->out_multiplier_); + + // Apply right shift + out1 = vqaddq_s32(out1, vshrq_n_s32(vandq_s32(out1, out_right_vec), 31)); + out2 = vqaddq_s32(out2, vshrq_n_s32(vandq_s32(out2, out_right_vec), 31)); + out3 = vqaddq_s32(out3, vshrq_n_s32(vandq_s32(out3, out_right_vec), 31)); + out4 = vqaddq_s32(out4, vshrq_n_s32(vandq_s32(out4, out_right_vec), 31)); + + out1 = vrshlq_s32(out1, out_right_vec); + out2 = vrshlq_s32(out2, out_right_vec); + out3 = vrshlq_s32(out3, out_right_vec); + out4 = vrshlq_s32(out4, out_right_vec); + + const int16x4_t out1_s16 = vmovn_s32(out1); + const int16x4_t out2_s16 = vmovn_s32(out2); + const int16x4_t out3_s16 = vmovn_s32(out3); + const int16x4_t out4_s16 = vmovn_s32(out4); + + const int16x8_t out_s16_1 = vaddq_s16(vcombine_s16(out1_s16, out2_s16), out_zp_vec); + const int16x8_t out_s16_2 = vaddq_s16(vcombine_s16(out3_s16, out4_s16), out_zp_vec); + + const int8x16_t out = vcombine_s8(vqmovn_s16(out_s16_1), vqmovn_s16(out_s16_2)); + const int8x16_t int8_out = vmaxq_s8(min_vec, vminq_s8(max_vac, out)); + + vst1q_s8(output + index, int8_out); } -} #endif -void AddInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, - AddQuantParameter *para) { - int index = 0; -#ifdef ENABLE_NEON - AddInt8NEON(input0_data, input1_data, output_data, real_dst_count, para, &index); -#endif - for (; index < real_dst_count; ++index) { - const int32_t input0_val = para->input0_offset_ + input0_data[index]; - const int32_t input1_val = para->input1_offset_ + input1_data[index]; - const int32_t shifted_input0_val = input0_val * para->left_shift_result0_; - const int32_t shifted_input1_val = input1_val * para->left_shift_result1_; - const int32_t scaled_input0_val = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul(shifted_input0_val, para->input0_multiplier_), para->right_shift0_); - const int32_t scaled_input1_val = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul(shifted_input1_val, para->input1_multiplier_), para->right_shift1_); - - const int32_t raw_sum = scaled_input0_val + scaled_input1_val; - const int32_t raw_output = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(raw_sum * (1 << (unsigned int)para->left_shift_out_), - para->output_multiplier_), - para->right_shift_out_) + - para->output_offset_; - - output_data[index] = (int8_t)MSMAX(para->output_activation_min_, MSMIN(raw_output, para->output_activation_max_)); + for (; index < size; index++) { + const int32_t in0_left = (input0[index] + params->in0_zp_) * in0_left_shift; + const int32_t in1_left = (input1[index] + params->in1_zp_) * in1_left_shift; + const int32_t in0 = MultiplyByMultiplierAndRightShift(in0_left, params->in0_multiplier_, params->in0_right_shift_); + const int32_t in1 = MultiplyByMultiplierAndRightShift(in1_left, params->in1_multiplier_, params->in1_right_shift_); + + int32_t out = MultiplyByQuantizedMultiplier(in0 + in1, params->out_multiplier_, params->out_left_shift_, + -params->out_right_shift_); + out += params->out_zp_; + output[index] = (int8_t)MSMAX(params->min_, MSMIN(out, params->max_)); } return; } diff --git a/mindspore/lite/nnacl/int8/add_int8.h b/mindspore/lite/nnacl/int8/add_int8.h index 92b7e9e53f..127be10383 100644 --- a/mindspore/lite/nnacl/int8/add_int8.h +++ b/mindspore/lite/nnacl/int8/add_int8.h @@ -20,43 +20,35 @@ #include "nnacl/op_base.h" typedef struct AddQuantParameter { - int input0_offset_; - int input1_offset_; - int output_offset_; - float input0_scale_; - float input1_scale_; - float output_scale_; - int input0_multiplier_; - int input1_multiplier_; - int output_multiplier_; - int input0_shift_; - int input1_shift_; - int output_shift_; - int output_activation_min_; - int output_activation_max_; - int left_shift_result0_; - int left_shift_result1_; - int right_shift0_; - int right_shift1_; - int left_shift_out_; - int right_shift_out_; + int left_shift_; + int32_t min_; + int32_t max_; + + int32_t in0_zp_; + int32_t in1_zp_; + int32_t out_zp_; + + int32_t in0_left_shift_; + int32_t in0_right_shift_; + int32_t in0_multiplier_; + + int32_t in1_left_shift_; + int32_t in1_right_shift_; + int32_t in1_multiplier_; + + int32_t out_left_shift_; + int32_t out_right_shift_; + int32_t out_multiplier_; } AddQuantParameter; #ifdef __cplusplus extern "C" { #endif -void AddInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, - AddQuantParameter *para); +void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params); + #ifdef __cplusplus } #endif -#ifdef ENABLE_NEON -#include -int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset); -int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec, - int32x4_t right_shift_vec); -#endif - #endif // MINDSPORE_LITE_NNACL_ADD_INT8_H_ diff --git a/mindspore/lite/nnacl/int8/arithmetic_self_int8.c b/mindspore/lite/nnacl/int8/arithmetic_self_int8.c index 931bf1c5f1..c3576843f2 100644 --- a/mindspore/lite/nnacl/int8/arithmetic_self_int8.c +++ b/mindspore/lite/nnacl/int8/arithmetic_self_int8.c @@ -18,7 +18,7 @@ #include "nnacl/int8/arithmetic_self_int8.h" #ifdef ENABLE_NEON #include -#include "nnacl/int8/add_int8.h" +#include "nnacl/int8/common_func_int8.h" #endif #include "nnacl/quantization/fixed_point.h" diff --git a/mindspore/lite/nnacl/int8/common_func_int8.c b/mindspore/lite/nnacl/int8/common_func_int8.c index dfe7149fff..816ddfc896 100644 --- a/mindspore/lite/nnacl/int8/common_func_int8.c +++ b/mindspore/lite/nnacl/int8/common_func_int8.c @@ -56,3 +56,19 @@ void PostFuncInt8C4(const int32_t *in, const int32_t *bias, int8_t *out, size_t #endif return; } + +#ifdef ENABLE_ARM +int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset) { + int8x8_t input_s8 = vld1_s8(data + index); + int16x8_t input_s16 = vmovl_s8(input_s8); + return vaddq_s16(input_s16, vdupq_n_s16(offset)); +} + +int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec, + int32x4_t right_shift_vec) { + int32x4_t shifted_input = vmulq_s32(input, left_shift_result_vec); + shifted_input = vqrdmulhq_s32(shifted_input, input_multiplier_vec); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(shifted_input, right_shift_vec), 31); + return vrshlq_s32(vqaddq_s32(shifted_input, fixup), right_shift_vec); +} +#endif diff --git a/mindspore/lite/nnacl/int8/common_func_int8.h b/mindspore/lite/nnacl/int8/common_func_int8.h index 912200619f..d90045c20b 100644 --- a/mindspore/lite/nnacl/int8/common_func_int8.h +++ b/mindspore/lite/nnacl/int8/common_func_int8.h @@ -20,6 +20,9 @@ #include #include #include +#ifdef ENABLE_NEON +#include +#endif #include "nnacl/op_base.h" #include "nnacl/conv_parameter.h" @@ -50,6 +53,9 @@ void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, con void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums, int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int32_t acc_max); +int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset); +int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec, + int32x4_t right_shift_vec); #endif #ifdef ENABLE_ARM32 diff --git a/mindspore/lite/nnacl/int8/mul_int8.c b/mindspore/lite/nnacl/int8/mul_int8.c index 3e519ba287..35c530b244 100644 --- a/mindspore/lite/nnacl/int8/mul_int8.c +++ b/mindspore/lite/nnacl/int8/mul_int8.c @@ -18,7 +18,7 @@ #include "nnacl/mul_parameter.h" #ifdef ENABLE_NEON #include -#include "nnacl/int8/add_int8.h" +#include "nnacl/int8/common_func_int8.h" #endif #include "nnacl/quantization/fixed_point.h" diff --git a/mindspore/lite/nnacl/int8/sub_int8.c b/mindspore/lite/nnacl/int8/sub_int8.c index c7b4faf2af..1961dd0d80 100644 --- a/mindspore/lite/nnacl/int8/sub_int8.c +++ b/mindspore/lite/nnacl/int8/sub_int8.c @@ -17,7 +17,7 @@ #include "nnacl/int8/sub_int8.h" #ifdef ENABLE_NEON #include -#include "nnacl/int8/add_int8.h" +#include "nnacl/int8/common_func_int8.h" #endif #include "nnacl/quantization/fixed_point.h" diff --git a/mindspore/lite/nnacl/quantization/fixed_point.c b/mindspore/lite/nnacl/quantization/fixed_point.c index 52adfa8dec..bfdc17cacb 100644 --- a/mindspore/lite/nnacl/quantization/fixed_point.c +++ b/mindspore/lite/nnacl/quantization/fixed_point.c @@ -54,6 +54,10 @@ int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t lef return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift); } +int MultiplyByMultiplierAndRightShift(int32_t value, int32_t multiplier, int32_t right_shift) { + return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value, multiplier), right_shift); +} + int FractionsBits(int integer_bits) { return 8 * sizeof(int32_t) - 1 - integer_bits; } int FixedPoint_One(int integer_bits, int fractions_bits) { diff --git a/mindspore/lite/nnacl/quantization/fixed_point.h b/mindspore/lite/nnacl/quantization/fixed_point.h index 5a2848312f..6d81a0a8e6 100644 --- a/mindspore/lite/nnacl/quantization/fixed_point.h +++ b/mindspore/lite/nnacl/quantization/fixed_point.h @@ -42,6 +42,8 @@ int RoundingDivideByPOT(int x, int exponent); int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift); +int MultiplyByMultiplierAndRightShift(int32_t value, int32_t multiplier, int32_t right_shift); + int SaturatingRoundingMultiplyByPOT(int32_t x, int exponent); int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc index c594e01b65..ff3114293c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc @@ -22,6 +22,7 @@ #include "src/runtime/runtime_api.h" #include "src/kernel_registry.h" #include "include/errorcode.h" +#include "src/common/file_utils.h" using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; @@ -30,111 +31,108 @@ using mindspore::schema::PrimitiveType_Add; namespace mindspore::kernel { int QuantizedAddCPUKernel::Init() { - lite::Tensor *input0 = in_tensors_.at(0); - lite::Tensor *input1 = in_tensors_.at(1); - lite::Tensor *output = out_tensors_.at(0); - MS_ASSERT(input0); - MS_ASSERT(input1); - MS_ASSERT(output); - - para_.input0_scale_ = input0->GetQuantParams().front().scale; - para_.input0_offset_ = input0->GetQuantParams().front().zeroPoint * -1; - para_.input1_scale_ = input1->GetQuantParams().front().scale; - para_.input1_offset_ = input1->GetQuantParams().front().zeroPoint * -1; - para_.output_scale_ = output->GetQuantParams().front().scale; - para_.output_offset_ = output->GetQuantParams().front().zeroPoint; - - const int left_shift = 20; // 1 << 20, 2/20 - const double twice_max_input_scale = 2 * std::max(para_.input0_scale_, para_.input1_scale_); - const double real_input0_multiplier = para_.input0_scale_ / twice_max_input_scale; - const double real_input1_multiplier = para_.input1_scale_ / twice_max_input_scale; - const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * para_.output_scale_); - - QuantizeMultiplierSmallerThanOne(real_input0_multiplier, ¶_.input0_multiplier_, ¶_.input0_shift_); - QuantizeMultiplierSmallerThanOne(real_input1_multiplier, ¶_.input1_multiplier_, ¶_.input1_shift_); - QuantizeMultiplierSmallerThanOne(real_output_multiplier, ¶_.output_multiplier_, ¶_.output_shift_); - - switch (arith_para_->activation_type_) { - case schema::ActivationType_RELU: - para_.output_activation_min_ = 0; - para_.output_activation_max_ = std::numeric_limits::max(); - break; - case schema::ActivationType_RELU6: - para_.output_activation_min_ = 0; - para_.output_activation_max_ = 6; - break; - case schema::ActivationType_NO_ACTIVATION: - para_.output_activation_min_ = std::numeric_limits::min(); - para_.output_activation_max_ = std::numeric_limits::max(); - break; - default: - MS_LOG(ERROR) << "Add does not support activation type " << arith_para_->activation_type_; - return RET_ERROR; - } - - int left_shift0 = -para_.input0_shift_ > 0 ? -para_.input0_shift_ : 0; - para_.right_shift0_ = -para_.input0_shift_ > 0 ? 0 : para_.input0_shift_; - - int left_shift1 = -para_.input1_shift_ > 0 ? -para_.input1_shift_ : 0; - para_.right_shift1_ = -para_.input1_shift_ > 0 ? 0 : para_.input1_shift_; + auto *input0 = in_tensors_.at(0); + auto *input1 = in_tensors_.at(1); + auto *output = out_tensors_.at(0); + auto act = arith_para_->activation_type_; - para_.left_shift_out_ = -para_.output_shift_ > 0 ? -para_.output_shift_ : 0; - para_.right_shift_out_ = -para_.output_shift_ > 0 ? 0 : para_.output_shift_; + para_.in0_zp_ = input0->GetQuantParams().front().zeroPoint * -1; + para_.in1_zp_ = input1->GetQuantParams().front().zeroPoint * -1; + para_.out_zp_ = output->GetQuantParams().front().zeroPoint; - para_.left_shift_result0_ = (1 << left_shift) * ((1 << left_shift0)); - para_.left_shift_result1_ = (1 << left_shift) * ((1 << left_shift1)); + const double in0_scale = input0->GetQuantParams().front().scale; + const double in1_scale = input1->GetQuantParams().front().scale; + const double out_scale = output->GetQuantParams().front().scale; - MS_ASSERT(left_shift + left_shift0 == left_shift); - MS_ASSERT(left_shift + left_shift1 == left_shift); - return 0; -} + para_.left_shift_ = 20; + const double twice_max_input_scale = 2 * std::max(in0_scale, in1_scale); + const double in0_multiplier = in0_scale / twice_max_input_scale; + const double in1_multiplier = in1_scale / twice_max_input_scale; + const double out_multiplier = twice_max_input_scale / ((1 << para_.left_shift_) * out_scale); -int QuantizedAddCPUKernel::ReSize() { return 0; } + QuantizeMultiplierSmallerThanOne(in0_multiplier, ¶_.in0_multiplier_, ¶_.in0_left_shift_); + QuantizeMultiplierSmallerThanOne(in1_multiplier, ¶_.in1_multiplier_, ¶_.in1_left_shift_); + QuantizeMultiplierSmallerThanOne(out_multiplier, ¶_.out_multiplier_, ¶_.out_left_shift_); -int QuantizedAddCPUKernel::Run() { - input0_data_ = static_cast(in_tensors_.at(0)->MutableData()); - input1_data_ = static_cast(in_tensors_.at(1)->MutableData()); - output_data_ = static_cast(out_tensors_.at(0)->MutableData()); + para_.in0_right_shift_ = -para_.in0_left_shift_ > 0 ? 0 : para_.in0_left_shift_; + para_.in1_right_shift_ = -para_.in1_left_shift_ > 0 ? 0 : para_.in1_left_shift_; + para_.out_right_shift_ = -para_.out_left_shift_ > 0 ? 0 : para_.out_left_shift_; - elements_num_ = out_tensors_.at(0)->ElementsNum(); - count_unit_ = thread_count_ > 1 ? UP_DIV(elements_num_, thread_count_) : elements_num_; + para_.in0_left_shift_ = -para_.in0_left_shift_ > 0 ? -para_.in0_left_shift_ : 0; + para_.in1_left_shift_ = -para_.in1_left_shift_ > 0 ? -para_.in1_left_shift_ : 0; + para_.out_left_shift_ = -para_.out_left_shift_ > 0 ? -para_.out_left_shift_ : 0; - if (in_tensors_.at(0)->ElementsNum() != in_tensors_.at(1)->ElementsNum()) { - input0_data_ = static_cast(ctx_->allocator->Malloc(out_tensors_.at(0)->Size())); - input1_data_ = static_cast(ctx_->allocator->Malloc(out_tensors_.at(0)->Size())); - if (!input0_data_ || !input1_data_) { - MS_LOG(ERROR) << "malloc input0_data_ || input1_data_ failed."; - return RET_ERROR; - } + CalculateActivationRangeQuantized(act == ActType_Relu, act == ActType_Relu6, 0, 1, ¶_.min_, ¶_.max_); - TileDimensionsUint8(static_cast(in_tensors_.at(0)->MutableData()), - static_cast(in_tensors_.at(1)->MutableData()), - reinterpret_cast(input0_data_), reinterpret_cast(input1_data_), - arith_para_); - auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_); - ctx_->allocator->Free(input0_data_); - ctx_->allocator->Free(input1_data_); - return ret; + if (!InferShapeDone()) { + return RET_OK; } + return ReSize(); +} - auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_); - return ret; +int QuantizedAddCPUKernel::ReSize() { + elements_num_ = out_tensors_.at(0)->ElementsNum(); + arith_para_->broadcasting_ = in_tensors_.at(0)->ElementsNum() != in_tensors_.at(1)->ElementsNum(); + + thread_count_ = MSMIN(elements_num_, op_parameter_->thread_num_); + thread_stride_ = UP_DIV(elements_num_, thread_count_); + return RET_OK; } int AddInt8Run(void *cdata, int task_id) { auto add = reinterpret_cast(cdata); add->DoExecute(task_id); - return lite::RET_OK; + return RET_OK; } -int QuantizedAddCPUKernel::DoExecute(int tId) { - int64_t real_dst_count = MSMIN(elements_num_ - tId * count_unit_, count_unit_); - int8_t *cur_input0_data = input0_data_ + tId * count_unit_; - int8_t *cur_input1_data = input1_data_ + tId * count_unit_; - int8_t *cur_output_data = output_data_ + tId * count_unit_; +int QuantizedAddCPUKernel::DoExecute(int task_id) { + int rest_count = elements_num_ - task_id * thread_stride_; + int real_count = MSMIN(thread_stride_, rest_count); + if (real_count <= 0) { + return RET_OK; + } - AddInt8(cur_input0_data, cur_input1_data, cur_output_data, real_dst_count, ¶_); - return lite::RET_OK; + int8_t *cur_input0_data = input0_data_ + task_id * thread_stride_; + int8_t *cur_input1_data = input1_data_ + task_id * thread_stride_; + int8_t *cur_output_data = output_data_ + task_id * thread_stride_; + + AddInt8(cur_input0_data, cur_input1_data, cur_output_data, real_count, ¶_); + return RET_OK; +} + +int QuantizedAddCPUKernel::Run() { + int8_t *src_in0 = static_cast(in_tensors_.at(0)->data_c()); + int8_t *src_in1 = static_cast(in_tensors_.at(1)->data_c()); + output_data_ = static_cast(out_tensors_.at(0)->data_c()); + + if (arith_para_->broadcasting_) { + input0_data_ = static_cast(context_->allocator->Malloc(elements_num_ * sizeof(int8_t))); + if (input0_data_ == nullptr) { + MS_LOG(ERROR) << "malloc input0_data_ failed."; + return RET_ERROR; + } + input1_data_ = static_cast(context_->allocator->Malloc(elements_num_ * sizeof(int8_t))); + if (input1_data_ == nullptr) { + context_->allocator->Free(input0_data_); + input0_data_ = nullptr; + MS_LOG(ERROR) << "malloc input1_data_ failed."; + return RET_ERROR; + } + + TileDimensionsInt8(src_in0, src_in1, input0_data_, input1_data_, arith_para_); + auto ret = ParallelLaunch(context_->thread_pool_, AddInt8Run, this, thread_count_); + + context_->allocator->Free(input0_data_); + context_->allocator->Free(input1_data_); + input0_data_ = nullptr; + input1_data_ = nullptr; + return ret; + } + + input0_data_ = src_in0; + input1_data_ = src_in1; + auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_); + return ret; } kernel::LiteKernel *CpuAddInt8KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h index 5acaaf0ac9..ca98a6fffd 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h @@ -28,7 +28,7 @@ class QuantizedAddCPUKernel : public LiteKernel { explicit QuantizedAddCPUKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs, const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) - : LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx_->thread_num_) { + : LiteKernel(parameter, inputs, outputs, ctx, primitive) { arith_para_ = reinterpret_cast(parameter); } ~QuantizedAddCPUKernel() override {} @@ -39,12 +39,11 @@ class QuantizedAddCPUKernel : public LiteKernel { int DoExecute(int tId); private: - const lite::InnerContext *ctx_; AddQuantParameter para_; ArithmeticParameter *arith_para_; int thread_count_; - int64_t elements_num_; - int64_t count_unit_; + int thread_stride_; + int elements_num_; int8_t *input0_data_ = nullptr; int8_t *input1_data_ = nullptr; int8_t *output_data_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc index 089055b30a..c6794db43d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc @@ -132,11 +132,14 @@ int TransposeInt8CPUKernel::Run() { auto in_tensor = in_tensors_.front(); auto out_tensor = out_tensors_.front(); + auto in_dims = in_tensor->shape(); + auto out_dims = out_tensor->shape(); + in_ptr_ = reinterpret_cast(in_tensor->data_c()); out_ptr_ = reinterpret_cast(out_tensor->data_c()); - in_shape_ = in_tensor->shape().data(); - out_shape_ = out_tensor->shape().data(); + in_shape_ = in_dims.data(); + out_shape_ = out_dims.data(); int ret = MallocTmpBuf(); if (ret != RET_OK) { diff --git a/mindspore/lite/tools/common/node_util.cc b/mindspore/lite/tools/common/node_util.cc index 5d320e198e..c9dbe3321b 100644 --- a/mindspore/lite/tools/common/node_util.cc +++ b/mindspore/lite/tools/common/node_util.cc @@ -75,6 +75,7 @@ static const std::vector int8OpList = {schema::PrimitiveT schema::PrimitiveType_Conv2D, schema::PrimitiveType_DepthwiseConv2D, schema::PrimitiveType_Add, + schema::PrimitiveType_Transpose, schema::PrimitiveType_Pooling, schema::PrimitiveType_Concat, schema::PrimitiveType_SoftMax, diff --git a/mindspore/lite/tools/converter/quantizer/quantize_util.cc b/mindspore/lite/tools/converter/quantizer/quantize_util.cc index 37543b1126..a02251eba4 100644 --- a/mindspore/lite/tools/converter/quantizer/quantize_util.cc +++ b/mindspore/lite/tools/converter/quantizer/quantize_util.cc @@ -108,6 +108,7 @@ bool QuantStrategy::CanOpPostQuantized(AnfNodePtr &node) const { schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_DeConv2D, schema::PrimitiveType_Activation, + schema::PrimitiveType_Transpose, schema::PrimitiveType_Eltwise, }; bool contain = IsContain(int8OpList, type);