Browse Source

[MSLITE] add optimize

tags/v1.1.0
ling 5 years ago
parent
commit
b42a660d3b
14 changed files with 278 additions and 210 deletions
  1. +131
    -85
      mindspore/lite/nnacl/int8/add_int8.c
  2. +21
    -29
      mindspore/lite/nnacl/int8/add_int8.h
  3. +1
    -1
      mindspore/lite/nnacl/int8/arithmetic_self_int8.c
  4. +16
    -0
      mindspore/lite/nnacl/int8/common_func_int8.c
  5. +6
    -0
      mindspore/lite/nnacl/int8/common_func_int8.h
  6. +1
    -1
      mindspore/lite/nnacl/int8/mul_int8.c
  7. +1
    -1
      mindspore/lite/nnacl/int8/sub_int8.c
  8. +4
    -0
      mindspore/lite/nnacl/quantization/fixed_point.c
  9. +2
    -0
      mindspore/lite/nnacl/quantization/fixed_point.h
  10. +85
    -87
      mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc
  11. +3
    -4
      mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h
  12. +5
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc
  13. +1
    -0
      mindspore/lite/tools/common/node_util.cc
  14. +1
    -0
      mindspore/lite/tools/converter/quantizer/quantize_util.cc

+ 131
- 85
mindspore/lite/nnacl/int8/add_int8.c View File

@@ -20,98 +20,144 @@
#endif #endif
#include "nnacl/quantization/fixed_point.h" #include "nnacl/quantization/fixed_point.h"


#ifdef ENABLE_NEON
int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset) {
int8x8_t input_s8 = vld1_s8(data + index);
int16x8_t input_s16 = vmovl_s8(input_s8);
return vaddq_s16(input_s16, vdupq_n_s16(offset));
}
void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params) {
int in0_left_shift = (1 << params->left_shift_) * (1 << params->in0_left_shift_);
int in1_left_shift = (1 << params->left_shift_) * (1 << params->in1_left_shift_);
int index = 0;


int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
int32x4_t right_shift_vec) {
int32x4_t shifted_input = vmulq_s32(input, left_shift_result_vec);
shifted_input = vqrdmulhq_s32(shifted_input, input_multiplier_vec);
const int32x4_t fixup = vshrq_n_s32(vandq_s32(shifted_input, right_shift_vec), 31);
return vrshlq_s32(vqaddq_s32(shifted_input, fixup), right_shift_vec);
}
#ifdef ENABLE_ARM
const int8x16_t min_vec = vdupq_n_s8(params->min_);
const int8x16_t max_vac = vdupq_n_s8(params->max_);


int16x4_t AddClacSumHalfWord(int32x4_t scaled_input0, int32x4_t scaled_input1, int32x4_t left_shift_out_vec,
int32x4_t output_multiplier_vec, AddQuantParameter *para) {
int32x4_t raw_sum = vaddq_s32(scaled_input0, scaled_input1);
const int16x8_t in0_zp_vec = vdupq_n_s16(params->in0_zp_);
const int16x8_t in1_zp_vec = vdupq_n_s16(params->in1_zp_);
const int16x8_t out_zp_vec = vdupq_n_s16(params->out_zp_);


raw_sum = RoundingDivideByPOTInt32x4(vqrdmulhq_s32(vmulq_s32(raw_sum, left_shift_out_vec), output_multiplier_vec),
para->right_shift_out_);
raw_sum = vaddq_s32(raw_sum, vdupq_n_s32(para->output_offset_));
raw_sum = vmaxq_s32(raw_sum, vdupq_n_s32(para->output_activation_min_));
raw_sum = vminq_s32(raw_sum, vdupq_n_s32(para->output_activation_max_));
return vqmovn_s32(raw_sum);
}
const int32x4_t in0_left_vec = vdupq_n_s32(in0_left_shift);
const int32x4_t in1_left_vec = vdupq_n_s32(in1_left_shift);

const int32x4_t in0_right_vec = vdupq_n_s32(-params->in0_right_shift_);
const int32x4_t in1_right_vec = vdupq_n_s32(-params->in1_right_shift_);

const int32x4_t out_left_vec = vdupq_n_s32(params->out_left_shift_);
const int32x4_t out_right_vec = vdupq_n_s32(-params->out_right_shift_);

for (; index <= size - 16; index += 16) {
const int8x16_t in0_src = vld1q_s8(input0 + index);
const int8x16_t in1_src = vld1q_s8(input1 + index);

const int16x8_t in0_s16_low = vmovl_s8(vget_low_s8(in0_src));
const int16x8_t in0_s16_high = vmovl_s8(vget_high_s8(in0_src));
const int16x8_t in1_s16_low = vmovl_s8(vget_low_s8(in1_src));
const int16x8_t in1_s16_high = vmovl_s8(vget_high_s8(in1_src));

const int16x8_t in0_zp_low = vaddq_s16(in0_s16_low, in0_zp_vec);
const int16x8_t in0_zp_high = vaddq_s16(in0_s16_high, in0_zp_vec);
const int16x8_t in1_zp_low = vaddq_s16(in1_s16_low, in1_zp_vec);
const int16x8_t in1_zp_high = vaddq_s16(in1_s16_high, in1_zp_vec);

int32x4_t in0_1 = vmovl_s16(vget_low_s16(in0_zp_low));
int32x4_t in0_2 = vmovl_s16(vget_high_s16(in0_zp_low));
int32x4_t in0_3 = vmovl_s16(vget_low_s16(in0_zp_high));
int32x4_t in0_4 = vmovl_s16(vget_high_s16(in0_zp_high));
int32x4_t in1_1 = vmovl_s16(vget_low_s16(in1_zp_low));
int32x4_t in1_2 = vmovl_s16(vget_high_s16(in1_zp_low));
int32x4_t in1_3 = vmovl_s16(vget_low_s16(in1_zp_high));
int32x4_t in1_4 = vmovl_s16(vget_high_s16(in1_zp_high));

// Apply left shift
in0_1 = vmulq_s32(in0_1, in0_left_vec);
in0_2 = vmulq_s32(in0_2, in0_left_vec);
in0_3 = vmulq_s32(in0_3, in0_left_vec);
in0_4 = vmulq_s32(in0_4, in0_left_vec);
in1_1 = vmulq_s32(in1_1, in1_left_vec);
in1_2 = vmulq_s32(in1_2, in1_left_vec);
in1_3 = vmulq_s32(in1_3, in1_left_vec);
in1_4 = vmulq_s32(in1_4, in1_left_vec);

// Apply the fixed-point part of the multiplier.
in0_1 = vqrdmulhq_n_s32(in0_1, params->in0_multiplier_);
in0_2 = vqrdmulhq_n_s32(in0_2, params->in0_multiplier_);
in0_3 = vqrdmulhq_n_s32(in0_3, params->in0_multiplier_);
in0_4 = vqrdmulhq_n_s32(in0_4, params->in0_multiplier_);
in1_1 = vqrdmulhq_n_s32(in1_1, params->in1_multiplier_);
in1_2 = vqrdmulhq_n_s32(in1_2, params->in1_multiplier_);
in1_3 = vqrdmulhq_n_s32(in1_3, params->in1_multiplier_);
in1_4 = vqrdmulhq_n_s32(in1_4, params->in1_multiplier_);

// Apply right shift
in0_1 = vqaddq_s32(in0_1, vshrq_n_s32(vandq_s32(in0_1, in0_right_vec), 31));
in0_2 = vqaddq_s32(in0_2, vshrq_n_s32(vandq_s32(in0_2, in0_right_vec), 31));
in0_3 = vqaddq_s32(in0_3, vshrq_n_s32(vandq_s32(in0_3, in0_right_vec), 31));
in0_4 = vqaddq_s32(in0_4, vshrq_n_s32(vandq_s32(in0_4, in0_right_vec), 31));
in1_1 = vqaddq_s32(in1_1, vshrq_n_s32(vandq_s32(in1_1, in1_right_vec), 31));
in1_2 = vqaddq_s32(in1_2, vshrq_n_s32(vandq_s32(in1_2, in1_right_vec), 31));
in1_3 = vqaddq_s32(in1_3, vshrq_n_s32(vandq_s32(in1_3, in1_right_vec), 31));
in1_4 = vqaddq_s32(in1_4, vshrq_n_s32(vandq_s32(in1_4, in1_right_vec), 31));


void AddInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
AddQuantParameter *para, int *index) {
int32x4_t left_shift_result0_vec = vdupq_n_s32(para->left_shift_result0_);
int32x4_t left_shift_result1_vec = vdupq_n_s32(para->left_shift_result1_);
int32x4_t input0_multiplier_vec = vdupq_n_s32(para->input0_multiplier_);
int32x4_t input1_multiplier_vec = vdupq_n_s32(para->input1_multiplier_);
int32x4_t output_multiplier_vec = vdupq_n_s32(para->output_multiplier_);
int32x4_t left_shift_out_vec = vdupq_n_s32((1 << para->left_shift_out_));
int32x4_t right_shift0_vec = vdupq_n_s32(-para->right_shift0_);
int32x4_t right_shift1_vec = vdupq_n_s32(-para->right_shift1_);

for (; (*index) <= real_dst_count - 8; (*index) += 8) {
int16x8_t input0_val = LoadAndAddOffset(input0_data, *index, para->input0_offset_);
int16x8_t input1_val = LoadAndAddOffset(input1_data, *index, para->input1_offset_);

int32x4_t input0_low = vmovl_s16(vget_low_s16(input0_val));
int32x4_t input0_high = vmovl_s16(vget_high_s16(input0_val));
int32x4_t input1_low = vmovl_s16(vget_low_s16(input1_val));
int32x4_t input1_high = vmovl_s16(vget_high_s16(input1_val));

int32x4_t scaled_input0_low =
ClacScaledInput(input0_low, left_shift_result0_vec, input0_multiplier_vec, right_shift0_vec);
int32x4_t scaled_input0_high =
ClacScaledInput(input0_high, left_shift_result0_vec, input0_multiplier_vec, right_shift0_vec);
int32x4_t scaled_input1_low =
ClacScaledInput(input1_low, left_shift_result1_vec, input1_multiplier_vec, right_shift1_vec);
int32x4_t scaled_input1_high =
ClacScaledInput(input1_high, left_shift_result1_vec, input1_multiplier_vec, right_shift1_vec);

int16x4_t sum_low =
AddClacSumHalfWord(scaled_input0_low, scaled_input1_low, left_shift_out_vec, output_multiplier_vec, para);
int16x4_t sum_high =
AddClacSumHalfWord(scaled_input0_high, scaled_input1_high, left_shift_out_vec, output_multiplier_vec, para);

int16x8_t res_s16 = vcombine_s16(sum_low, sum_high);
int8x8_t res_u8_n0 = vqmovn_s16(res_s16);
vst1_s8(output_data + *index, res_u8_n0);
in0_1 = vrshlq_s32(in0_1, in0_right_vec);
in0_2 = vrshlq_s32(in0_2, in0_right_vec);
in0_3 = vrshlq_s32(in0_3, in0_right_vec);
in0_4 = vrshlq_s32(in0_4, in0_right_vec);
in1_1 = vrshlq_s32(in1_1, in1_right_vec);
in1_2 = vrshlq_s32(in1_2, in1_right_vec);
in1_3 = vrshlq_s32(in1_3, in1_right_vec);
in1_4 = vrshlq_s32(in1_4, in1_right_vec);

/* calculate output */
int32x4_t out1 = vaddq_s32(in0_1, in1_1);
int32x4_t out2 = vaddq_s32(in0_2, in1_2);
int32x4_t out3 = vaddq_s32(in0_3, in1_3);
int32x4_t out4 = vaddq_s32(in0_4, in1_4);

// Apply left shift
out1 = vshlq_s32(out1, out_left_vec);
out2 = vshlq_s32(out2, out_left_vec);
out3 = vshlq_s32(out3, out_left_vec);
out4 = vshlq_s32(out4, out_left_vec);

// Apply the fixed-point part of the multiplier.
out1 = vqrdmulhq_n_s32(out1, params->out_multiplier_);
out2 = vqrdmulhq_n_s32(out2, params->out_multiplier_);
out3 = vqrdmulhq_n_s32(out3, params->out_multiplier_);
out4 = vqrdmulhq_n_s32(out4, params->out_multiplier_);

// Apply right shift
out1 = vqaddq_s32(out1, vshrq_n_s32(vandq_s32(out1, out_right_vec), 31));
out2 = vqaddq_s32(out2, vshrq_n_s32(vandq_s32(out2, out_right_vec), 31));
out3 = vqaddq_s32(out3, vshrq_n_s32(vandq_s32(out3, out_right_vec), 31));
out4 = vqaddq_s32(out4, vshrq_n_s32(vandq_s32(out4, out_right_vec), 31));

out1 = vrshlq_s32(out1, out_right_vec);
out2 = vrshlq_s32(out2, out_right_vec);
out3 = vrshlq_s32(out3, out_right_vec);
out4 = vrshlq_s32(out4, out_right_vec);

const int16x4_t out1_s16 = vmovn_s32(out1);
const int16x4_t out2_s16 = vmovn_s32(out2);
const int16x4_t out3_s16 = vmovn_s32(out3);
const int16x4_t out4_s16 = vmovn_s32(out4);

const int16x8_t out_s16_1 = vaddq_s16(vcombine_s16(out1_s16, out2_s16), out_zp_vec);
const int16x8_t out_s16_2 = vaddq_s16(vcombine_s16(out3_s16, out4_s16), out_zp_vec);

const int8x16_t out = vcombine_s8(vqmovn_s16(out_s16_1), vqmovn_s16(out_s16_2));
const int8x16_t int8_out = vmaxq_s8(min_vec, vminq_s8(max_vac, out));

vst1q_s8(output + index, int8_out);
} }
}
#endif #endif


void AddInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
AddQuantParameter *para) {
int index = 0;
#ifdef ENABLE_NEON
AddInt8NEON(input0_data, input1_data, output_data, real_dst_count, para, &index);
#endif
for (; index < real_dst_count; ++index) {
const int32_t input0_val = para->input0_offset_ + input0_data[index];
const int32_t input1_val = para->input1_offset_ + input1_data[index];
const int32_t shifted_input0_val = input0_val * para->left_shift_result0_;
const int32_t shifted_input1_val = input1_val * para->left_shift_result1_;
const int32_t scaled_input0_val = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(shifted_input0_val, para->input0_multiplier_), para->right_shift0_);
const int32_t scaled_input1_val = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(shifted_input1_val, para->input1_multiplier_), para->right_shift1_);

const int32_t raw_sum = scaled_input0_val + scaled_input1_val;
const int32_t raw_output =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(raw_sum * (1 << (unsigned int)para->left_shift_out_),
para->output_multiplier_),
para->right_shift_out_) +
para->output_offset_;

output_data[index] = (int8_t)MSMAX(para->output_activation_min_, MSMIN(raw_output, para->output_activation_max_));
for (; index < size; index++) {
const int32_t in0_left = (input0[index] + params->in0_zp_) * in0_left_shift;
const int32_t in1_left = (input1[index] + params->in1_zp_) * in1_left_shift;
const int32_t in0 = MultiplyByMultiplierAndRightShift(in0_left, params->in0_multiplier_, params->in0_right_shift_);
const int32_t in1 = MultiplyByMultiplierAndRightShift(in1_left, params->in1_multiplier_, params->in1_right_shift_);

int32_t out = MultiplyByQuantizedMultiplier(in0 + in1, params->out_multiplier_, params->out_left_shift_,
-params->out_right_shift_);
out += params->out_zp_;
output[index] = (int8_t)MSMAX(params->min_, MSMIN(out, params->max_));
} }
return; return;
} }

+ 21
- 29
mindspore/lite/nnacl/int8/add_int8.h View File

@@ -20,43 +20,35 @@
#include "nnacl/op_base.h" #include "nnacl/op_base.h"


typedef struct AddQuantParameter { typedef struct AddQuantParameter {
int input0_offset_;
int input1_offset_;
int output_offset_;
float input0_scale_;
float input1_scale_;
float output_scale_;
int input0_multiplier_;
int input1_multiplier_;
int output_multiplier_;
int input0_shift_;
int input1_shift_;
int output_shift_;
int output_activation_min_;
int output_activation_max_;
int left_shift_result0_;
int left_shift_result1_;
int right_shift0_;
int right_shift1_;
int left_shift_out_;
int right_shift_out_;
int left_shift_;
int32_t min_;
int32_t max_;

int32_t in0_zp_;
int32_t in1_zp_;
int32_t out_zp_;

int32_t in0_left_shift_;
int32_t in0_right_shift_;
int32_t in0_multiplier_;

int32_t in1_left_shift_;
int32_t in1_right_shift_;
int32_t in1_multiplier_;

int32_t out_left_shift_;
int32_t out_right_shift_;
int32_t out_multiplier_;
} AddQuantParameter; } AddQuantParameter;


#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif


void AddInt8(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
AddQuantParameter *para);
void AddInt8(const int8_t *input0, const int8_t *input1, int8_t *output, int size, AddQuantParameter *params);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif


#ifdef ENABLE_NEON
#include <arm_neon.h>
int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset);
int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
int32x4_t right_shift_vec);
#endif

#endif // MINDSPORE_LITE_NNACL_ADD_INT8_H_ #endif // MINDSPORE_LITE_NNACL_ADD_INT8_H_

+ 1
- 1
mindspore/lite/nnacl/int8/arithmetic_self_int8.c View File

@@ -18,7 +18,7 @@
#include "nnacl/int8/arithmetic_self_int8.h" #include "nnacl/int8/arithmetic_self_int8.h"
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
#include <arm_neon.h> #include <arm_neon.h>
#include "nnacl/int8/add_int8.h"
#include "nnacl/int8/common_func_int8.h"
#endif #endif
#include "nnacl/quantization/fixed_point.h" #include "nnacl/quantization/fixed_point.h"




+ 16
- 0
mindspore/lite/nnacl/int8/common_func_int8.c View File

@@ -56,3 +56,19 @@ void PostFuncInt8C4(const int32_t *in, const int32_t *bias, int8_t *out, size_t
#endif #endif
return; return;
} }

#ifdef ENABLE_ARM
int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset) {
int8x8_t input_s8 = vld1_s8(data + index);
int16x8_t input_s16 = vmovl_s8(input_s8);
return vaddq_s16(input_s16, vdupq_n_s16(offset));
}

int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
int32x4_t right_shift_vec) {
int32x4_t shifted_input = vmulq_s32(input, left_shift_result_vec);
shifted_input = vqrdmulhq_s32(shifted_input, input_multiplier_vec);
const int32x4_t fixup = vshrq_n_s32(vandq_s32(shifted_input, right_shift_vec), 31);
return vrshlq_s32(vqaddq_s32(shifted_input, fixup), right_shift_vec);
}
#endif

+ 6
- 0
mindspore/lite/nnacl/int8/common_func_int8.h View File

@@ -20,6 +20,9 @@
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#include "nnacl/op_base.h" #include "nnacl/op_base.h"
#include "nnacl/conv_parameter.h" #include "nnacl/conv_parameter.h"


@@ -50,6 +53,9 @@ void ConvDwInt8Center(int8_t *dst, const int8_t *src, const int16_t *weight, con
void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums, void DeconvDwInt8Post(int8_t *dst, int32_t *output_buffer, const int32_t *bias, int block_channel, int pixel_nums,
int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min, int out_multiplier, int left_shift, int right_shift, int32_t out_zp, int32_t acc_min,
int32_t acc_max); int32_t acc_max);
int16x8_t LoadAndAddOffset(int8_t *data, int index, int offset);
int32x4_t ClacScaledInput(int32x4_t input, int32x4_t left_shift_result_vec, int32x4_t input_multiplier_vec,
int32x4_t right_shift_vec);
#endif #endif


#ifdef ENABLE_ARM32 #ifdef ENABLE_ARM32


+ 1
- 1
mindspore/lite/nnacl/int8/mul_int8.c View File

@@ -18,7 +18,7 @@
#include "nnacl/mul_parameter.h" #include "nnacl/mul_parameter.h"
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
#include <arm_neon.h> #include <arm_neon.h>
#include "nnacl/int8/add_int8.h"
#include "nnacl/int8/common_func_int8.h"
#endif #endif
#include "nnacl/quantization/fixed_point.h" #include "nnacl/quantization/fixed_point.h"




+ 1
- 1
mindspore/lite/nnacl/int8/sub_int8.c View File

@@ -17,7 +17,7 @@
#include "nnacl/int8/sub_int8.h" #include "nnacl/int8/sub_int8.h"
#ifdef ENABLE_NEON #ifdef ENABLE_NEON
#include <arm_neon.h> #include <arm_neon.h>
#include "nnacl/int8/add_int8.h"
#include "nnacl/int8/common_func_int8.h"
#endif #endif
#include "nnacl/quantization/fixed_point.h" #include "nnacl/quantization/fixed_point.h"




+ 4
- 0
mindspore/lite/nnacl/quantization/fixed_point.c View File

@@ -54,6 +54,10 @@ int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t lef
return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift); return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value * (1 << left_shift), multiplier), -right_shift);
} }


int MultiplyByMultiplierAndRightShift(int32_t value, int32_t multiplier, int32_t right_shift) {
return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(value, multiplier), right_shift);
}

int FractionsBits(int integer_bits) { return 8 * sizeof(int32_t) - 1 - integer_bits; } int FractionsBits(int integer_bits) { return 8 * sizeof(int32_t) - 1 - integer_bits; }


int FixedPoint_One(int integer_bits, int fractions_bits) { int FixedPoint_One(int integer_bits, int fractions_bits) {


+ 2
- 0
mindspore/lite/nnacl/quantization/fixed_point.h View File

@@ -42,6 +42,8 @@ int RoundingDivideByPOT(int x, int exponent);


int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift); int MultiplyByQuantizedMultiplier(int32_t value, int32_t multiplier, int32_t left_shift, int32_t right_shift);


int MultiplyByMultiplierAndRightShift(int32_t value, int32_t multiplier, int32_t right_shift);

int SaturatingRoundingMultiplyByPOT(int32_t x, int exponent); int SaturatingRoundingMultiplyByPOT(int32_t x, int exponent);


int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst); int32_t Rescale(int x, int kIntegerBitsSrc, int kIntegerBitsDst);


+ 85
- 87
mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc View File

@@ -22,6 +22,7 @@
#include "src/runtime/runtime_api.h" #include "src/runtime/runtime_api.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "include/errorcode.h" #include "include/errorcode.h"
#include "src/common/file_utils.h"


using mindspore::lite::KernelRegistrar; using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR; using mindspore::lite::RET_ERROR;
@@ -30,111 +31,108 @@ using mindspore::schema::PrimitiveType_Add;


namespace mindspore::kernel { namespace mindspore::kernel {
int QuantizedAddCPUKernel::Init() { int QuantizedAddCPUKernel::Init() {
lite::Tensor *input0 = in_tensors_.at(0);
lite::Tensor *input1 = in_tensors_.at(1);
lite::Tensor *output = out_tensors_.at(0);
MS_ASSERT(input0);
MS_ASSERT(input1);
MS_ASSERT(output);

para_.input0_scale_ = input0->GetQuantParams().front().scale;
para_.input0_offset_ = input0->GetQuantParams().front().zeroPoint * -1;
para_.input1_scale_ = input1->GetQuantParams().front().scale;
para_.input1_offset_ = input1->GetQuantParams().front().zeroPoint * -1;
para_.output_scale_ = output->GetQuantParams().front().scale;
para_.output_offset_ = output->GetQuantParams().front().zeroPoint;

const int left_shift = 20; // 1 << 20, 2/20
const double twice_max_input_scale = 2 * std::max(para_.input0_scale_, para_.input1_scale_);
const double real_input0_multiplier = para_.input0_scale_ / twice_max_input_scale;
const double real_input1_multiplier = para_.input1_scale_ / twice_max_input_scale;
const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * para_.output_scale_);

QuantizeMultiplierSmallerThanOne(real_input0_multiplier, &para_.input0_multiplier_, &para_.input0_shift_);
QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &para_.input1_multiplier_, &para_.input1_shift_);
QuantizeMultiplierSmallerThanOne(real_output_multiplier, &para_.output_multiplier_, &para_.output_shift_);

switch (arith_para_->activation_type_) {
case schema::ActivationType_RELU:
para_.output_activation_min_ = 0;
para_.output_activation_max_ = std::numeric_limits<int8_t>::max();
break;
case schema::ActivationType_RELU6:
para_.output_activation_min_ = 0;
para_.output_activation_max_ = 6;
break;
case schema::ActivationType_NO_ACTIVATION:
para_.output_activation_min_ = std::numeric_limits<int8_t>::min();
para_.output_activation_max_ = std::numeric_limits<int8_t>::max();
break;
default:
MS_LOG(ERROR) << "Add does not support activation type " << arith_para_->activation_type_;
return RET_ERROR;
}

int left_shift0 = -para_.input0_shift_ > 0 ? -para_.input0_shift_ : 0;
para_.right_shift0_ = -para_.input0_shift_ > 0 ? 0 : para_.input0_shift_;

int left_shift1 = -para_.input1_shift_ > 0 ? -para_.input1_shift_ : 0;
para_.right_shift1_ = -para_.input1_shift_ > 0 ? 0 : para_.input1_shift_;
auto *input0 = in_tensors_.at(0);
auto *input1 = in_tensors_.at(1);
auto *output = out_tensors_.at(0);
auto act = arith_para_->activation_type_;


para_.left_shift_out_ = -para_.output_shift_ > 0 ? -para_.output_shift_ : 0;
para_.right_shift_out_ = -para_.output_shift_ > 0 ? 0 : para_.output_shift_;
para_.in0_zp_ = input0->GetQuantParams().front().zeroPoint * -1;
para_.in1_zp_ = input1->GetQuantParams().front().zeroPoint * -1;
para_.out_zp_ = output->GetQuantParams().front().zeroPoint;


para_.left_shift_result0_ = (1 << left_shift) * ((1 << left_shift0));
para_.left_shift_result1_ = (1 << left_shift) * ((1 << left_shift1));
const double in0_scale = input0->GetQuantParams().front().scale;
const double in1_scale = input1->GetQuantParams().front().scale;
const double out_scale = output->GetQuantParams().front().scale;


MS_ASSERT(left_shift + left_shift0 == left_shift);
MS_ASSERT(left_shift + left_shift1 == left_shift);
return 0;
}
para_.left_shift_ = 20;
const double twice_max_input_scale = 2 * std::max(in0_scale, in1_scale);
const double in0_multiplier = in0_scale / twice_max_input_scale;
const double in1_multiplier = in1_scale / twice_max_input_scale;
const double out_multiplier = twice_max_input_scale / ((1 << para_.left_shift_) * out_scale);


int QuantizedAddCPUKernel::ReSize() { return 0; }
QuantizeMultiplierSmallerThanOne(in0_multiplier, &para_.in0_multiplier_, &para_.in0_left_shift_);
QuantizeMultiplierSmallerThanOne(in1_multiplier, &para_.in1_multiplier_, &para_.in1_left_shift_);
QuantizeMultiplierSmallerThanOne(out_multiplier, &para_.out_multiplier_, &para_.out_left_shift_);


int QuantizedAddCPUKernel::Run() {
input0_data_ = static_cast<int8_t *>(in_tensors_.at(0)->MutableData());
input1_data_ = static_cast<int8_t *>(in_tensors_.at(1)->MutableData());
output_data_ = static_cast<int8_t *>(out_tensors_.at(0)->MutableData());
para_.in0_right_shift_ = -para_.in0_left_shift_ > 0 ? 0 : para_.in0_left_shift_;
para_.in1_right_shift_ = -para_.in1_left_shift_ > 0 ? 0 : para_.in1_left_shift_;
para_.out_right_shift_ = -para_.out_left_shift_ > 0 ? 0 : para_.out_left_shift_;


elements_num_ = out_tensors_.at(0)->ElementsNum();
count_unit_ = thread_count_ > 1 ? UP_DIV(elements_num_, thread_count_) : elements_num_;
para_.in0_left_shift_ = -para_.in0_left_shift_ > 0 ? -para_.in0_left_shift_ : 0;
para_.in1_left_shift_ = -para_.in1_left_shift_ > 0 ? -para_.in1_left_shift_ : 0;
para_.out_left_shift_ = -para_.out_left_shift_ > 0 ? -para_.out_left_shift_ : 0;


if (in_tensors_.at(0)->ElementsNum() != in_tensors_.at(1)->ElementsNum()) {
input0_data_ = static_cast<int8_t *>(ctx_->allocator->Malloc(out_tensors_.at(0)->Size()));
input1_data_ = static_cast<int8_t *>(ctx_->allocator->Malloc(out_tensors_.at(0)->Size()));
if (!input0_data_ || !input1_data_) {
MS_LOG(ERROR) << "malloc input0_data_ || input1_data_ failed.";
return RET_ERROR;
}
CalculateActivationRangeQuantized(act == ActType_Relu, act == ActType_Relu6, 0, 1, &para_.min_, &para_.max_);


TileDimensionsUint8(static_cast<uint8_t *>(in_tensors_.at(0)->MutableData()),
static_cast<uint8_t *>(in_tensors_.at(1)->MutableData()),
reinterpret_cast<uint8_t *>(input0_data_), reinterpret_cast<uint8_t *>(input1_data_),
arith_para_);
auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_);
ctx_->allocator->Free(input0_data_);
ctx_->allocator->Free(input1_data_);
return ret;
if (!InferShapeDone()) {
return RET_OK;
} }
return ReSize();
}


auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_);
return ret;
int QuantizedAddCPUKernel::ReSize() {
elements_num_ = out_tensors_.at(0)->ElementsNum();
arith_para_->broadcasting_ = in_tensors_.at(0)->ElementsNum() != in_tensors_.at(1)->ElementsNum();

thread_count_ = MSMIN(elements_num_, op_parameter_->thread_num_);
thread_stride_ = UP_DIV(elements_num_, thread_count_);
return RET_OK;
} }


int AddInt8Run(void *cdata, int task_id) { int AddInt8Run(void *cdata, int task_id) {
auto add = reinterpret_cast<QuantizedAddCPUKernel *>(cdata); auto add = reinterpret_cast<QuantizedAddCPUKernel *>(cdata);
add->DoExecute(task_id); add->DoExecute(task_id);
return lite::RET_OK;
return RET_OK;
} }


int QuantizedAddCPUKernel::DoExecute(int tId) {
int64_t real_dst_count = MSMIN(elements_num_ - tId * count_unit_, count_unit_);
int8_t *cur_input0_data = input0_data_ + tId * count_unit_;
int8_t *cur_input1_data = input1_data_ + tId * count_unit_;
int8_t *cur_output_data = output_data_ + tId * count_unit_;
int QuantizedAddCPUKernel::DoExecute(int task_id) {
int rest_count = elements_num_ - task_id * thread_stride_;
int real_count = MSMIN(thread_stride_, rest_count);
if (real_count <= 0) {
return RET_OK;
}


AddInt8(cur_input0_data, cur_input1_data, cur_output_data, real_dst_count, &para_);
return lite::RET_OK;
int8_t *cur_input0_data = input0_data_ + task_id * thread_stride_;
int8_t *cur_input1_data = input1_data_ + task_id * thread_stride_;
int8_t *cur_output_data = output_data_ + task_id * thread_stride_;

AddInt8(cur_input0_data, cur_input1_data, cur_output_data, real_count, &para_);
return RET_OK;
}

int QuantizedAddCPUKernel::Run() {
int8_t *src_in0 = static_cast<int8_t *>(in_tensors_.at(0)->data_c());
int8_t *src_in1 = static_cast<int8_t *>(in_tensors_.at(1)->data_c());
output_data_ = static_cast<int8_t *>(out_tensors_.at(0)->data_c());

if (arith_para_->broadcasting_) {
input0_data_ = static_cast<int8_t *>(context_->allocator->Malloc(elements_num_ * sizeof(int8_t)));
if (input0_data_ == nullptr) {
MS_LOG(ERROR) << "malloc input0_data_ failed.";
return RET_ERROR;
}
input1_data_ = static_cast<int8_t *>(context_->allocator->Malloc(elements_num_ * sizeof(int8_t)));
if (input1_data_ == nullptr) {
context_->allocator->Free(input0_data_);
input0_data_ = nullptr;
MS_LOG(ERROR) << "malloc input1_data_ failed.";
return RET_ERROR;
}

TileDimensionsInt8(src_in0, src_in1, input0_data_, input1_data_, arith_para_);
auto ret = ParallelLaunch(context_->thread_pool_, AddInt8Run, this, thread_count_);

context_->allocator->Free(input0_data_);
context_->allocator->Free(input1_data_);
input0_data_ = nullptr;
input1_data_ = nullptr;
return ret;
}

input0_data_ = src_in0;
input1_data_ = src_in1;
auto ret = ParallelLaunch(this->context_->thread_pool_, AddInt8Run, this, thread_count_);
return ret;
} }


kernel::LiteKernel *CpuAddInt8KernelCreator(const std::vector<lite::Tensor *> &inputs, kernel::LiteKernel *CpuAddInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,


+ 3
- 4
mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h View File

@@ -28,7 +28,7 @@ class QuantizedAddCPUKernel : public LiteKernel {
explicit QuantizedAddCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, explicit QuantizedAddCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive), ctx_(ctx), thread_count_(ctx_->thread_num_) {
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {
arith_para_ = reinterpret_cast<ArithmeticParameter *>(parameter); arith_para_ = reinterpret_cast<ArithmeticParameter *>(parameter);
} }
~QuantizedAddCPUKernel() override {} ~QuantizedAddCPUKernel() override {}
@@ -39,12 +39,11 @@ class QuantizedAddCPUKernel : public LiteKernel {
int DoExecute(int tId); int DoExecute(int tId);


private: private:
const lite::InnerContext *ctx_;
AddQuantParameter para_; AddQuantParameter para_;
ArithmeticParameter *arith_para_; ArithmeticParameter *arith_para_;
int thread_count_; int thread_count_;
int64_t elements_num_;
int64_t count_unit_;
int thread_stride_;
int elements_num_;
int8_t *input0_data_ = nullptr; int8_t *input0_data_ = nullptr;
int8_t *input1_data_ = nullptr; int8_t *input1_data_ = nullptr;
int8_t *output_data_ = nullptr; int8_t *output_data_ = nullptr;


+ 5
- 2
mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.cc View File

@@ -132,11 +132,14 @@ int TransposeInt8CPUKernel::Run() {
auto in_tensor = in_tensors_.front(); auto in_tensor = in_tensors_.front();
auto out_tensor = out_tensors_.front(); auto out_tensor = out_tensors_.front();


auto in_dims = in_tensor->shape();
auto out_dims = out_tensor->shape();

in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c()); in_ptr_ = reinterpret_cast<int8_t *>(in_tensor->data_c());
out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c()); out_ptr_ = reinterpret_cast<int8_t *>(out_tensor->data_c());


in_shape_ = in_tensor->shape().data();
out_shape_ = out_tensor->shape().data();
in_shape_ = in_dims.data();
out_shape_ = out_dims.data();


int ret = MallocTmpBuf(); int ret = MallocTmpBuf();
if (ret != RET_OK) { if (ret != RET_OK) {


+ 1
- 0
mindspore/lite/tools/common/node_util.cc View File

@@ -75,6 +75,7 @@ static const std::vector<schema::PrimitiveType> int8OpList = {schema::PrimitiveT
schema::PrimitiveType_Conv2D, schema::PrimitiveType_Conv2D,
schema::PrimitiveType_DepthwiseConv2D, schema::PrimitiveType_DepthwiseConv2D,
schema::PrimitiveType_Add, schema::PrimitiveType_Add,
schema::PrimitiveType_Transpose,
schema::PrimitiveType_Pooling, schema::PrimitiveType_Pooling,
schema::PrimitiveType_Concat, schema::PrimitiveType_Concat,
schema::PrimitiveType_SoftMax, schema::PrimitiveType_SoftMax,


+ 1
- 0
mindspore/lite/tools/converter/quantizer/quantize_util.cc View File

@@ -108,6 +108,7 @@ bool QuantStrategy::CanOpPostQuantized(AnfNodePtr &node) const {
schema::PrimitiveType_DeDepthwiseConv2D, schema::PrimitiveType_DeDepthwiseConv2D,
schema::PrimitiveType_DeConv2D, schema::PrimitiveType_DeConv2D,
schema::PrimitiveType_Activation, schema::PrimitiveType_Activation,
schema::PrimitiveType_Transpose,
schema::PrimitiveType_Eltwise, schema::PrimitiveType_Eltwise,
}; };
bool contain = IsContain(int8OpList, type); bool contain = IsContain(int8OpList, type);


Loading…
Cancel
Save