Browse Source

!15716 [MSLITE][Develop] clean code in lite op runtime module

From: @yangruoqi713
Reviewed-by: @zhang_xue_tong,@zhanghaibo5
Signed-off-by: @zhang_xue_tong
pull/15716/MERGE
mindspore-ci-bot Gitee 4 years ago
parent
commit
d5b106799d
46 changed files with 474 additions and 286 deletions
  1. +47
    -44
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/mul_int8.c
  2. +2
    -2
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/mul_int8.h
  3. +5
    -5
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/softmax_int8.c
  4. +1
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/softmax_int8.h
  5. +0
    -1
      mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/mul_parameter.h
  6. +1
    -1
      mindspore/lite/src/runtime/allocator.h
  7. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/base/constant_of_shape.h
  8. +13
    -6
      mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc
  9. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/base/tile_base.h
  10. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp16/group_convolution_fp16.h
  11. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/batch_to_space_fp32.h
  12. +17
    -7
      mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc
  13. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.h
  14. +11
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
  15. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
  16. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
  17. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h
  18. +1
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.h
  19. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h
  20. +1
    -0
      mindspore/lite/src/runtime/kernel/arm/fp32/uniform_real_fp32.cc
  21. +36
    -24
      mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc
  22. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h
  23. +30
    -9
      mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.cc
  24. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h
  25. +28
    -8
      mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.cc
  26. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.h
  27. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h
  28. +27
    -6
      mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.cc
  29. +3
    -3
      mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.h
  30. +24
    -13
      mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc
  31. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/div_int8.h
  32. +1
    -0
      mindspore/lite/src/runtime/kernel/arm/int8/group_convolution_int8.h
  33. +17
    -5
      mindspore/lite/src/runtime/kernel/arm/int8/l2_norm_int8.cc
  34. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/l2_norm_int8.h
  35. +15
    -6
      mindspore/lite/src/runtime/kernel/arm/int8/layer_norm_int8.cc
  36. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/int8/layer_norm_int8.h
  37. +58
    -49
      mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc
  38. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h
  39. +28
    -18
      mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc
  40. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h
  41. +22
    -11
      mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc
  42. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.h
  43. +40
    -25
      mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc
  44. +2
    -2
      mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.h
  45. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.h
  46. +8
    -2
      mindspore/lite/src/sub_graph_split.cc

+ 47
- 44
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/mul_int8.c View File

@@ -28,19 +28,19 @@ int16x4_t ClacSumHalfWordMul(int16x4_t scaled_input0, int16x4_t scaled_input1, i
}

void MulInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
MulQuantArg para, int *index) {
int32x4_t output_multiplier_vec = vdupq_n_s32(para.output_multiplier_);
int32x4_t left_shift_out_vec = vdupq_n_s32(1 << para.shift_left_);
int32x4_t right_shift_out_vec = vdupq_n_s32(-para.shift_right_);
int16x8_t out_zp_vec = vdupq_n_s16(para.out_quant_arg_.zp_);
int8x16_t out_min_vec = vdupq_n_s8(para.output_activation_min_);
int8x16_t out_max_vec = vdupq_n_s8(para.output_activation_max_);
int8x8_t out_min_vec_s8 = vdup_n_s8(para.output_activation_min_);
int8x8_t out_max_vec_s8 = vdup_n_s8(para.output_activation_max_);
MulQuantArg *quant_arg, int *index) {
int32x4_t output_multiplier_vec = vdupq_n_s32(quant_arg->output_multiplier_);
int32x4_t left_shift_out_vec = vdupq_n_s32(1 << quant_arg->shift_left_);
int32x4_t right_shift_out_vec = vdupq_n_s32(-quant_arg->shift_right_);
int16x8_t out_zp_vec = vdupq_n_s16(quant_arg->out_quant_arg_.zp_);
int8x16_t out_min_vec = vdupq_n_s8(quant_arg->output_activation_min_);
int8x16_t out_max_vec = vdupq_n_s8(quant_arg->output_activation_max_);
int8x8_t out_min_vec_s8 = vdup_n_s8(quant_arg->output_activation_min_);
int8x8_t out_max_vec_s8 = vdup_n_s8(quant_arg->output_activation_max_);

for (; (*index) <= real_dst_count - 16; (*index) += 16) {
int16x8_t zp1_vec = vdupq_n_s16(para.in_quant_args_[0].zp_);
int16x8_t zp2_vec = vdupq_n_s16(para.in_quant_args_[1].zp_);
int16x8_t zp1_vec = vdupq_n_s16(quant_arg->in_quant_args_[0].zp_);
int16x8_t zp2_vec = vdupq_n_s16(quant_arg->in_quant_args_[1].zp_);
int8x16_t input0_vec = vld1q_s8(input0_data + *index);
int8x16_t input1_vec = vld1q_s8(input1_data + *index);
int16x8_t input0_low = vmovl_s8(vget_low_s8(input0_vec));
@@ -81,8 +81,8 @@ void MulInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data,
output_data += 16;
}
for (; (*index) <= real_dst_count - 8; (*index) += 8) {
int16x8_t input0_val = LoadAndAddOffset(input0_data, *index, para.in_quant_args_[0].zp_);
int16x8_t input1_val = LoadAndAddOffset(input1_data, *index, para.in_quant_args_[1].zp_);
int16x8_t input0_val = LoadAndAddOffset(input0_data, *index, quant_arg->in_quant_args_[0].zp_);
int16x8_t input1_val = LoadAndAddOffset(input1_data, *index, quant_arg->in_quant_args_[1].zp_);

int16x4_t input0_low = vget_low_s16(input0_val);
int16x4_t input0_high = vget_high_s16(input0_val);
@@ -105,23 +105,23 @@ void MulInt8NEON(int8_t *input0_data, int8_t *input1_data, int8_t *output_data,
#endif

void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int depth, int64_t real_dst_count,
bool input1_broad, MulQuantArg para) {
bool input1_broad, MulQuantArg *quant_arg) {
// input0 need broadcast
int32_t zp1 = para.in_quant_args_[0].zp_;
int32_t zp2 = para.in_quant_args_[1].zp_;
int32_t zp1 = quant_arg->in_quant_args_[0].zp_;
int32_t zp2 = quant_arg->in_quant_args_[1].zp_;
if (input1_broad) {
zp1 = para.in_quant_args_[1].zp_;
zp2 = para.in_quant_args_[0].zp_;
zp1 = quant_arg->in_quant_args_[1].zp_;
zp2 = quant_arg->in_quant_args_[0].zp_;
}
#ifdef ENABLE_ARM
int32x4_t output_multiplier_vec = vdupq_n_s32(para.output_multiplier_);
int32x4_t left_shift_out_vec = vdupq_n_s32(1 << para.shift_left_);
int32x4_t right_shift_out_vec = vdupq_n_s32(-para.shift_right_);
int16x8_t out_zp_vec = vdupq_n_s16(para.out_quant_arg_.zp_);
int8x16_t out_min_vec = vdupq_n_s8(para.output_activation_min_);
int8x16_t out_max_vec = vdupq_n_s8(para.output_activation_max_);
int8x8_t out_min_vec_s8 = vdup_n_s8(para.output_activation_min_);
int8x8_t out_max_vec_s8 = vdup_n_s8(para.output_activation_max_);
int32x4_t output_multiplier_vec = vdupq_n_s32(quant_arg->output_multiplier_);
int32x4_t left_shift_out_vec = vdupq_n_s32(1 << quant_arg->shift_left_);
int32x4_t right_shift_out_vec = vdupq_n_s32(-quant_arg->shift_right_);
int16x8_t out_zp_vec = vdupq_n_s16(quant_arg->out_quant_arg_.zp_);
int8x16_t out_min_vec = vdupq_n_s8(quant_arg->output_activation_min_);
int8x16_t out_max_vec = vdupq_n_s8(quant_arg->output_activation_max_);
int8x8_t out_min_vec_s8 = vdup_n_s8(quant_arg->output_activation_min_);
int8x8_t out_max_vec_s8 = vdup_n_s8(quant_arg->output_activation_max_);
int16x8_t zp1_vec = vdupq_n_s16(zp1);
int16x8_t zp2_vec = vdupq_n_s16(zp2);
#endif
@@ -199,13 +199,14 @@ void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int
for (; j < depth; ++j) {
const int32_t input0_val = zp1 + input0_data[j];
const int32_t input1_val = zp2 + input1_data[0];
int32_t mul_result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << para.shift_left_), para.output_multiplier_),
para.shift_right_);

mul_result += para.out_quant_arg_.zp_;
mul_result = mul_result < para.output_activation_max_ ? mul_result : para.output_activation_max_;
mul_result = mul_result > para.output_activation_min_ ? mul_result : para.output_activation_min_;
int32_t mul_result =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << quant_arg->shift_left_),
quant_arg->output_multiplier_),
quant_arg->shift_right_);

mul_result += quant_arg->out_quant_arg_.zp_;
mul_result = mul_result < quant_arg->output_activation_max_ ? mul_result : quant_arg->output_activation_max_;
mul_result = mul_result > quant_arg->output_activation_min_ ? mul_result : quant_arg->output_activation_min_;
output_data[0] = (int8_t)mul_result;
input1_data++;
output_data++;
@@ -214,21 +215,23 @@ void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int
return;
}

void Mul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, MulQuantArg para) {
void Mul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count,
MulQuantArg *quant_arg) {
int index = 0;
#ifdef ENABLE_NEON
MulInt8NEON(input0_data, input1_data, output_data, real_dst_count, para, &index);
MulInt8NEON(input0_data, input1_data, output_data, real_dst_count, quant_arg, &index);
#endif
for (; index < real_dst_count; ++index) {
const int32_t input0_val = para.in_quant_args_[0].zp_ + input0_data[index];
const int32_t input1_val = para.in_quant_args_[1].zp_ + input1_data[index];
int32_t mul_result = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << para.shift_left_), para.output_multiplier_),
para.shift_right_);

mul_result += para.out_quant_arg_.zp_;
mul_result = mul_result < para.output_activation_max_ ? mul_result : para.output_activation_max_;
mul_result = mul_result > para.output_activation_min_ ? mul_result : para.output_activation_min_;
const int32_t input0_val = quant_arg->in_quant_args_[0].zp_ + input0_data[index];
const int32_t input1_val = quant_arg->in_quant_args_[1].zp_ + input1_data[index];
int32_t mul_result =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(input0_val * input1_val * (1 << quant_arg->shift_left_),
quant_arg->output_multiplier_),
quant_arg->shift_right_);

mul_result += quant_arg->out_quant_arg_.zp_;
mul_result = mul_result < quant_arg->output_activation_max_ ? mul_result : quant_arg->output_activation_max_;
mul_result = mul_result > quant_arg->output_activation_min_ ? mul_result : quant_arg->output_activation_min_;
output_data[index] = (int8_t)mul_result;
}
return;


+ 2
- 2
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/mul_int8.h View File

@@ -28,9 +28,9 @@
#ifdef __cplusplus
extern "C" {
#endif
void Mul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, MulQuantArg para);
void Mul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int64_t real_dst_count, MulQuantArg *quant_arg);
void FastMul(int8_t *input0_data, int8_t *input1_data, int8_t *output_data, int depth, int64_t real_dst_count,
bool input1_broad, MulQuantArg para);
bool input1_broad, MulQuantArg *quant_arg);
#ifdef __cplusplus
}
#endif


+ 5
- 5
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/softmax_int8.c View File

@@ -17,7 +17,7 @@
#include "nnacl/int8/softmax_int8.h"

int SoftmaxInt8(const int8_t *input_ptr, int8_t *output_ptr, int count, int *exp_data, int *sum_data,
SoftmaxQuantArg quant_param, SoftmaxParameter *parameter) {
SoftmaxQuantArg *quant_param, SoftmaxParameter *parameter) {
int32_t axis = parameter->axis_;
int n_dim = parameter->n_dim_;
int *input_shape = parameter->input_shape_;
@@ -32,7 +32,7 @@ int SoftmaxInt8(const int8_t *input_ptr, int8_t *output_ptr, int count, int *exp
int outter_offset = o * axis_shape_size * inner_size;

for (int c = 0; c < inner_size; c++) {
int8_t max_row = quant_param.output_activation_min_;
int8_t max_row = quant_param->output_activation_min_;
for (int i = 0; i < axis_shape_size; ++i) {
int axis_offset = outter_offset + c + i * inner_size;
max_row = MSMAX(max_row, input_ptr[axis_offset]);
@@ -43,7 +43,7 @@ int SoftmaxInt8(const int8_t *input_ptr, int8_t *output_ptr, int count, int *exp
int axis_offset = outter_offset + c + i * inner_size;
const int32_t input_val = input_ptr[axis_offset] - max_row;
const int32_t input_scaled = SaturatingRoundingDoublingHighMul(
input_val * (1 << (unsigned int)quant_param.shift_left_), quant_param.output_multiplier_);
input_val * (1 << (unsigned int)quant_param->shift_left_), quant_param->output_multiplier_);
int exp_val = exp_on_negative_values(input_scaled, 5);
exp_data[axis_offset] = exp_val;
exp_sum = exp_sum + Rescale(exp_val, 0, 12);
@@ -58,9 +58,9 @@ int SoftmaxInt8(const int8_t *input_ptr, int8_t *output_ptr, int count, int *exp
int unsat_output = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(shifted_scale, exp_data[axis_offset + c]), num_bits_over_unit + 31 - 8);

int raw_output = unsat_output + quant_param.output_activation_min_;
int raw_output = unsat_output + quant_param->output_activation_min_;
output_ptr[axis_offset + c] =
(int8_t)MSMAX(quant_param.output_activation_min_, MSMIN(raw_output, quant_param.output_activation_max_));
(int8_t)MSMAX(quant_param->output_activation_min_, MSMIN(raw_output, quant_param->output_activation_max_));
}
}
}


+ 1
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/softmax_int8.h View File

@@ -27,7 +27,7 @@
extern "C" {
#endif
int SoftmaxInt8(const int8_t *input_ptr, int8_t *output_ptr, int count, int *exp_data, int *sum_data,
SoftmaxQuantArg quant_param, SoftmaxParameter *parameter);
SoftmaxQuantArg *quant_param, SoftmaxParameter *parameter);
#ifdef __cplusplus
}
#endif


+ 0
- 1
mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/mul_parameter.h View File

@@ -34,7 +34,6 @@ typedef struct MulParameter {
OpParameter op_parameter_;
// other parameter
int thread_count_;
MulQuantArg mul_quant_arg_;
} MulParameter;

#endif // MINDSPORE_NNACL_MUL_PARAMETER_H_

+ 1
- 1
mindspore/lite/src/runtime/allocator.h View File

@@ -69,7 +69,7 @@ class DefaultAllocator : public Allocator {
std::unordered_map<void *, MemBuf *> allocatedList_;
std::multimap<size_t, MemBuf *> freeList_;
// 6 is empirical value
int shiftFactor_ = 6;
unsigned shiftFactor_ = 6;
bool lockFlag_ = true;
};



+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/base/constant_of_shape.h View File

@@ -42,7 +42,7 @@ class ConstantOfShapeCPUKernel : public LiteKernel {
private:
ConstantOfShapeParameter *param_ = nullptr;
void *output_ptr_ = nullptr;
int thread_stride_;
int thread_stride_ = 0;
};
} // namespace mindspore::kernel



+ 13
- 6
mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc View File

@@ -33,16 +33,22 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
return conv_parameter;
}

void FreeCurrentConv(ConvParameter *conv_param, const std::vector<lite::Tensor *> *new_inputs,
const std::vector<lite::Tensor *> *new_outputs) {
void FreeCurrentConv(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs,
std::vector<lite::Tensor *> *new_outputs) {
if (conv_param != nullptr) {
free(conv_param);
}
for (auto &in_tensor : *new_inputs) {
delete in_tensor;
if (new_inputs != nullptr) {
for (auto &in_tensor : *new_inputs) {
delete in_tensor;
in_tensor = nullptr;
}
}
for (auto &out_tensor : *new_outputs) {
delete out_tensor;
if (new_outputs != nullptr) {
for (auto &out_tensor : *new_outputs) {
delete out_tensor;
out_tensor = nullptr;
}
}
}

@@ -112,6 +118,7 @@ void GroupConvCreator::FreeGroupConvs() {
delete out_tensor;
}
delete sub_conv;
sub_conv = nullptr;
}
group_convs_.clear();
}


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/base/tile_base.h View File

@@ -37,7 +37,7 @@ class TileCPUKernel : public LiteKernel {
int RunSimpleTile();
void ComputeStrides(const int *shape, int *strides, int ndim);
void FillOneDimTileParam();
bool one_dim_tile_;
bool one_dim_tile_ = false;
uint8_t *input_addr_ = nullptr;
uint8_t *output_addr_ = nullptr;
TileParameter *tile_parameter_ = nullptr;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp16/group_convolution_fp16.h View File

@@ -33,7 +33,7 @@ class GroupConvolutionFP16CPUKernel : public GroupConvolutionBaseCPUKernel {
: GroupConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, std::move(group_convs), group_num) {
} // opParameter(in channel, out channel) in this kernel has been split to groups, if
// you want to get real params, multiply in channel / out channel with group num
~GroupConvolutionFP16CPUKernel() = default;
int SeparateInput(int group_id) override;
int PostConcat(int group_id) override;
};


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/batch_to_space_fp32.h View File

@@ -36,8 +36,8 @@ class BatchToSpaceCPUKernel : public LiteKernel {
int Processinput();

private:
int32_t block_shape_[BATCH_TO_SPACE_BLOCK_SHAPE_SIZE];
int32_t crops_[COMM_SHAPE_SIZE];
int32_t block_shape_[BATCH_TO_SPACE_BLOCK_SHAPE_SIZE] = {0};
int32_t crops_[COMM_SHAPE_SIZE] = {0};
bool no_crop_ = false;
};
} // namespace mindspore::kernel


+ 17
- 7
mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc View File

@@ -25,34 +25,44 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_BroadcastTo;

namespace mindspore::kernel {
BroadcastToCPUKernel::~BroadcastToCPUKernel() {
if (shape_info_ != nullptr) {
free(shape_info_);
shape_info_ = nullptr;
}
}

int BroadcastToCPUKernel::ReSize() {
auto input_shape = in_tensors_.at(0)->shape();
for (size_t i = 0; i < input_shape.size(); ++i) {
shape_info_.input_shape_[i] = input_shape[i];
shape_info_->input_shape_[i] = input_shape[i];
}

shape_info_.input_shape_size_ = static_cast<int>(input_shape.size());
shape_info_->input_shape_size_ = static_cast<int>(input_shape.size());
auto output_shape = out_tensors_.at(0)->shape();
for (size_t i = 0; i < output_shape.size(); ++i) {
shape_info_.output_shape_[i] = output_shape[i];
shape_info_->output_shape_[i] = output_shape[i];
}
shape_info_.output_shape_size_ = static_cast<int>(output_shape.size());
shape_info_->output_shape_size_ = static_cast<int>(output_shape.size());
return RET_OK;
}

int BroadcastToCPUKernel::Init() {
shape_info_ = reinterpret_cast<BroadcastShapeInfo *>(malloc(sizeof(BroadcastShapeInfo)));
if (shape_info_ == nullptr) {
MS_LOG(ERROR) << "Malloc BroadcastShapeInfo failed!";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}

return ReSize();
}

int BroadcastToCPUKernel::Run() {
const auto input_data = reinterpret_cast<float *>(in_tensors_.at(0)->MutableData());
auto output_data = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());

return BroadcastTo(float, input_data, &shape_info_, output_data);
return BroadcastTo(float, input_data, shape_info_, output_data);
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BroadcastTo, LiteKernelCreator<BroadcastToCPUKernel>)


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.h View File

@@ -27,14 +27,14 @@ class BroadcastToCPUKernel : public LiteKernel {
BroadcastToCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {}
~BroadcastToCPUKernel() = default;
~BroadcastToCPUKernel();

int Init() override;
int ReSize() override;
int Run() override;

private:
BroadcastShapeInfo shape_info_;
BroadcastShapeInfo *shape_info_ = nullptr;
};
} // namespace mindspore::kernel



+ 11
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc View File

@@ -168,9 +168,13 @@ kernel::LiteKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx) {
if (opParameter == nullptr) {
MS_LOG(ERROR) << "Get null opParameter for CpuConvDwFp32KernelCreator.";
return nullptr;
}
auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel = nullptr;
if (opParameter != nullptr && opParameter->infer_flag_) {
if (opParameter->infer_flag_) {
#if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
if (CheckConvDw1DWinograd(conv_param, ctx->thread_num_)) {
kernel = new (std::nothrow) kernel::ConvolutionDepthwise3x3CPUKernel(opParameter, inputs, outputs, ctx);
@@ -209,9 +213,14 @@ kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor
group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateCPUKernel(
reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx));
}
return new (std::nothrow)
auto group_kernel = new (std::nothrow)
GroupConvolutionFp32CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()),
reinterpret_cast<ConvParameter *>(op_parameter)->group_);
if (group_kernel == nullptr) {
MS_LOG(ERROR) << "New GroupConvolutionFp32CPUKernel failed.";
return nullptr;
}
return group_kernel;
}

/* creator func */


+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc View File

@@ -38,11 +38,11 @@ namespace mindspore::kernel {

int ConvolutionCPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
int in_channel = filter_tensor->Channel();
int out_channel = filter_tensor->Batch();
size_t in_channel = filter_tensor->Channel();
size_t out_channel = filter_tensor->Batch();
conv_param_->input_channel_ = in_channel;
conv_param_->output_channel_ = out_channel;
int kernel_plane = filter_tensor->Height() * filter_tensor->Width();
size_t kernel_plane = filter_tensor->Height() * filter_tensor->Width();
int oc_block_num = UP_ROUND(out_channel, OC_BLOCK);
int pack_weight_size = oc_block_num * in_channel * kernel_plane;



+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc View File

@@ -25,8 +25,8 @@ using mindspore::lite::RET_MEMORY_FAILED;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {
int ConvolutionWinogradCPUKernel::WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt,
int oc_block) {
int ConvolutionWinogradCPUKernel::WinogradFilterTransform(const float *weight_data, float *matrix_g,
const float *matrix_gt, int oc_block) {
if (oc_block == 0) {
MS_LOG(ERROR) << "Divide by zero";
return RET_ERROR;


+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h View File

@@ -48,7 +48,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int InitWeightBias();
int InitTmpBuffer();
int ConfigInputOutput();
int WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
int WinogradFilterTransform(const float *weight_data, float *matrix_g, const float *matrix_gt, int oc_block);

private:
void FreeTmpBuffer() {
@@ -82,8 +82,8 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
float *col_buffer_ = nullptr;
float *trans_weight_ = nullptr;
TmpBufferAddress tmp_buffer_address_list_[4];
InputTransFunc in_func_;
OutputTransFunc out_func_;
InputTransFunc in_func_ = nullptr;
OutputTransFunc out_func_ = nullptr;
};

} // namespace mindspore::kernel


+ 1
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.h View File

@@ -32,6 +32,7 @@ class GroupConvolutionFp32CPUKernel : public GroupConvolutionBaseCPUKernel {
: GroupConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, std::move(group_convs), group_num) {
} // opParameter(in channel, out channel) in this kernel has been split to groups, if
// you want to get real params, multiply in channel / out channel with group num
~GroupConvolutionFp32CPUKernel() = default;
int SeparateInput(int group_id) override;
int PostConcat(int group_id) override;
};


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/fp32/transpose_fp32.h View File

@@ -51,7 +51,7 @@ class TransposeCPUKernel : public LiteKernel {
TransposeParameter *param_ = nullptr;
TransposeFunc NHNCTransposeFunc_ = nullptr;
int thread_count_ = 0;
int nhnc_param_[3];
int nhnc_param_[3] = {0};
int dims_ = 0;
};
} // namespace mindspore::kernel


+ 1
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/uniform_real_fp32.cc View File

@@ -35,6 +35,7 @@ class PhiloxRandom {
counter_[2] = static_cast<uint32_t>(seed_hi);
counter_[3] = static_cast<uint32_t>(seed_hi >> 32);
}
~PhiloxRandom() = default;

// Skip the specified number of samples of 128-bits in the current stream.
void Skip(uint64_t count) {


+ 36
- 24
mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc View File

@@ -27,40 +27,52 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_AddFusion;

namespace mindspore::kernel {
QuantizedAddCPUKernel::~QuantizedAddCPUKernel() {
if (para_ != nullptr) {
free(para_);
para_ = nullptr;
}
}

int QuantizedAddCPUKernel::Init() {
para_ = reinterpret_cast<AddQuantParameter *>(malloc(sizeof(AddQuantParameter)));
if (para_ == nullptr) {
MS_LOG(ERROR) << "Malloc AddQuantParameter for add int8 op failed!";
return RET_ERROR;
}
auto *input0 = in_tensors_.at(0);
auto *input1 = in_tensors_.at(1);
auto *output = out_tensors_.at(0);

para_.in0_args_.zp_ = input0->quant_params().front().zeroPoint * -1;
para_.in1_args_.zp_ = input1->quant_params().front().zeroPoint * -1;
para_.out_zp_ = output->quant_params().front().zeroPoint;
para_->in0_args_.zp_ = input0->quant_params().front().zeroPoint * -1;
para_->in1_args_.zp_ = input1->quant_params().front().zeroPoint * -1;
para_->out_zp_ = output->quant_params().front().zeroPoint;

const double in0_scale = input0->quant_params().front().scale;
const double in1_scale = input1->quant_params().front().scale;
const double out_scale = output->quant_params().front().scale;

para_.left_shift_ = 20;
para_->left_shift_ = 20;
const double twice_max_input_scale = 2 * std::max(in0_scale, in1_scale);
const double in0_multiplier = in0_scale / twice_max_input_scale;
const double in1_multiplier = in1_scale / twice_max_input_scale;
const double out_multiplier = twice_max_input_scale / ((1 << para_.left_shift_) * out_scale);
const double out_multiplier = twice_max_input_scale / ((1 << para_->left_shift_) * out_scale);

QuantizeMultiplierSmallerThanOne(in0_multiplier, &para_.in0_args_.multiplier_, &para_.in0_args_.left_shift_);
QuantizeMultiplierSmallerThanOne(in1_multiplier, &para_.in1_args_.multiplier_, &para_.in1_args_.left_shift_);
QuantizeMultiplierSmallerThanOne(out_multiplier, &para_.out_multiplier_, &para_.out_left_shift_);
QuantizeMultiplierSmallerThanOne(in0_multiplier, &(para_->in0_args_.multiplier_), &(para_->in0_args_.left_shift_));
QuantizeMultiplierSmallerThanOne(in1_multiplier, &(para_->in1_args_.multiplier_), &(para_->in1_args_.left_shift_));
QuantizeMultiplierSmallerThanOne(out_multiplier, &(para_->out_multiplier_), &(para_->out_left_shift_));

para_.in0_args_.right_shift_ = -para_.in0_args_.left_shift_ > 0 ? 0 : para_.in0_args_.left_shift_;
para_.in1_args_.right_shift_ = -para_.in1_args_.left_shift_ > 0 ? 0 : para_.in1_args_.left_shift_;
para_.out_right_shift_ = -para_.out_left_shift_ > 0 ? 0 : para_.out_left_shift_;
para_->in0_args_.right_shift_ = -para_->in0_args_.left_shift_ > 0 ? 0 : para_->in0_args_.left_shift_;
para_->in1_args_.right_shift_ = -para_->in1_args_.left_shift_ > 0 ? 0 : para_->in1_args_.left_shift_;
para_->out_right_shift_ = -para_->out_left_shift_ > 0 ? 0 : para_->out_left_shift_;

para_.in0_args_.left_shift_ = -para_.in0_args_.left_shift_ > 0 ? -para_.in0_args_.left_shift_ : 0;
para_.in1_args_.left_shift_ = -para_.in1_args_.left_shift_ > 0 ? -para_.in1_args_.left_shift_ : 0;
para_.out_left_shift_ = -para_.out_left_shift_ > 0 ? -para_.out_left_shift_ : 0;
para_->in0_args_.left_shift_ = -para_->in0_args_.left_shift_ > 0 ? -para_->in0_args_.left_shift_ : 0;
para_->in1_args_.left_shift_ = -para_->in1_args_.left_shift_ > 0 ? -para_->in1_args_.left_shift_ : 0;
para_->out_left_shift_ = -para_->out_left_shift_ > 0 ? -para_->out_left_shift_ : 0;

auto act = arith_para_->activation_type_;
CalculateActivationRangeQuantized(act == ActType_Relu, act == ActType_Relu6, para_.out_zp_,
static_cast<float>(out_scale), &para_.min_, &para_.max_);
CalculateActivationRangeQuantized(act == ActType_Relu, act == ActType_Relu6, para_->out_zp_,
static_cast<float>(out_scale), &(para_->min_), &(para_->max_));

if (!InferShapeDone()) {
return RET_OK;
@@ -154,9 +166,9 @@ void QuantizedAddCPUKernel::BroadcastRun(int task_id) {
cur_out = output_data_ + task_id * stride * in_size_ + i * in_size_;
}
#ifdef ENABLE_AVX
AddInt8_AVX2(cur_in0, cur_in1, cur_out, in_size_, &para_);
AddInt8_AVX2(cur_in0, cur_in1, cur_out, in_size_, para_);
#else
AddInt8(cur_in0, cur_in1, cur_out, in_size_, &para_);
AddInt8(cur_in0, cur_in1, cur_out, in_size_, para_);
#endif
}
return;
@@ -182,18 +194,18 @@ int QuantizedAddCPUKernel::DoExecute(int task_id) {
if (support_opt_add_) {
int8_t *ptr_in = arith_para_->in_elements_num0_ == 1 ? cur_in1 : cur_in0;
int8_t element_in = arith_para_->in_elements_num0_ == 1 ? input0_data_[0] : input1_data_[0];
AddQuantQrgs *ptr_args = arith_para_->in_elements_num0_ == 1 ? &para_.in1_args_ : &para_.in0_args_;
AddQuantQrgs *ele_args = arith_para_->in_elements_num0_ == 1 ? &para_.in0_args_ : &para_.in1_args_;
AddQuantQrgs *ptr_args = arith_para_->in_elements_num0_ == 1 ? &(para_->in1_args_) : &(para_->in0_args_);
AddQuantQrgs *ele_args = arith_para_->in_elements_num0_ == 1 ? &(para_->in0_args_) : &(para_->in1_args_);
#ifdef ENABLE_AVX
AddOptInt8_AVX2(ptr_in, element_in, cur_out, rest_count, &para_, ptr_args, ele_args);
AddOptInt8_AVX2(ptr_in, element_in, cur_out, rest_count, para_, ptr_args, ele_args);
#else
AddOptInt8(ptr_in, element_in, cur_out, rest_count, &para_, ptr_args, ele_args);
AddOptInt8(ptr_in, element_in, cur_out, rest_count, para_, ptr_args, ele_args);
#endif
} else {
#ifdef ENABLE_AVX
AddInt8_AVX2(cur_in0, cur_in1, cur_out, rest_count, &para_);
AddInt8_AVX2(cur_in0, cur_in1, cur_out, rest_count, para_);
#else
AddInt8(cur_in0, cur_in1, cur_out, rest_count, &para_);
AddInt8(cur_in0, cur_in1, cur_out, rest_count, para_);
#endif
}



+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h View File

@@ -32,7 +32,7 @@ class QuantizedAddCPUKernel : public LiteKernel {
: LiteKernel(parameter, inputs, outputs, ctx) {
arith_para_ = reinterpret_cast<ArithmeticParameter *>(parameter);
}
~QuantizedAddCPUKernel() override = default;
~QuantizedAddCPUKernel() override;

int Init() override;
int ReSize() override;
@@ -43,7 +43,7 @@ class QuantizedAddCPUKernel : public LiteKernel {
void BroadcastRun(int task_id);

private:
AddQuantParameter para_;
AddQuantParameter *para_ = nullptr;
ArithmeticParameter *arith_para_ = nullptr;
int in_size_ = 0;
int out_size_ = 0;


+ 30
- 9
mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.cc View File

@@ -27,18 +27,39 @@ using mindspore::schema::PrimitiveType_ArgMaxFusion;
using mindspore::schema::PrimitiveType_ArgMinFusion;

namespace mindspore::kernel {
ArgMinMaxInt8CPUKernel::~ArgMinMaxInt8CPUKernel() {
if (in_quant_arg_ != nullptr) {
free(in_quant_arg_);
in_quant_arg_ = nullptr;
}
if (out_quant_arg_ != nullptr) {
free(out_quant_arg_);
out_quant_arg_ = nullptr;
}
}

int ArgMinMaxInt8CPUKernel::Init() {
auto param = reinterpret_cast<ArgMinMaxParameter *>(op_parameter_);
param->data_type_ = kNumberTypeInt8;
in_quant_arg_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (in_quant_arg_ == nullptr) {
MS_LOG(ERROR) << "Malloc QuantArg for argmin or argmax int8 op failed!";
return RET_ERROR;
}
auto *input_tensor = in_tensors_.at(kInputIndex);
auto in_quant_args = input_tensor->quant_params();
in_quant_arg_.scale_ = in_quant_args.front().scale;
in_quant_arg_.zp_ = in_quant_args.front().zeroPoint;
in_quant_arg_->scale_ = in_quant_args.front().scale;
in_quant_arg_->zp_ = in_quant_args.front().zeroPoint;

auto *out_tensor = out_tensors_.at(kOutputIndex);
auto out_quant_args = out_tensor->quant_params();
out_quant_arg_.scale_ = out_quant_args.front().scale;
out_quant_arg_.zp_ = out_quant_args.front().zeroPoint;
out_quant_arg_->scale_ = out_quant_args.front().scale;
out_quant_arg_->zp_ = out_quant_args.front().zeroPoint;
out_quant_arg_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (out_quant_arg_ == nullptr) {
MS_LOG(ERROR) << "Malloc QuantArg for argmin or argmax int8 op failed!";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
@@ -72,22 +93,22 @@ int ArgMinMaxInt8CPUKernel::Run() {
auto in_shape = input->shape();
auto param = reinterpret_cast<ArgMinMaxParameter *>(op_parameter_);
if (param->topk_ == 1) {
Int8ArgMinMaxQuant(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
Int8ArgMinMaxQuant(input_data, output_data, in_shape.data(), param, in_quant_arg_, out_quant_arg_);
return RET_OK;
}

switch (param->axis_) {
case 0:
Int8ArgMinMaxDim0(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
Int8ArgMinMaxDim0(input_data, output_data, in_shape.data(), param, in_quant_arg_, out_quant_arg_);
break;
case 1:
Int8ArgMinMaxDim1(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
Int8ArgMinMaxDim1(input_data, output_data, in_shape.data(), param, in_quant_arg_, out_quant_arg_);
break;
case 2:
Int8ArgMinMaxDim2(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
Int8ArgMinMaxDim2(input_data, output_data, in_shape.data(), param, in_quant_arg_, out_quant_arg_);
break;
case 3:
Int8ArgMinMaxDim3(input_data, output_data, in_shape.data(), param, &in_quant_arg_, &out_quant_arg_);
Int8ArgMinMaxDim3(input_data, output_data, in_shape.data(), param, in_quant_arg_, out_quant_arg_);
break;
default:
MS_LOG(ERROR) << "axis is invalid";


+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/int8/argminmax_int8.h View File

@@ -30,15 +30,15 @@ class ArgMinMaxInt8CPUKernel : public LiteKernel {
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {}

~ArgMinMaxInt8CPUKernel() = default;
~ArgMinMaxInt8CPUKernel() override;

int Init() override;
int ReSize() override;
int Run() override;

private:
QuantArg in_quant_arg_;
QuantArg out_quant_arg_;
QuantArg *in_quant_arg_ = nullptr;
QuantArg *out_quant_arg_ = nullptr;
};
} // namespace mindspore::kernel



+ 28
- 8
mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.cc View File

@@ -24,18 +24,38 @@ using mindspore::schema::PrimitiveType_BatchToSpace;
using mindspore::schema::PrimitiveType_BatchToSpaceND;

namespace mindspore::kernel {
BatchToSpaceInt8CPUKernel::~BatchToSpaceInt8CPUKernel() {
if (in_quant_arg_ != nullptr) {
free(in_quant_arg_);
in_quant_arg_ = nullptr;
}
if (out_quant_arg_ != nullptr) {
free(out_quant_arg_);
out_quant_arg_ = nullptr;
}
}

int BatchToSpaceInt8CPUKernel::Init() {
MS_ASSERT(in_tensors_.at(0)->format() == schema::Format::Format_NHWC);

in_quant_arg_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (in_quant_arg_ == nullptr) {
MS_LOG(ERROR) << "Malloc QuantArg for BatchToSpace int8 op failed!";
return RET_ERROR;
}
auto *input_tensor = in_tensors_.at(kInputIndex);
auto in_quant_args = input_tensor->quant_params();
in_quant_arg_.scale_ = in_quant_args.front().scale;
in_quant_arg_.zp_ = in_quant_args.front().zeroPoint;
in_quant_arg_->scale_ = in_quant_args.front().scale;
in_quant_arg_->zp_ = in_quant_args.front().zeroPoint;

out_quant_arg_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (out_quant_arg_ == nullptr) {
MS_LOG(ERROR) << "Malloc QuantArg for BatchToSpace int8 op failed!";
return RET_ERROR;
}
auto *out_tensor = out_tensors_.at(kOutputIndex);
auto out_quant_args = out_tensor->quant_params();
out_quant_arg_.scale_ = out_quant_args.front().scale;
out_quant_arg_.zp_ = out_quant_args.front().zeroPoint;
out_quant_arg_->scale_ = out_quant_args.front().scale;
out_quant_arg_->zp_ = out_quant_args.front().zeroPoint;
if (!InferShapeDone()) {
return RET_OK;
}
@@ -56,7 +76,7 @@ int BatchToSpaceInt8CPUKernel::Run() {
auto out_shape = output->shape();
BatchToSpaceParameter *param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);

if (in_quant_arg_.scale_ == out_quant_arg_.scale_ && in_quant_arg_.zp_ == out_quant_arg_.zp_) {
if (in_quant_arg_->scale_ == out_quant_arg_->scale_ && in_quant_arg_->zp_ == out_quant_arg_->zp_) {
if (param->no_crop_) {
BatchToSpaceNoCropForNHWC(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_,
sizeof(int8_t));
@@ -67,10 +87,10 @@ int BatchToSpaceInt8CPUKernel::Run() {
} else {
if (param->no_crop_) {
BatchToSpaceNoCropForNHWCInt8(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_,
&in_quant_arg_, &out_quant_arg_);
in_quant_arg_, out_quant_arg_);
} else {
BatchToSpaceForNHWCInt8(input_data, output_data, in_shape.data(), out_shape[0], param->block_shape_,
param->crops_, &in_quant_arg_, &out_quant_arg_);
param->crops_, in_quant_arg_, out_quant_arg_);
}
}



+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/int8/batch_to_space_int8.h View File

@@ -30,15 +30,15 @@ class BatchToSpaceInt8CPUKernel : public LiteKernel {
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {}

~BatchToSpaceInt8CPUKernel() = default;
~BatchToSpaceInt8CPUKernel() override;

int Init() override;
int ReSize() override;
int Run() override;

private:
QuantArg in_quant_arg_;
QuantArg out_quant_arg_;
QuantArg *in_quant_arg_ = nullptr;
QuantArg *out_quant_arg_ = nullptr;
};
} // namespace mindspore::kernel



+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.h View File

@@ -62,7 +62,7 @@ class DeConvInt8CPUKernel : public ConvolutionBaseCPUKernel {
int8_t *output_ptr_ = nullptr;
size_t thread_count_ = 1;
size_t thread_stride_ = 0;
MATMUL_OPT_R4_FUNC matmul_func_;
MATMUL_OPT_R4_FUNC matmul_func_ = nullptr;
MatMulParameter *matmul_param_ = nullptr;
bool support_optimize_ = true;
};


+ 27
- 6
mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.cc View File

@@ -25,18 +25,39 @@ using mindspore::lite::RET_PARAM_INVALID;
using mindspore::schema::PrimitiveType_DepthToSpace;

namespace mindspore::kernel {
DepthToSpaceInt8CPUKernel::~DepthToSpaceInt8CPUKernel() {
if (in_quant_arg_ != nullptr) {
free(in_quant_arg_);
in_quant_arg_ = nullptr;
}
if (out_quant_arg_ != nullptr) {
free(out_quant_arg_);
out_quant_arg_ = nullptr;
}
}

int DepthToSpaceInt8CPUKernel::Init() {
param_->data_type_size_ = sizeof(int8_t);

in_quant_arg_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (in_quant_arg_ == nullptr) {
MS_LOG(ERROR) << "Malloc QuantArg for DepthToSpace int8 op failed!";
return RET_ERROR;
}
auto *input_tensor = in_tensors_.at(kInputIndex);
auto in_quant_args = input_tensor->quant_params();
in_quant_arg_.scale_ = in_quant_args.front().scale;
in_quant_arg_.zp_ = in_quant_args.front().zeroPoint;
in_quant_arg_->scale_ = in_quant_args.front().scale;
in_quant_arg_->zp_ = in_quant_args.front().zeroPoint;

out_quant_arg_ = reinterpret_cast<QuantArg *>(malloc(sizeof(QuantArg)));
if (out_quant_arg_ == nullptr) {
MS_LOG(ERROR) << "Malloc QuantArg for DepthToSpace int8 op failed!";
return RET_ERROR;
}
auto *out_tensor = out_tensors_.at(kOutputIndex);
auto out_quant_args = out_tensor->quant_params();
out_quant_arg_.scale_ = out_quant_args.front().scale;
out_quant_arg_.zp_ = out_quant_args.front().zeroPoint;
out_quant_arg_->scale_ = out_quant_args.front().scale;
out_quant_arg_->zp_ = out_quant_args.front().zeroPoint;
if (!InferShapeDone()) {
return RET_OK;
}
@@ -51,10 +72,10 @@ int DepthToSpaceInt8CPUKernel::Run() {
const int8_t *input_data = reinterpret_cast<const int8_t *>(input->data_c());
int8_t *output_data = reinterpret_cast<int8_t *>(output->data_c());
auto in_shape = input->shape();
if (in_quant_arg_.scale_ == out_quant_arg_.scale_ && in_quant_arg_.zp_ == out_quant_arg_.zp_) {
if (in_quant_arg_->scale_ == out_quant_arg_->scale_ && in_quant_arg_->zp_ == out_quant_arg_->zp_) {
DepthToSpaceForNHWC(input_data, output_data, in_shape.data(), param_);
} else {
DepthToSpaceForNHWCInt8(input_data, output_data, in_shape.data(), param_, &in_quant_arg_, &out_quant_arg_);
DepthToSpaceForNHWCInt8(input_data, output_data, in_shape.data(), param_, in_quant_arg_, out_quant_arg_);
}
return RET_OK;
}


+ 3
- 3
mindspore/lite/src/runtime/kernel/arm/int8/depth_to_space_int8.h View File

@@ -29,15 +29,15 @@ class DepthToSpaceInt8CPUKernel : public DepthToSpaceBaseCPUKernel {
DepthToSpaceInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: DepthToSpaceBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~DepthToSpaceInt8CPUKernel() = default;
~DepthToSpaceInt8CPUKernel() override;

int Init() override;
int ReSize() override;
int Run() override;

private:
QuantArg in_quant_arg_;
QuantArg out_quant_arg_;
QuantArg *in_quant_arg_ = nullptr;
QuantArg *out_quant_arg_ = nullptr;
};
} // namespace mindspore::kernel



+ 24
- 13
mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc View File

@@ -28,7 +28,12 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_DivFusion;

namespace mindspore::kernel {

DivInt8CPUKernel::~DivInt8CPUKernel() {
if (quant_args_ != nullptr) {
free(quant_args_);
quant_args_ = nullptr;
}
}
int DivInt8CPUKernel::Init() {
lite::Tensor *input0 = in_tensors_.at(0);
lite::Tensor *input1 = in_tensors_.at(1);
@@ -39,19 +44,25 @@ int DivInt8CPUKernel::Init() {

broadcast_ = input0->ElementsNum() != input1->ElementsNum();

param_.in0_args_.scale_ = input0->quant_params().front().scale;
param_.in0_args_.zp_ = -input0->quant_params().front().zeroPoint;
param_.in1_args_.scale_ = input1->quant_params().front().scale;
param_.in1_args_.zp_ = -input1->quant_params().front().zeroPoint;
param_.out_args_.scale_ = output->quant_params().front().scale;
param_.out_args_.zp_ = output->quant_params().front().zeroPoint;
quant_args_ = reinterpret_cast<DivQuantArg *>(malloc(sizeof(DivQuantArg)));
if (quant_args_ == nullptr) {
MS_LOG(ERROR) << "Malloc DivQuantArg for Div int8 op failed!";
return RET_ERROR;
}
quant_args_->in0_args_.scale_ = input0->quant_params().front().scale;
quant_args_->in0_args_.zp_ = -input0->quant_params().front().zeroPoint;
quant_args_->in1_args_.scale_ = input1->quant_params().front().scale;
quant_args_->in1_args_.zp_ = -input1->quant_params().front().zeroPoint;
quant_args_->out_args_.scale_ = output->quant_params().front().scale;
quant_args_->out_args_.zp_ = output->quant_params().front().zeroPoint;

const double real_multiplier = param_.in0_args_.scale_ / (param_.in1_args_.scale_ * param_.out_args_.scale_);
const double real_multiplier =
quant_args_->in0_args_.scale_ / (quant_args_->in1_args_.scale_ * quant_args_->out_args_.scale_);

QuantizeMultiplier(real_multiplier, &param_.output_multiplier_, &param_.output_shift_);
QuantizeMultiplier(real_multiplier, &quant_args_->output_multiplier_, &quant_args_->output_shift_);

param_.output_activation_min_ = std::numeric_limits<int8_t>::min();
param_.output_activation_max_ = std::numeric_limits<int8_t>::max();
quant_args_->output_activation_min_ = std::numeric_limits<int8_t>::min();
quant_args_->output_activation_max_ = std::numeric_limits<int8_t>::max();

if (!InferShapeDone()) {
return RET_OK;
@@ -74,10 +85,10 @@ int DivInt8CPUKernel::DoExecute(int task_id) {
auto ret = RET_OK;
if (broadcast_) {
ret = DivInt8(tile0_data_ + task_id * count, tile1_data_ + task_id * count, output_data_ + task_id * count, count,
&param_);
quant_args_);
} else {
ret = DivInt8(input0_data_ + task_id * count, input1_data_ + task_id * count, output_data_ + task_id * count, count,
&param_);
quant_args_);
}

if (ret != RET_OK) {


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/int8/div_int8.h View File

@@ -27,7 +27,7 @@ class DivInt8CPUKernel : public LiteKernel {
explicit DivInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {}
~DivInt8CPUKernel() override {}
~DivInt8CPUKernel() override;

int Init() override;
int ReSize() override;
@@ -35,7 +35,7 @@ class DivInt8CPUKernel : public LiteKernel {
int DoExecute(int task_id);

private:
DivQuantArg param_;
DivQuantArg *quant_args_ = nullptr;
int8_t *tile0_data_ = nullptr;
int8_t *tile1_data_ = nullptr;
bool broadcast_ = false;


+ 1
- 0
mindspore/lite/src/runtime/kernel/arm/int8/group_convolution_int8.h View File

@@ -32,6 +32,7 @@ class GroupConvolutionInt8CPUKernel : public GroupConvolutionBaseCPUKernel {
: GroupConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, std::move(group_convs), group_num) {
} // opParameter(in channel, out channel) in this kernel has been split to groups, if
// you want to get real params, multiply in channel / out channel with group num
~GroupConvolutionInt8CPUKernel() = default;
int SeparateInput(int group_id) override;
int PostConcat(int group_id) override;
};


+ 17
- 5
mindspore/lite/src/runtime/kernel/arm/int8/l2_norm_int8.cc View File

@@ -24,16 +24,28 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_L2NormalizeFusion;

namespace mindspore::kernel {
L2NormInt8CPUKernel::~L2NormInt8CPUKernel() {
if (quant_param_ != nullptr) {
free(quant_param_);
quant_param_ = nullptr;
}
}

int L2NormInt8CPUKernel::Init() {
lite::Tensor *input = in_tensors_.at(0);
lite::Tensor *output = out_tensors_.at(0);
MS_ASSERT(input);
MS_ASSERT(output);

quant_param_.in_.scale_ = input->quant_params().front().scale;
quant_param_.in_.zp_ = input->quant_params().front().zeroPoint;
quant_param_.out_.scale_ = output->quant_params().front().scale;
quant_param_.out_.zp_ = output->quant_params().front().zeroPoint;
quant_param_ = reinterpret_cast<L2NormQuantArg *>(malloc(sizeof(L2NormQuantArg)));
if (quant_param_ == nullptr) {
MS_LOG(ERROR) << "Malloc L2NormQuantArg for L2Norm int8 op failed!";
return RET_ERROR;
}
quant_param_->in_.scale_ = input->quant_params().front().scale;
quant_param_->in_.zp_ = input->quant_params().front().zeroPoint;
quant_param_->out_.scale_ = output->quant_params().front().scale;
quant_param_->out_.zp_ = output->quant_params().front().zeroPoint;
return ReSize();
}

@@ -68,7 +80,7 @@ int L2NormInt8CPUKernel::DoExecute(int task_id) {
int8_t *output_data = static_cast<int8_t *>(out_tensors().front()->MutableData());
MS_ASSERT(output_data);
MS_ASSERT(l2_norm_param_);
return L2NormalizationInt8(input_data, output_data, l2_norm_param_, &quant_param_, begin, end);
return L2NormalizationInt8(input_data, output_data, l2_norm_param_, quant_param_, begin, end);
}

REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_L2NormalizeFusion, LiteKernelCreator<L2NormInt8CPUKernel>)


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/int8/l2_norm_int8.h View File

@@ -26,14 +26,14 @@ class L2NormInt8CPUKernel : public L2NormCPUKernel {
explicit L2NormInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: L2NormCPUKernel(parameter, inputs, outputs, ctx) {}
~L2NormInt8CPUKernel() {}
~L2NormInt8CPUKernel() override;

int Init() override;
int Run() override;
int DoExecute(int tId);

private:
L2NormQuantArg quant_param_;
L2NormQuantArg *quant_param_ = nullptr;
};
} // namespace mindspore::kernel



+ 15
- 6
mindspore/lite/src/runtime/kernel/arm/int8/layer_norm_int8.cc View File

@@ -24,6 +24,10 @@ using mindspore::schema::PrimitiveType_LayerNormFusion;

namespace mindspore::kernel {
LayerNormInt8CPUKernel::~LayerNormInt8CPUKernel() {
if (quant_param_ != nullptr) {
free(quant_param_);
quant_param_ = nullptr;
}
if (gamma_ptr_ != nullptr) {
free(gamma_ptr_);
gamma_ptr_ = nullptr;
@@ -38,10 +42,15 @@ int LayerNormInt8CPUKernel::SetQuantArgs() {
lite::Tensor *input = in_tensors_.at(0);
lite::Tensor *output = out_tensors_.at(0);

quant_param_.in_zp_ = input->quant_params().front().zeroPoint;
quant_param_.in_scale_ = input->quant_params().front().scale;
quant_param_.out_zp_ = output->quant_params().front().zeroPoint;
quant_param_.out_scale_ = output->quant_params().front().scale;
quant_param_ = reinterpret_cast<LayerNormQuantArg *>(malloc(sizeof(LayerNormQuantArg)));
if (quant_param_ == nullptr) {
MS_LOG(ERROR) << "Malloc LayerNormQuantArg for LayerNorm int8 op failed!";
return RET_ERROR;
}
quant_param_->in_zp_ = input->quant_params().front().zeroPoint;
quant_param_->in_scale_ = input->quant_params().front().scale;
quant_param_->out_zp_ = output->quant_params().front().zeroPoint;
quant_param_->out_scale_ = output->quant_params().front().scale;

lite::Tensor *gamma_tensor = in_tensors_.at(1);
lite::Tensor *beta_tensor = in_tensors_.at(2);
@@ -67,7 +76,7 @@ int LayerNormInt8CPUKernel::SetQuantArgs() {
}
int32_t *src_beta = reinterpret_cast<int32_t *>(beta_tensor->data_c());
for (int i = 0; i < beta_tensor->ElementsNum(); i++) {
beta_ptr_[i] = src_beta[i] * quant_param_.in_scale_ * gamma_scale;
beta_ptr_[i] = src_beta[i] * quant_param_->in_scale_ * gamma_scale;
}
return RET_OK;
}
@@ -109,7 +118,7 @@ int LayerNormInt8CPUKernel::ReSize() {
}

int LayerNormInt8CPUKernel::DoExecute(int task_id) {
LayerNormInt8(src_ptr_, gamma_ptr_, beta_ptr_, dst_ptr_, param_, &quant_param_, task_id);
LayerNormInt8(src_ptr_, gamma_ptr_, beta_ptr_, dst_ptr_, param_, quant_param_, task_id);
return RET_OK;
}



+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/int8/layer_norm_int8.h View File

@@ -44,7 +44,7 @@ class LayerNormInt8CPUKernel : public LiteKernel {

private:
LayerNormParameter *param_ = nullptr;
LayerNormQuantArg quant_param_;
LayerNormQuantArg *quant_param_ = nullptr;
int8_t *src_ptr_ = nullptr;
int8_t *dst_ptr_ = nullptr;
float *gamma_ptr_ = nullptr;


+ 58
- 49
mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc View File

@@ -41,14 +41,15 @@ int MatmulBaseInt8CPUKernel::RunImpl(int task_id) {
return RET_OK;
}

int32_t *cur_left = filter_per_channel_ ? quant_.left_shift_ + cur_stride : quant_.left_shift_;
int32_t *cur_right = filter_per_channel_ ? quant_.right_shift_ + cur_stride : quant_.right_shift_;
int32_t *cur_mul = filter_per_channel_ ? quant_.quant_multiplier_ + cur_stride : quant_.quant_multiplier_;
int32_t *cur_zp = filter_per_channel_ ? quant_.filter_zp_ + cur_stride : quant_.filter_zp_;
int32_t *cur_left = filter_per_channel_ ? quant_param_->left_shift_ + cur_stride : quant_param_->left_shift_;
int32_t *cur_right = filter_per_channel_ ? quant_param_->right_shift_ + cur_stride : quant_param_->right_shift_;
int32_t *cur_mul =
filter_per_channel_ ? quant_param_->quant_multiplier_ + cur_stride : quant_param_->quant_multiplier_;
int32_t *cur_zp = filter_per_channel_ ? quant_param_->filter_zp_ + cur_stride : quant_param_->filter_zp_;

MatmulInt8Opt(pack_a_ptr_, batch_b_ptr_ + cur_stride * param_->deep_16_, batch_c_ptr_ + cur_stride, param_->row_,
cur_oc, param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_.out_act_min_,
quant_.out_act_max_, quant_.output_.zp_, cur_mul, cur_left, cur_right, param_->col_,
cur_oc, param_->deep_16_, input_sums_, weight_bias_sums_ + cur_stride, quant_param_->out_act_min_,
quant_param_->out_act_max_, quant_param_->output_.zp_, cur_mul, cur_left, cur_right, param_->col_,
filter_per_channel_, cur_zp);

return RET_OK;
@@ -63,29 +64,32 @@ MatmulBaseInt8CPUKernel::~MatmulBaseInt8CPUKernel() {
free(bias_ptr_);
bias_ptr_ = nullptr;
}
return;
if (quant_param_ != nullptr) {
free(quant_param_);
quant_param_ = nullptr;
}
}

void MatmulBaseInt8CPUKernel::FreeQuantParam() {
if (quant_.filter_scale_ != nullptr) {
free(quant_.filter_scale_);
quant_.filter_scale_ = nullptr;
if (quant_param_->filter_scale_ != nullptr) {
free(quant_param_->filter_scale_);
quant_param_->filter_scale_ = nullptr;
}
if (quant_.filter_zp_ != nullptr) {
free(quant_.filter_zp_);
quant_.filter_zp_ = nullptr;
if (quant_param_->filter_zp_ != nullptr) {
free(quant_param_->filter_zp_);
quant_param_->filter_zp_ = nullptr;
}
if (quant_.left_shift_ != nullptr) {
free(quant_.left_shift_);
quant_.left_shift_ = nullptr;
if (quant_param_->left_shift_ != nullptr) {
free(quant_param_->left_shift_);
quant_param_->left_shift_ = nullptr;
}
if (quant_.right_shift_ != nullptr) {
free(quant_.right_shift_);
quant_.right_shift_ = nullptr;
if (quant_param_->right_shift_ != nullptr) {
free(quant_param_->right_shift_);
quant_param_->right_shift_ = nullptr;
}
if (quant_.quant_multiplier_ != nullptr) {
free(quant_.quant_multiplier_);
quant_.quant_multiplier_ = nullptr;
if (quant_param_->quant_multiplier_ != nullptr) {
free(quant_param_->quant_multiplier_);
quant_param_->quant_multiplier_ = nullptr;
}
return;
}
@@ -99,24 +103,29 @@ int MatmulBaseInt8CPUKernel::MallocQuantParam() {

int init_size = filter_per_channel_ ? col : 1;

quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
if (quant_.filter_scale_ == nullptr) {
quant_param_ = reinterpret_cast<MatmulQuantParameter *>(malloc(sizeof(MatmulQuantParameter)));
if (quant_param_ == nullptr) {
MS_LOG(ERROR) << "Malloc MatmulQuantParameter for Matmul int8 op failed!";
return RET_ERROR;
}
quant_param_->filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
if (quant_param_->filter_scale_ == nullptr) {
return RET_ERROR;
}
quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.filter_zp_ == nullptr) {
quant_param_->filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_param_->filter_zp_ == nullptr) {
return RET_ERROR;
}
quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.left_shift_ == nullptr) {
quant_param_->left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_param_->left_shift_ == nullptr) {
return RET_ERROR;
}
quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.right_shift_ == nullptr) {
quant_param_->right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_param_->right_shift_ == nullptr) {
return RET_ERROR;
}
quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_.quant_multiplier_ == nullptr) {
quant_param_->quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
if (quant_param_->quant_multiplier_ == nullptr) {
return RET_ERROR;
}
return RET_OK;
@@ -124,32 +133,32 @@ int MatmulBaseInt8CPUKernel::MallocQuantParam() {

void MatmulBaseInt8CPUKernel::InitQuantParam() {
auto in_quant_params = in_tensors_.at(0)->quant_params();
quant_.input_.zp_ = in_quant_params.front().zeroPoint;
quant_.input_.scale_ = in_quant_params.front().scale;
quant_param_->input_.zp_ = in_quant_params.front().zeroPoint;
quant_param_->input_.scale_ = in_quant_params.front().scale;

auto out_quant_params = out_tensors_.at(0)->quant_params();
quant_.output_.zp_ = out_quant_params.front().zeroPoint;
quant_.output_.scale_ = out_quant_params.front().scale;
quant_param_->output_.zp_ = out_quant_params.front().zeroPoint;
quant_param_->output_.scale_ = out_quant_params.front().scale;

auto weight_tensor = in_tensors_.at(1);
int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1;
auto weight_quant_params = weight_tensor->quant_params();

for (int i = 0; i < weight_quant_num; i++) {
quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint;
quant_.filter_scale_[i] = weight_quant_params[i].scale;
quant_param_->filter_zp_[i] = weight_quant_params[i].zeroPoint;
quant_param_->filter_scale_[i] = weight_quant_params[i].scale;
}

for (int i = 0; i < weight_quant_num; ++i) {
const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]);
double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_);
QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i],
&quant_.right_shift_[i]);
const double in_scale = static_cast<double>(quant_param_->input_.scale_ * quant_param_->filter_scale_[i]);
double real_multiplier = in_scale / static_cast<double>(quant_param_->output_.scale_);
QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_param_->quant_multiplier_[i],
&quant_param_->left_shift_[i], &quant_param_->right_shift_[i]);
}

CalculateActivationRangeQuantized(param_->act_type_ == ActType_Relu, param_->act_type_ == ActType_Relu6,
quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_,
&quant_.out_act_max_);
quant_param_->output_.zp_, quant_param_->output_.scale_,
&quant_param_->out_act_min_, &quant_param_->out_act_max_);
}

void MatmulBaseInt8CPUKernel::InitParameter() {
@@ -207,16 +216,16 @@ void MatmulBaseInt8CPUKernel::TransferB() {
#else
RowMajor2Row16x4MajorInt8(current_weight, current_b_pack, param_->col_, param_->deep_);
#endif
CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
current_sums, ColMajor, filter_per_channel_);
CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_param_->input_.zp_,
quant_param_->filter_zp_, bias_ptr_, current_sums, ColMajor, filter_per_channel_);
} else {
#ifdef ENABLE_ARM32
RowMajor2Col16x2MajorInt8(current_weight, current_b_pack, param_->deep_, param_->col_);
#else
RowMajor2Col16x4MajorInt8(current_weight, param_->deep_, param_->col_, current_b_pack);
#endif
CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
current_sums, RowMajor, false);
CalcWeightBiasSums(current_weight, param_->deep_, param_->col_, quant_param_->input_.zp_,
quant_param_->filter_zp_, bias_ptr_, current_sums, RowMajor, false);
}
}
return;
@@ -312,7 +321,7 @@ int MatmulBaseInt8CPUKernel::Run() {

int8_t *a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
int8_t *c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0];
int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_param_->filter_zp_[0];
for (int i = 0; i < param_->batch; i++) {
auto current_src_a = a_ptr + i * param_->row_ * param_->deep_;
if (param_->a_transpose_) {


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h View File

@@ -63,7 +63,7 @@ class MatmulBaseInt8CPUKernel : public LiteKernel {

protected:
MatMulParameter *param_ = nullptr;
MatmulQuantParameter quant_;
MatmulQuantParameter *quant_param_ = nullptr;
int thread_count_ = 1;
int thread_stride_ = 0;
int8_t *pack_a_ptr_ = nullptr;


+ 28
- 18
mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc View File

@@ -25,6 +25,13 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_MulFusion;

namespace mindspore::kernel {
MulInt8CPUKernel::~MulInt8CPUKernel() {
if (quant_args_ != nullptr) {
free(quant_args_);
quant_args_ = nullptr;
}
}

int MulInt8CPUKernel::Init() {
lite::Tensor *input0 = in_tensors_.at(0);
lite::Tensor *input1 = in_tensors_.at(1);
@@ -33,24 +40,28 @@ int MulInt8CPUKernel::Init() {
MS_ASSERT(input1);
MS_ASSERT(output);

para_.mul_quant_arg_.in_quant_args_[0].scale_ = input0->quant_params().front().scale;
para_.mul_quant_arg_.in_quant_args_[0].zp_ = input0->quant_params().front().zeroPoint * -1;
para_.mul_quant_arg_.in_quant_args_[1].scale_ = input1->quant_params().front().scale;
para_.mul_quant_arg_.in_quant_args_[1].zp_ = input1->quant_params().front().zeroPoint * -1;
para_.mul_quant_arg_.out_quant_arg_.scale_ = output->quant_params().front().scale;
para_.mul_quant_arg_.out_quant_arg_.zp_ = output->quant_params().front().zeroPoint;
para_.mul_quant_arg_.output_activation_max_ = std::numeric_limits<int8_t>::max();
para_.mul_quant_arg_.output_activation_min_ = std::numeric_limits<int8_t>::min();

const double real_multiplier =
(para_.mul_quant_arg_.in_quant_args_[0].scale_ * para_.mul_quant_arg_.in_quant_args_[1].scale_) /
para_.mul_quant_arg_.out_quant_arg_.scale_;
quant_args_ = reinterpret_cast<MulQuantArg *>(malloc(sizeof(MulQuantArg)));
if (quant_args_ == nullptr) {
MS_LOG(ERROR) << "Malloc MulQuantArg for Mul int8 op failed!";
return RET_ERROR;
}
quant_args_->in_quant_args_[0].scale_ = input0->quant_params().front().scale;
quant_args_->in_quant_args_[0].zp_ = input0->quant_params().front().zeroPoint * -1;
quant_args_->in_quant_args_[1].scale_ = input1->quant_params().front().scale;
quant_args_->in_quant_args_[1].zp_ = input1->quant_params().front().zeroPoint * -1;
quant_args_->out_quant_arg_.scale_ = output->quant_params().front().scale;
quant_args_->out_quant_arg_.zp_ = output->quant_params().front().zeroPoint;
quant_args_->output_activation_max_ = std::numeric_limits<int8_t>::max();
quant_args_->output_activation_min_ = std::numeric_limits<int8_t>::min();

const double real_multiplier = (quant_args_->in_quant_args_[0].scale_ * quant_args_->in_quant_args_[1].scale_) /
quant_args_->out_quant_arg_.scale_;

int right_shift = 0;
QuantizeMultiplierSmallerThanOne(real_multiplier, &para_.mul_quant_arg_.output_multiplier_, &right_shift);
QuantizeMultiplierSmallerThanOne(real_multiplier, &quant_args_->output_multiplier_, &right_shift);

para_.mul_quant_arg_.shift_left_ = right_shift < 0 ? -right_shift : 0;
para_.mul_quant_arg_.shift_right_ = right_shift > 0 ? right_shift : 0;
quant_args_->shift_left_ = right_shift < 0 ? -right_shift : 0;
quant_args_->shift_right_ = right_shift > 0 ? right_shift : 0;

if (!InferShapeDone()) {
return RET_OK;
@@ -202,8 +213,7 @@ int MulInt8CPUKernel::FastDoExecute(int task_id) {
cur_input0_data = input1_data_;
cur_input1_data = input0_data_ + task_id * count_unit_ * depth;
}
FastMul(cur_input0_data, cur_input1_data, cur_output_data, depth, real_dst_count, input1_hw_broadcast_,
para_.mul_quant_arg_);
FastMul(cur_input0_data, cur_input1_data, cur_output_data, depth, real_dst_count, input1_hw_broadcast_, quant_args_);
return RET_OK;
}

@@ -216,7 +226,7 @@ int MulInt8CPUKernel::DoExecute(int task_id) {
int8_t *cur_input1_data = input1_data_ + task_id * count_unit_;
int8_t *cur_output_data = output_data_ + task_id * count_unit_;

Mul(cur_input0_data, cur_input1_data, cur_output_data, real_dst_count, para_.mul_quant_arg_);
Mul(cur_input0_data, cur_input1_data, cur_output_data, real_dst_count, quant_args_);
return lite::RET_OK;
}



+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h View File

@@ -33,7 +33,7 @@ class MulInt8CPUKernel : public LiteKernel {
: LiteKernel(parameter, inputs, outputs, ctx), ctx_(ctx), thread_count_(ctx_->thread_num_) {
tile_para = reinterpret_cast<ArithmeticParameter *>(parameter);
}
~MulInt8CPUKernel() override{};
~MulInt8CPUKernel() override;

int Init() override;
int ReSize() override;
@@ -46,7 +46,7 @@ class MulInt8CPUKernel : public LiteKernel {
private:
const lite::InnerContext *ctx_ = nullptr;
ArithmeticParameter *tile_para = nullptr;
MulParameter para_;
MulQuantArg *quant_args_ = nullptr;
bool fast_hw_broadcast_ = false;
bool input1_hw_broadcast_ = false;
int thread_count_ = 1;


+ 22
- 11
mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc View File

@@ -30,35 +30,46 @@ using mindspore::lite::RET_NULL_PTR;
using mindspore::schema::PrimitiveType_Softmax;

namespace mindspore::kernel {
// Releases the SoftmaxQuantArg that Init() heap-allocates; safe to run even if
// Init() was never called or its malloc failed (quant_param_ stays nullptr).
SoftmaxInt8CPUKernel::~SoftmaxInt8CPUKernel() {
  if (quant_param_ == nullptr) {
    return;
  }
  free(quant_param_);
  quant_param_ = nullptr;
}

int SoftmaxInt8CPUKernel::Init() {
auto ret = SoftmaxBaseCPUKernel::Init();
if (ret != RET_OK) {
return ret;
}
quant_param_ = reinterpret_cast<SoftmaxQuantArg *>(malloc(sizeof(SoftmaxQuantArg)));
if (quant_param_ == nullptr) {
MS_LOG(ERROR) << "Malloc SoftmaxQuantArg for Softmax int8 op failed!";
return RET_ERROR;
}

auto *input_tensor = in_tensors_.at(kInputIndex);
MS_ASSERT(input_tensor);

auto in_quant_args = input_tensor->quant_params();
quant_params_.in_quant_args_.scale_ = in_quant_args.front().scale;
quant_params_.in_quant_args_.zp_ = -in_quant_args.front().zeroPoint;
quant_param_->in_quant_args_.scale_ = in_quant_args.front().scale;
quant_param_->in_quant_args_.zp_ = -in_quant_args.front().zeroPoint;

auto *out_tensor = out_tensors_.at(kOutputIndex);
MS_ASSERT(out_tensor);

auto out_quant_args = out_tensor->quant_params();
quant_params_.out_quant_arg_.scale_ = out_quant_args.front().scale;
quant_params_.out_quant_arg_.zp_ = -out_quant_args.front().zeroPoint;
quant_params_.output_activation_min_ = std::numeric_limits<int8_t>::min();
quant_params_.output_activation_max_ = std::numeric_limits<int8_t>::max();
quant_param_->out_quant_arg_.scale_ = out_quant_args.front().scale;
quant_param_->out_quant_arg_.zp_ = -out_quant_args.front().zeroPoint;
quant_param_->output_activation_min_ = std::numeric_limits<int8_t>::min();
quant_param_->output_activation_max_ = std::numeric_limits<int8_t>::max();

const double input_real_multiplier =
MSMIN(quant_params_.in_quant_args_.scale_ * (1 << (unsigned int)(31 - 5)), (1ll << 31) - 1.0);
MSMIN(quant_param_->in_quant_args_.scale_ * (1 << (unsigned int)(31 - 5)), (1ll << 31) - 1.0);
int right_shift = 0;
QuantizeMultiplierSmallerThanOne(input_real_multiplier, &quant_params_.output_multiplier_, &right_shift);
quant_params_.shift_left_ = right_shift < 0 ? -right_shift : 0;
quant_params_.shift_right_ = right_shift > 0 ? right_shift : 0;
QuantizeMultiplierSmallerThanOne(input_real_multiplier, &quant_param_->output_multiplier_, &right_shift);
quant_param_->shift_left_ = right_shift < 0 ? -right_shift : 0;
quant_param_->shift_right_ = right_shift > 0 ? right_shift : 0;

if (!InferShapeDone()) {
return RET_OK;
@@ -91,7 +102,7 @@ int SoftmaxInt8CPUKernel::DoSoftmax(int task_id) {
int stride_size = stride * task_id * inner_size;

auto error_code = SoftmaxInt8(input_ptr + stride_size, output_ptr + stride_size, count, exp_data_ + stride_size,
sum_data_, quant_params_, softmax_param_);
sum_data_, quant_param_, softmax_param_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "DoSoftmax error task_id[" << task_id << "] error_code[" << error_code << "]";
return RET_ERROR;


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.h View File

@@ -27,7 +27,7 @@ class SoftmaxInt8CPUKernel : public SoftmaxBaseCPUKernel {
SoftmaxInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: SoftmaxBaseCPUKernel(parameter, inputs, outputs, ctx) {}
~SoftmaxInt8CPUKernel() = default;
~SoftmaxInt8CPUKernel() override;

int Init() override;
int ReSize() override;
@@ -37,7 +37,7 @@ class SoftmaxInt8CPUKernel : public SoftmaxBaseCPUKernel {
private:
int *sum_data_ = nullptr;
int *exp_data_ = nullptr;
SoftmaxQuantArg quant_params_;
SoftmaxQuantArg *quant_param_ = nullptr;
};
} // namespace mindspore::kernel



+ 40
- 25
mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc View File

@@ -25,6 +25,13 @@ using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_SubFusion;

namespace mindspore::kernel {
// Releases the SubQuantArg that Init() heap-allocates. free(nullptr) is a
// defined no-op, so no null guard is required before the call.
SubInt8CPUKernel::~SubInt8CPUKernel() {
  free(quant_param_);
  quant_param_ = nullptr;
}

int SubInt8CPUKernel::Init() {
lite::Tensor *input0 = in_tensors_.at(0);
lite::Tensor *input1 = in_tensors_.at(1);
@@ -35,37 +42,45 @@ int SubInt8CPUKernel::Init() {

broadcast_ = input0->ElementsNum() != input1->ElementsNum();

param_.in0_args_.scale_ = input0->quant_params().front().scale;
param_.in0_args_.zp_ = -input0->quant_params().front().zeroPoint;
param_.in1_args_.scale_ = input1->quant_params().front().scale;
param_.in1_args_.zp_ = -input1->quant_params().front().zeroPoint;
param_.out_args_.scale_ = output->quant_params().front().scale;
param_.out_args_.zp_ = output->quant_params().front().zeroPoint;
quant_param_ = reinterpret_cast<SubQuantArg *>(malloc(sizeof(SubQuantArg)));
if (quant_param_ == nullptr) {
MS_LOG(ERROR) << "Malloc SubQuantArg for Sub int8 op failed!";
return RET_ERROR;
}
quant_param_->in0_args_.scale_ = input0->quant_params().front().scale;
quant_param_->in0_args_.zp_ = -input0->quant_params().front().zeroPoint;
quant_param_->in1_args_.scale_ = input1->quant_params().front().scale;
quant_param_->in1_args_.zp_ = -input1->quant_params().front().zeroPoint;
quant_param_->out_args_.scale_ = output->quant_params().front().scale;
quant_param_->out_args_.zp_ = output->quant_params().front().zeroPoint;

const int left_shift = 20;
const double twice_max_input_scale = 2 * std::max(param_.in0_args_.scale_, param_.in1_args_.scale_);
const double real_input0_multiplier = param_.in0_args_.scale_ / twice_max_input_scale;
const double real_input1_multiplier = param_.in1_args_.scale_ / twice_max_input_scale;
const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * param_.out_args_.scale_);
const double twice_max_input_scale = 2 * std::max(quant_param_->in0_args_.scale_, quant_param_->in1_args_.scale_);
const double real_input0_multiplier = quant_param_->in0_args_.scale_ / twice_max_input_scale;
const double real_input1_multiplier = quant_param_->in1_args_.scale_ / twice_max_input_scale;
const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * quant_param_->out_args_.scale_);

QuantizeMultiplierSmallerThanOne(real_input0_multiplier, &param_.input0_multiplier_, &param_.input0_shift_);
QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &param_.input1_multiplier_, &param_.input1_shift_);
QuantizeMultiplierSmallerThanOne(real_output_multiplier, &param_.output_multiplier_, &param_.output_shift_);
QuantizeMultiplierSmallerThanOne(real_input0_multiplier, &quant_param_->input0_multiplier_,
&quant_param_->input0_shift_);
QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &quant_param_->input1_multiplier_,
&quant_param_->input1_shift_);
QuantizeMultiplierSmallerThanOne(real_output_multiplier, &quant_param_->output_multiplier_,
&quant_param_->output_shift_);

param_.output_activation_min_ = std::numeric_limits<int8_t>::min();
param_.output_activation_max_ = std::numeric_limits<int8_t>::max();
quant_param_->output_activation_min_ = std::numeric_limits<int8_t>::min();
quant_param_->output_activation_max_ = std::numeric_limits<int8_t>::max();

int left_shift0 = -param_.input0_shift_ > 0 ? -param_.input0_shift_ : 0;
param_.right_shift0_ = -param_.input0_shift_ > 0 ? 0 : param_.input0_shift_;
int left_shift0 = -quant_param_->input0_shift_ > 0 ? -quant_param_->input0_shift_ : 0;
quant_param_->right_shift0_ = -quant_param_->input0_shift_ > 0 ? 0 : quant_param_->input0_shift_;

int left_shift1 = -param_.input1_shift_ > 0 ? -param_.input1_shift_ : 0;
param_.right_shift1_ = -param_.input1_shift_ > 0 ? 0 : param_.input1_shift_;
int left_shift1 = -quant_param_->input1_shift_ > 0 ? -quant_param_->input1_shift_ : 0;
quant_param_->right_shift1_ = -quant_param_->input1_shift_ > 0 ? 0 : quant_param_->input1_shift_;

param_.left_shift_out_ = -param_.output_shift_ > 0 ? -param_.output_shift_ : 0;
param_.right_shift_out_ = -param_.output_shift_ > 0 ? 0 : param_.output_shift_;
quant_param_->left_shift_out_ = -quant_param_->output_shift_ > 0 ? -quant_param_->output_shift_ : 0;
quant_param_->right_shift_out_ = -quant_param_->output_shift_ > 0 ? 0 : quant_param_->output_shift_;

param_.left_shift_result0_ = (1 << left_shift) * ((1 << left_shift0));
param_.left_shift_result1_ = (1 << left_shift) * ((1 << left_shift1));
quant_param_->left_shift_result0_ = (1 << left_shift) * ((1 << left_shift0));
quant_param_->left_shift_result1_ = (1 << left_shift) * ((1 << left_shift1));

MS_ASSERT(left_shift + left_shift0 == left_shift);
MS_ASSERT(left_shift + left_shift1 == left_shift);
@@ -94,10 +109,10 @@ int SubInt8CPUKernel::DoExecute(int task_id) {
auto ret = RET_OK;
if (broadcast_) {
ret = SubInt8(tile0_data_ + task_id * stride, tile1_data_ + task_id * stride, output_data_ + task_id * stride,
count, &param_);
count, quant_param_);
} else {
ret = SubInt8(input0_data_ + task_id * stride, input1_data_ + task_id * stride, output_data_ + task_id * stride,
count, &param_);
count, quant_param_);
}

if (ret != RET_OK) {


+ 2
- 2
mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.h View File

@@ -31,7 +31,7 @@ class SubInt8CPUKernel : public LiteKernel {
explicit SubInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: LiteKernel(parameter, inputs, outputs, ctx) {}
~SubInt8CPUKernel() = default;
~SubInt8CPUKernel() override;

int Init() override;
int ReSize() override;
@@ -39,7 +39,7 @@ class SubInt8CPUKernel : public LiteKernel {
int DoExecute(int task_id);

private:
SubQuantArg param_;
SubQuantArg *quant_param_ = nullptr;
int8_t *tile0_data_ = nullptr;
int8_t *tile1_data_ = nullptr;
bool broadcast_ = false;


+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/int8/transpose_int8.h View File

@@ -62,7 +62,7 @@ class TransposeInt8CPUKernel : public LiteKernel {
int num_unit_ = 0;
int in_shape_[8] = {0};
int out_shape_[8] = {0};
int nhnc_param_[3];
int nhnc_param_[3] = {0};
};
} // namespace mindspore::kernel



+ 8
- 2
mindspore/lite/src/sub_graph_split.cc View File

@@ -29,6 +29,9 @@ const schema::Primitive *SearchSubGraph::CreatePartialPrimitive(int64_t subgraph
fbb.Finish(prim_offset);
auto tmp_buf = fbb.GetBufferPointer();
auto prim_buf = reinterpret_cast<char *>(malloc(fbb.GetSize()));
if (prim_buf == nullptr) {
return nullptr;
}
memcpy(prim_buf, tmp_buf, fbb.GetSize());

auto primitive = flatbuffers::GetRoot<schema::Primitive>(prim_buf);
@@ -59,6 +62,7 @@ void SearchSubGraph::ConvertSubGraphToModel() {
Model::Node *new_partial_node = new (std::nothrow) Model::Node();
if (new_partial_node == nullptr) {
MS_LOG(ERROR) << "New partial node failed!";
free(new_sub_graph);
return;
}
new_partial_node->name_ = "Partial-subgraph-split-" + std::to_string(new_sub_index);
@@ -230,15 +234,17 @@ void SearchSubGraph::InitMainGraphDevice() { return; }
void SearchSubGraph::SubgraphFusion() {
while (sub_graphs_.size() > 2) {
size_t sub1_index = 0;
int sub2_index = -1;
size_t sub2_index = 0;
bool is_found = false;
for (; sub1_index < sub_graphs_.size(); sub1_index++) {
for (size_t tmp2 = sub1_index + 1; tmp2 < sub_graphs_.size(); tmp2++) {
if (sub_graphs_[sub1_index].device_ == sub_graphs_[tmp2].device_) {
sub2_index = tmp2;
is_found = true;
break;
}
}
if (sub2_index != -1) {
if (!is_found) {
break;
}
}


Loading…
Cancel
Save