Browse Source

[MSLITE][Develop]int8 conv1x1 support arm32

tags/v1.0.0
ling 5 years ago
parent
commit
a3cc26ffcc
6 changed files with 102 additions and 22 deletions
  1. +15
    -0
      mindspore/lite/nnacl/int8/matmul_int8.c
  2. +2
    -1
      mindspore/lite/nnacl/int8/matmul_int8.h
  3. +1
    -0
      mindspore/lite/nnacl/matmul_parameter.h
  4. +1
    -0
      mindspore/lite/nnacl/op_base.h
  5. +81
    -21
      mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
  6. +2
    -0
      mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h

+ 15
- 0
mindspore/lite/nnacl/int8/matmul_int8.c View File

@@ -43,6 +43,21 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int co
} }
} }


/*
 * Repack a row-major int8 matrix (row x col) into tiles of C2NUM rows by
 * C16NUM columns.
 *
 * The destination is addressed as if every row were padded to
 * UP_ROUND(col, C16NUM) columns. Only the positions that correspond to real
 * source elements are written; padding bytes in dst_ptr are left untouched,
 * so the caller must pre-initialise the buffer if it needs them zeroed.
 *
 * src_ptr: source matrix, element (r, c) is read from src_ptr[r * col + c].
 * dst_ptr: destination buffer receiving the tiled layout.
 * row:     number of source rows.
 * col:     number of source columns.
 */
void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  const int padded_col = UP_ROUND(col, C16NUM);
  for (int r = 0; r < row; ++r) {
    /* Which band of C2NUM rows this row belongs to, and its slot inside it. */
    const int row_band = r / C2NUM;
    const int row_in_band = r % C2NUM;
    const int band_base = row_band * padded_col * C2NUM;
    const int src_base = r * col;
    for (int c = 0; c < col; ++c) {
      /* Which C16NUM-wide tile this column falls in, and its lane inside it. */
      const int col_tile = c / C16NUM;
      const int col_in_tile = c % C16NUM;
      const int dst_offset = band_base + col_tile * C2NUM * C16NUM + row_in_band * C16NUM + col_in_tile;
      dst_ptr[dst_offset] = src_ptr[src_base + c];
    }
  }
}

void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
int col4 = UP_ROUND(col, C4NUM); int col4 = UP_ROUND(col, C4NUM);
for (int r = 0; r < row; r++) { for (int r = 0; r < row; r++) {


+ 2
- 1
mindspore/lite/nnacl/int8/matmul_int8.h View File

@@ -42,7 +42,6 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
bool per_channel); bool per_channel);
void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col); void RowMajor2Row4x8MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);

void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16); void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16);
void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16); void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16);
void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order); void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
@@ -52,6 +51,8 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums
int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16, int act_max, int out_zp, int multiplier, int left_shift, int right_shift, int row, int col, int deep16,
int stride); int stride);


void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);

#ifdef ENABLE_ARM64 #ifdef ENABLE_ARM64
void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums, void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums,
const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift, const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift,


+ 1
- 0
mindspore/lite/nnacl/matmul_parameter.h View File

@@ -39,6 +39,7 @@ typedef struct MatMulParameter {
int row_8_; int row_8_;
int row_12_; int row_12_;
int row_16_; int row_16_;
int col_2_;
int col_4_; int col_4_;
int col_8_; int col_8_;
int deep_; int deep_;


+ 1
- 0
mindspore/lite/nnacl/op_base.h View File

@@ -21,6 +21,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdbool.h> #include <stdbool.h>


#define C2NUM 2
#define C4NUM 4 #define C4NUM 4
#define C8NUM 8 #define C8NUM 8
#define C12NUM 12 #define C12NUM 12


+ 81
- 21
mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc View File

@@ -86,6 +86,33 @@ void Convolution1x1Int8CPUKernel::CheckSupportOptimize() {
return; return;
} }


int Convolution1x1Int8CPUKernel::InitBiasByzp(void *src_weight, int input_channel, int output_channel) {
/* bias = bias - v2 x zp1 + zp1 x zp2 */
int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_);
int8_t *weight = reinterpret_cast<int8_t *>(src_weight);
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
for (int oc = 0; oc < output_channel; oc++) {
int32_t weight_sum_value = 0;
int32_t filter_zp = (filter_peroc_) ? conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_
: conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_;
for (int ic = 0; ic < input_channel; ic++) {
weight_sum_value += weight[oc * input_channel + ic];
}
bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
}

if (filter_peroc_) {
filter_zp_ptr_ = reinterpret_cast<int32_t *>(malloc(output_channel * sizeof(int32_t)));
if (filter_zp_ptr_ == nullptr) {
return RET_ERROR;
}
for (int fi = 0; fi < output_channel; fi++) {
filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
}
}
return RET_OK;
}

int Convolution1x1Int8CPUKernel::InitWeightBias() { int Convolution1x1Int8CPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex); auto filter_tensor = in_tensors_.at(kWeightIndex);
auto input_channel = filter_tensor->Channel(); auto input_channel = filter_tensor->Channel();
@@ -108,7 +135,6 @@ int Convolution1x1Int8CPUKernel::InitWeightBias() {
input_channel); input_channel);
} }


/* bias = bias - v2 x zp1 + zp1 x zp2 */
int col4 = UP_ROUND(output_channel, C4NUM); int col4 = UP_ROUND(output_channel, C4NUM);
int col8 = UP_ROUND(output_channel, C8NUM); int col8 = UP_ROUND(output_channel, C8NUM);
size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t); size = support_optimize_ ? col8 * sizeof(int32_t) : col4 * sizeof(int32_t);
@@ -122,28 +148,39 @@ int Convolution1x1Int8CPUKernel::InitWeightBias() {
memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t)); memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
} }


int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_);
int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->MutableData());
int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
for (int oc = 0; oc < output_channel; oc++) {
int32_t weight_sum_value = 0;
int32_t filter_zp = (filter_peroc_) ? conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_
: conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_;
for (int ic = 0; ic < input_channel; ic++) {
weight_sum_value += weight[oc * input_channel + ic];
}
bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
InitBiasByzp(filter_tensor->MutableData(), input_channel, output_channel);
return RET_OK;
}

int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
auto input_channel = filter_tensor->Channel();
auto output_channel = filter_tensor->Batch();

/* weight */
size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
packed_weight_ = reinterpret_cast<int8_t *>(malloc(size));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc weight error!";
return RET_ERROR;
} }
memset(packed_weight_, 0, size);
RowMajor2Row2x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->MutableData()), packed_weight_, output_channel,
input_channel);


if (filter_peroc_) {
filter_zp_ptr_ = reinterpret_cast<int32_t *>(malloc(output_channel * sizeof(int32_t)));
if (filter_zp_ptr_ == nullptr) {
return RET_ERROR;
}
for (int fi = 0; fi < output_channel; fi++) {
filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
}
/* bias */
int col2 = UP_ROUND(output_channel, C2NUM);
bias_data_ = malloc(col2 * sizeof(int32_t));
if (bias_data_ == nullptr) {
MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!";
return RET_ERROR;
}
memset(bias_data_, 0, size);
if (in_tensors_.size() == 3) {
memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t));
} }

InitBiasByzp(filter_tensor->MutableData(), input_channel, output_channel);
return RET_OK; return RET_OK;
} }


@@ -164,7 +201,11 @@ int Convolution1x1Int8CPUKernel::Init() {


CheckSupportOptimize(); CheckSupportOptimize();


#ifdef ENABLE_ARM32
ret = InitWeightBiasArm32();
#else
ret = InitWeightBias(); ret = InitWeightBias();
#endif
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed."; MS_LOG(ERROR) << "Init weight bias failed.";
return ret; return ret;
@@ -183,6 +224,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
matmul_param_->deep_ = conv_param_->input_channel_; matmul_param_->deep_ = conv_param_->input_channel_;
matmul_param_->col_ = conv_param_->output_channel_; matmul_param_->col_ = conv_param_->output_channel_;
matmul_param_->col_2_ = UP_ROUND(matmul_param_->col_, C2NUM);
matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM); matmul_param_->col_4_ = UP_ROUND(matmul_param_->col_, C4NUM);
matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM); matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM); matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
@@ -192,6 +234,10 @@ int Convolution1x1Int8CPUKernel::InitParam() {


int row_pack_count = 0; int row_pack_count = 0;
int col_pack_count = 0; int col_pack_count = 0;
#ifdef ENABLE_ARM32
row_pack_count = C4NUM;
col_pack_count = C2NUM;
#else
if (support_optimize_) { if (support_optimize_) {
row_pack_count = C8NUM; row_pack_count = C8NUM;
col_pack_count = C8NUM; col_pack_count = C8NUM;
@@ -199,6 +245,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
row_pack_count = C4NUM; row_pack_count = C4NUM;
col_pack_count = C4NUM; col_pack_count = C4NUM;
} }
#endif


/* init input sum size */ /* init input sum size */
if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
@@ -222,7 +269,7 @@ int Convolution1x1Int8CPUKernel::InitParam() {
memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)); memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t));
} }
return RET_OK; return RET_OK;
}
} // namespace mindspore::kernel


int Convolution1x1Int8CPUKernel::ReSize() { int Convolution1x1Int8CPUKernel::ReSize() {
FreeResizeBuf(); FreeResizeBuf();
@@ -260,6 +307,18 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
int32_t *cur_right_shift = conv_param_->conv_quant_arg_.right_shift_; int32_t *cur_right_shift = conv_param_->conv_quant_arg_.right_shift_;
int32_t *cur_multiplier = conv_param_->conv_quant_arg_.quant_multiplier_; int32_t *cur_multiplier = conv_param_->conv_quant_arg_.quant_multiplier_;


#ifdef ENABLE_ARM32
int cur_stride = thread_stride_ * C2NUM;
int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C2NUM;
int cur_oc = MSMIN(cur_stride, res_stride);
if (cur_oc <= 0) {
return RET_OK;
}
Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_,
output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum,
reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc,
matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
#else
if (support_optimize_) { if (support_optimize_) {
int cur_stride = thread_stride_ * C8NUM; int cur_stride = thread_stride_ * C8NUM;
int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C8NUM; int res_stride = matmul_param_->col_ - task_id * thread_stride_ * C8NUM;
@@ -296,6 +355,7 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) {
reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C4NUM, matmul_param_->row_, cur_oc, reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C4NUM, matmul_param_->row_, cur_oc,
matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_); matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_);
} }
#endif
return RET_OK; return RET_OK;
} }




+ 2
- 0
mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h View File

@@ -52,8 +52,10 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel {
void FreeResizeBuf(); void FreeResizeBuf();
int InitParam(); int InitParam();
int InitWeightBias(); int InitWeightBias();
int InitWeightBiasArm32();
void Pre1x1Trans(int8_t *src_input, int8_t *src_output); void Pre1x1Trans(int8_t *src_input, int8_t *src_output);
void CheckSupportOptimize(); void CheckSupportOptimize();
int InitBiasByzp(void *src_weight, int input_channel, int output_channel);


private: private:
int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */ int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */


Loading…
Cancel
Save