| @@ -735,6 +735,16 @@ void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int | |||
| return; | |||
| } | |||
| void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, | |||
| const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t *multiplier, ConvParameter *conv_param) { | |||
| MatMulInt8_4x2_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, | |||
| left_shift, right_shift, multiplier, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| conv_param->conv_quant_arg_.filter_arg_num_ != 1); | |||
| return; | |||
| } | |||
| void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, | |||
| const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift, | |||
| int32_t *multiplier, ConvParameter *conv_param) { | |||
| @@ -64,6 +64,9 @@ void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t | |||
// Optimized 1x1 int8 convolution; the actual matmul kernel is injected via matmul_func.
void Conv1x1Int8Opt(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                    const int32_t *bias, int row, int col, int deep4, int32_t *left_shift, int32_t *right_shift,
                    int32_t *multiplier, ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func);
// arm32 variant of the 1x1 int8 convolution; dispatches to the 4x2-tile matmul
// (MatMulInt8_4x2_r). Note it takes deep16 (depth rounded for 16-wide tiles),
// unlike the Opt variant's deep4.
void Conv1x1Int8Arm32(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum,
                      const int32_t *bias, int row, int col, int deep16, int32_t *left_shift, int32_t *right_shift,
                      int32_t *multiplier, ConvParameter *conv_param);
| // int8 convolution 3x3 | |||
| void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, | |||
| @@ -46,12 +46,12 @@ void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int co | |||
| void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { | |||
| int col16 = UP_ROUND(col, C16NUM); | |||
| for (int r = 0; r < row; r++) { | |||
| int rd4 = r / C2NUM; | |||
| int rm4 = r % C2NUM; | |||
| int rd2 = r / C2NUM; | |||
| int rm2 = r % C2NUM; | |||
| for (int c = 0; c < col; c++) { | |||
| int cd16 = c / C16NUM; | |||
| int cm16 = c % C16NUM; | |||
| int dst_index = rd4 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm4 * C16NUM + cm16; | |||
| int dst_index = rd2 * col16 * C2NUM + cd16 * C2NUM * C16NUM + rm2 * C16NUM + cm16; | |||
| int src_index = r * col + c; | |||
| dst_ptr[dst_index] = src_ptr[src_index]; | |||
| } | |||
| @@ -232,6 +232,40 @@ void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row | |||
| return; | |||
| } | |||
| void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||
| bool peroc) { | |||
| /* support per-layer && weight per-channel */ | |||
| /* row4x16-major * row16x2-major => (int8)row-major*/ | |||
| for (int r = 0; r < row; r++) { | |||
| for (int c = 0; c < col; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM; | |||
| int c2div = c / C2NUM, c2mod = c % C2NUM; | |||
| size_t ci = r * stride + c; | |||
| int32_t value = 0; | |||
| for (int d = 0; d < deep_16; d++) { | |||
| int d16div = d / C16NUM, d16mod = d % C16NUM; | |||
| size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; | |||
| size_t bi = c2div * deep_16 * C2NUM + d16div * C2NUM * C16NUM + c2mod * C16NUM + d16mod; | |||
| value = value + a[ai] * b[bi]; | |||
| } | |||
| int32_t cur_input_sum = | |||
| peroc ? input_sum[c2div * UP_ROUND(row, C4NUM) * C2NUM + r * C2NUM + c2mod] : input_sum[r]; | |||
| value -= cur_input_sum; | |||
| value += bias[c]; | |||
| int32_t cur_left_shift = peroc ? left_shift[c] : left_shift[0]; | |||
| int32_t cur_right_shift = peroc ? right_shift[c] : right_shift[0]; | |||
| int32_t cur_multiplier = peroc ? multiplier[c] : multiplier[0]; | |||
| value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; | |||
| value = MSMIN(maxi, value); | |||
| value = MSMAX(mini, value); | |||
| dst[ci] = (int8_t)value; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||
| @@ -52,6 +52,10 @@ void MatmulInt8(const int8_t *a, const int8_t *b, int8_t *dst, const int *a_sums | |||
| int stride); | |||
// Repacks a row-major int8 matrix into the row2x16-major tile layout
// (row tiles of 2, column tiles of 16) consumed by the 4x2 matmul.
void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
// Reference int8 matmul on the 4x2 tile layout; `peroc` selects
// per-(output-)channel quantization parameters instead of per-layer ones.
void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
                      bool peroc);
| #ifdef ENABLE_ARM64 | |||
| void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums, | |||
| @@ -404,14 +404,42 @@ void PackInputSum16x4PerChannel(const int8_t *input_value, int32_t *input_sum, i | |||
| return; | |||
| } | |||
| void PackInputSum16x4PerChannelArm32(const int8_t *input_value, int32_t *input_sum, int32_t *filter_zp_ptr, | |||
| size_t plane_size, size_t input_channel, size_t output_channel) { | |||
| size_t hw4 = UP_ROUND(plane_size, C4NUM); | |||
| size_t ic16 = UP_ROUND(input_channel, C16NUM); | |||
| for (int ri = 0; ri < plane_size; ri++) { | |||
| int ri4div = ri / C4NUM, ri4mod = ri % C4NUM; | |||
| for (int ci = 0; ci < output_channel; ci++) { | |||
| int32_t tmp_sum_value = 0; | |||
| int ci2div = ci / C2NUM, ci2mod = ci % C2NUM; | |||
| int32_t filter_zp = filter_zp_ptr[ci]; | |||
| for (int di = 0; di < input_channel; di++) { | |||
| size_t di16div = di / C16NUM, di16mod = di % C16NUM; | |||
| int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod; | |||
| tmp_sum_value += input_value[src_index]; | |||
| } | |||
| int dst_index = ci2div * C2NUM * hw4 + ri * C2NUM + ci2mod; | |||
| input_sum[dst_index] = tmp_sum_value * filter_zp; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| void PackInputSum16x4Int8(const int8_t *input, int32_t *input_sum, int32_t *filter_zp, ConvParameter *conv_param) { | |||
| size_t hw4 = UP_ROUND(conv_param->input_h_ * conv_param->input_w_, C4NUM); | |||
| size_t ic16 = UP_ROUND(conv_param->input_channel_, C16NUM); | |||
| if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { | |||
| PackInputSum16x4PerLayer(input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16); | |||
| } else { | |||
| #ifdef ENABLE_ARM32 | |||
| PackInputSum16x4PerChannelArm32(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_, | |||
| conv_param->input_channel_, conv_param->output_channel_); | |||
| #else | |||
| PackInputSum16x4PerChannel(input, input_sum, filter_zp, conv_param->input_h_ * conv_param->input_w_, | |||
| conv_param->input_channel_, conv_param->output_channel_); | |||
| #endif | |||
| } | |||
| return; | |||
| } | |||
| @@ -175,7 +175,7 @@ int Convolution1x1Int8CPUKernel::InitWeightBiasArm32() { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 arm32 Malloc bias_ptr_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, size); | |||
| memset(bias_data_, 0, col2 * sizeof(int32_t)); | |||
| if (in_tensors_.size() == 3) { | |||
| memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(int32_t)); | |||
| } | |||
| @@ -249,16 +249,16 @@ int Convolution1x1Int8CPUKernel::InitParam() { | |||
| /* init input sum size */ | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count); | |||
| input_sum_size_ = UP_ROUND(matmul_param_->col_, col_pack_count) * UP_ROUND(matmul_param_->row_, row_pack_count); | |||
| } else { | |||
| input_sum_size = UP_ROUND(matmul_param_->row_, row_pack_count); | |||
| input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count); | |||
| } | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, row_pack_count)); | |||
| thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, row_pack_count), thread_count_); | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, col_pack_count)); | |||
| thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, col_pack_count), thread_count_); | |||
| thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, col_pack_count)); | |||
| thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, col_pack_count), thread_count_hw_); | |||
| thread_count_hw_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, row_pack_count)); | |||
| thread_stride_hw_ = UP_DIV(UP_DIV(matmul_param_->row_, row_pack_count), thread_count_hw_); | |||
| if (pre_trans_input_) { | |||
| input_ptr_ = reinterpret_cast<int8_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t))); | |||
| @@ -269,7 +269,7 @@ int Convolution1x1Int8CPUKernel::InitParam() { | |||
| memset(input_ptr_, 0, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t)); | |||
| } | |||
| return RET_OK; | |||
| } // namespace mindspore::kernel | |||
| } | |||
| int Convolution1x1Int8CPUKernel::ReSize() { | |||
| FreeResizeBuf(); | |||
| @@ -314,10 +314,10 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) { | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_, | |||
| output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum, | |||
| reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, cur_oc, | |||
| matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_); | |||
| Conv1x1Int8Arm32(packed_input_, packed_weight_ + task_id * thread_stride_ * C2NUM * matmul_param_->deep_16_, | |||
| output_ptr_ + task_id * thread_stride_ * C2NUM, cur_input_sum, | |||
| reinterpret_cast<int32_t *>(bias_data_) + task_id * thread_stride_ * C2NUM, matmul_param_->row_, | |||
| cur_oc, matmul_param_->deep_16_, cur_left_shift, cur_right_shift, cur_multiplier, conv_param_); | |||
| #else | |||
| if (support_optimize_) { | |||
| int cur_stride = thread_stride_ * C8NUM; | |||
| @@ -392,7 +392,7 @@ int Convolution1x1Int8Impl(void *cdata, int task_id) { | |||
| } | |||
| int Convolution1x1Int8CPUKernel::InitRunBuf() { | |||
| input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size * sizeof(int32_t))); | |||
| input_sum_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(input_sum_size_ * sizeof(int32_t))); | |||
| if (input_sum_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| @@ -69,7 +69,7 @@ class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| size_t thread_count_hw_ = 1; | |||
| size_t thread_stride_hw_ = 0; | |||
| bool pre_trans_input_ = false; | |||
| size_t input_sum_size = 0; | |||
| size_t input_sum_size_ = 0; | |||
| MatMulParameter *matmul_param_ = nullptr; | |||
| MATMUL_OPT_R_FUNC matmul_func_ = nullptr; | |||
| bool support_optimize_ = false; | |||