2. Free redundant memory in fp16 kernels (tags/v1.1.0)
| @@ -19,193 +19,6 @@ | |||
| #include "nnacl/winograd_transform.h" | |||
| #include "nnacl/int8/common_func.h" | |||
// Indirect int8 GEMM for common convolution: accumulates int8 src x weight products into
// tmp_dst (int32), then requantizes (gemmlowp-style multiplier/shift), adds the output
// zero point, clamps to [act_min, act_max] and stores the int8 result in dst.
// On ARM the whole pipeline is delegated to a hand-written assembly kernel.
//   dst            - int8 output, tile-major: [tile][output_channel]
//   tmp_dst        - int32 scratch accumulator, same layout as dst (caller-zeroed)
//   src            - packed int8 input tiles (C4-blocked; see offset math below)
//   weight         - packed int8 filter (C4-blocked over oc, plane and ic)
//   bias           - int32 per-output-channel bias
//   input_sum      - precomputed per-tile (or per-tile-per-oc) input sums, used to
//                    correct for the filter zero point when quantization is asymmetric
void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                      int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
                      ConvParameter *conv_param) {
  // Requantization parameters; index [0] is used for per-layer quantization,
  // index [oc] when FILTER_PER_CHANNEL is set.
  int32_t *shift_before = conv_param->conv_quant_arg_.left_shift_;
  int32_t *shift_after = conv_param->conv_quant_arg_.right_shift_;
  int32_t *out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
  int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
  int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
  int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
  int oc4 = UP_DIV(output_channel, C4NUM);
#ifdef ENABLE_ARM64
  // Asymmetric/per-channel flags are passed through to the asm kernel as size_t booleans.
  size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
  size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
  IndirectGemmInt8_4x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
                       output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
                       shift_before, shift_after, asymmetric, per_channel, oc4 * C4NUM * sizeof(int32_t));
#elif ENABLE_ARM32
  size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
  size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
  IndirectGemmInt8_2x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
                       output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
                       shift_before, shift_after, asymmetric, per_channel, oc4 * C4NUM * sizeof(int32_t));
#else
  // Portable C fallback. src/weight are C4-blocked: both the kernel plane and the input
  // channels are padded to multiples of C4NUM, hence the block/res offset decomposition.
  int tile_num = conv_param->tile_num_;
  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
  for (int oc = 0; oc < output_channel; oc++) {
    int oc4_block = oc / C4NUM;
    int oc4_res = oc % C4NUM;
    int weight_oc4_offset = oc4_block * C4NUM * plane_c4 * C4NUM * ic4 * C4NUM + oc4_res * C4NUM * C4NUM;
    int dst_oc_offset = oc;
    for (int n = 0; n < tile_num; n++) {
      int src_tile_offset = n * C4NUM * C4NUM;
      int dst_tile_offset = dst_oc_offset + n * output_channel;
      for (int b = 0; b < kernel_plane; b++) {
        int plane_c4_block = b / C4NUM;
        int plane_c4_res = b % C4NUM;
        int src_plane_offset = src_tile_offset + plane_c4_block * tile_num * C4NUM * ic4 * C4NUM + plane_c4_res * C4NUM;
        int weight_plane_offset =
          weight_oc4_offset + plane_c4_block * C4NUM * C4NUM * ic4 * C4NUM + plane_c4_res * C4NUM;
        for (int i = 0; i < ic4; i++) {
          int src_ic4_offset = src_plane_offset + i * tile_num * C4NUM * C4NUM;
          int weight_ic4_offset = weight_plane_offset + i * C4NUM * C4NUM * C4NUM;
          for (int j = 0; j < C4NUM; j++) {
            int weight_ic_offset = weight_ic4_offset + j;
            // int8 x int8 products accumulated into the int32 scratch buffer.
            tmp_dst[dst_tile_offset] += weight[weight_ic_offset] * src[src_ic4_offset + j];
          }  // in c4num loop
        }    // ic4 loop
      }      // kernel_plane loop
      // Requantization: four branches for the (asymmetric x per_channel) combinations.
      // When the filter is asymmetric, input_sum compensates for the filter zero point
      // (per tile, or per tile-and-channel when per-channel quantization is on).
      if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
          (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // symmetric + per-channel: use per-oc multiplier/shifts.
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
          -shift_after[oc]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      } else if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                 !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // symmetric + per-layer: single multiplier/shift at index 0.
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
          -shift_after[0]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                 !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // asymmetric + per-layer: subtract the per-tile input sum before requantizing.
        tmp_dst[dst_tile_offset] -= input_sum[n];
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
          -shift_after[0]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                 (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // asymmetric + per-channel: input_sum is indexed per (tile, oc), stride oc4*C4NUM.
        tmp_dst[dst_tile_offset] -= input_sum[n * oc4 * C4NUM + oc];
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
          -shift_after[oc]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      }
    }  // tile_num loop
  }    // output_channel loop
#endif
}
// Optimized variant of IndirectGemmInt8: when an assembly GEMM (gemm_func, loaded at
// runtime on aarch64) is available it is used directly; otherwise a portable C fallback
// runs. The fallback uses a slightly different packing than IndirectGemmInt8: the kernel
// plane is NOT C4-padded here (compare the offset math), only channels are C4-blocked.
// Requantization and activation clamping are identical to IndirectGemmInt8.
void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                         int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
                         ConvParameter *conv_param, GEMM_FUNC gemm_func) {
  // Quantization parameters; [0] for per-layer, [oc] for per-channel quantization.
  int32_t *shift_before = conv_param->conv_quant_arg_.left_shift_;
  int32_t *shift_after = conv_param->conv_quant_arg_.right_shift_;
  int32_t *out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
  int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
  int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
  int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
  int oc4 = UP_DIV(output_channel, C4NUM);
  if (gemm_func != NULL) {
#ifdef __aarch64__
    size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
    size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
    gemm_func(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), input_sum,
              act_min, act_max, out_zp, out_multiplier, shift_before, shift_after, asymmetric, per_channel,
              oc4 * C4NUM * sizeof(int32_t));
#endif
  } else {
    // Portable C fallback (also taken on aarch64 if the runtime kernel failed to load,
    // in which case the caller passes gemm_func == NULL).
    int tile_num = conv_param->tile_num_;
    for (int oc = 0; oc < output_channel; oc++) {
      int oc4_block = oc / C4NUM;
      int oc4_res = oc % C4NUM;
      // Note: no plane_c4 factor here — the plane dimension is unpadded in this packing.
      int weight_oc4_offset = oc4_block * C4NUM * kernel_plane * ic4 * C4NUM + oc4_res * C4NUM;
      int dst_oc_offset = oc;
      for (int n = 0; n < tile_num; n++) {
        int src_tile_offset = n * C4NUM;
        int dst_tile_offset = dst_oc_offset + n * output_channel;
        for (int b = 0; b < kernel_plane; b++) {
          int src_plane_offset = src_tile_offset + b * tile_num * ic4 * C4NUM;
          int weight_plane_offset = weight_oc4_offset + b * C4NUM * ic4 * C4NUM;
          for (int i = 0; i < ic4; i++) {
            int src_ic4_offset = src_plane_offset + i * tile_num * C4NUM;
            int weight_ic4_offset = weight_plane_offset + i * C4NUM * C4NUM;
            for (int j = 0; j < C4NUM; j++) {
              int weight_ic_offset = weight_ic4_offset + j;
              tmp_dst[dst_tile_offset] += weight[weight_ic_offset] * src[src_ic4_offset + j];
            }  // in c4num loop
          }    // ic4 loop
        }      // kernel_plane loop
        // Same 4-way (asymmetric x per_channel) requantization as IndirectGemmInt8.
        if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
            (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // symmetric + per-channel.
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
            -shift_after[oc]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        } else if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                   !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // symmetric + per-layer.
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
            -shift_after[0]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                   !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // asymmetric + per-layer: per-tile input-sum correction.
          tmp_dst[dst_tile_offset] -= input_sum[n];
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
            -shift_after[0]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                   (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // asymmetric + per-channel: per-(tile, oc) input-sum correction.
          tmp_dst[dst_tile_offset] -= input_sum[n * oc4 * C4NUM + oc];
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
            -shift_after[oc]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        }
      }  // tile_num loop
    }    // output_channel loop
  }
}
| void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, int oc, int ic8, size_t real_cal_num) { | |||
| int oc4 = UP_DIV(oc, C4NUM); | |||
| #ifdef ENABLE_ARM | |||
| @@ -249,73 +62,9 @@ void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, in | |||
| #endif | |||
| } | |||
// int8 conv common
// Driver for the common int8 convolution: per batch, splits the output plane into
// tiles of tile_n pixels, im2col-packs each tile (accumulating input sums for the
// asymmetric-quantization correction), then runs IndirectGemmInt8 on it. Full tiles
// write straight into output_data; the final partial tile goes through tmp_out and
// is memcpy'd so the GEMM can safely write a full tile's worth of columns.
// Threading: tiles are distributed round-robin over threads via
// (task_id, thread_count); all scratch buffers are indexed by task_id.
void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, const int32_t *bias_data,
              int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id,
              ConvParameter *conv_param) {
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int in_batch = conv_param->input_batch_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;
  int out_channel = conv_param->output_channel_;
  int oc4 = UP_DIV(out_channel, C4NUM);
  int32_t input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
  int tile_n = conv_param->tile_num_;
  int thread_count = conv_param->thread_num_;
  int output_count = out_h * out_w;
  int output_tile_count = UP_DIV(output_count, tile_n);
  int ic4 = UP_DIV(in_channel, C4NUM);
  int kernel_plane = kernel_h * kernel_w;
  int plane_block = UP_DIV(kernel_plane, C4NUM);
  // Size of one packed im2col column: C4-padded plane x C4-padded channels.
  int unit_size = plane_block * C4NUM * ic4 * C4NUM;
  // Per-channel quantization needs one input sum per (tile, oc); per-layer needs one per tile.
  int input_sum_offset;
  if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
    input_sum_offset = tile_n * oc4 * C4NUM;
  } else {
    input_sum_offset = tile_n;
  }
  for (int b = 0; b < in_batch; b++) {
    int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
    int out_batch_offset = b * out_channel * out_h * out_w;
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * tile_n;
      int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
      int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
      // clear tmp buffer before compute
      // Padding lanes are filled with the input zero point so they contribute zero
      // after the zero-point correction.
      memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
      int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
      size_t tmp_dst_size = tile_n * conv_param->output_channel_ * sizeof(int32_t);
      int tmp_dst_offset = task_id * tile_n * conv_param->output_channel_;
      memset(tmp_dst + tmp_dst_offset, 0, tmp_dst_size);
      Im2ColPackUnitInt8(input_data + in_batch_offset, gemm_input, real_cal_num, start_index, tmp_input_sum,
                         conv_param);
      if (real_cal_num == tile_n) {
        // Full tile: GEMM writes directly into the output tensor.
        int8_t *gemm_output = output_data + out_offset;
        IndirectGemmInt8(gemm_output, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane,
                         out_channel, tmp_input_sum, conv_param);
      } else {
        // res part
        // Partial last tile: compute into tmp_out, then copy only the valid rows.
        int8_t *tmp_out_ptr = tmp_out + task_id * tile_n * out_channel;
        IndirectGemmInt8(tmp_out_ptr, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane,
                         out_channel, tmp_input_sum, conv_param);
        memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel);
      }
    }
  }
}
| void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int8_t *packed_weight, | |||
| const int32_t *bias_data, int8_t *output_data, int32_t *filter_zp, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func) { | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func, bool is_optimize) { | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| int in_batch = conv_param->input_batch_; | |||
| @@ -325,18 +74,29 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, | |||
| int out_h = conv_param->output_h_; | |||
| int out_w = conv_param->output_w_; | |||
| int out_channel = conv_param->output_channel_; | |||
| int oc8 = UP_DIV(out_channel, C8NUM); | |||
| int tile_n = conv_param->tile_num_; | |||
| int thread_count = conv_param->thread_num_; | |||
| int output_count = out_h * out_w; | |||
| int output_tile_count = UP_DIV(output_count, tile_n); | |||
| int ic4 = UP_DIV(in_channel, C4NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = UP_ROUND(kernel_plane * in_channel, C4NUM); | |||
| int unit_size; | |||
| int input_sum_offset; | |||
| int up_round_oc; | |||
| #ifdef ENABLE_ARM32 | |||
| up_round_oc = UP_ROUND(out_channel, C2NUM); | |||
| unit_size = UP_ROUND(kernel_plane * in_channel, C16NUM); | |||
| #else | |||
| if (is_optimize) { | |||
| up_round_oc = UP_ROUND(out_channel, C8NUM); | |||
| unit_size = UP_ROUND(kernel_plane * in_channel, C4NUM); | |||
| } else { | |||
| up_round_oc = UP_ROUND(out_channel, C4NUM); | |||
| unit_size = UP_ROUND(kernel_plane * in_channel, C16NUM); | |||
| } | |||
| #endif | |||
| bool per_channel; | |||
| if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_offset = tile_n * oc8 * C8NUM; | |||
| input_sum_offset = tile_n * up_round_oc; | |||
| per_channel = true; | |||
| } else { | |||
| input_sum_offset = tile_n; | |||
| @@ -344,7 +104,7 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, | |||
| } | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * ic4 * C4NUM * in_h * in_w; | |||
| int in_batch_offset = b * in_channel * in_h * in_w; | |||
| int out_batch_offset = b * out_channel * out_h * out_w; | |||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| @@ -354,15 +114,38 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, | |||
| int8_t *matmul = matmul_input + task_id * kernel_plane * in_channel * tile_n; | |||
| memset(matmul, conv_param->conv_quant_arg_.input_quant_args_[0].zp_, kernel_plane * in_channel * tile_n); | |||
| Im2ColPackUnitInt8Opt(input_data + in_batch_offset, gemm_input, matmul, real_cal_num, start_index, filter_zp, | |||
| tmp_input_sum, conv_param, per_channel); | |||
| tmp_input_sum, conv_param, per_channel, is_optimize); | |||
| int out_offset = thread_id * tile_n * out_channel + out_batch_offset; | |||
| int8_t *gemm_output = output_data + out_offset; | |||
| matmul_func(gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, | |||
| tmp_input_sum, bias_data, conv_param->conv_quant_arg_.left_shift_, | |||
| conv_param->conv_quant_arg_.right_shift_, conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], | |||
| conv_param->conv_quant_arg_.out_act_max_[0], per_channel); | |||
| #ifdef ENABLE_ARM32 | |||
| MatmulInt8Neon32( | |||
| gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, tmp_input_sum, bias_data, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, out_channel, per_channel); | |||
| #elif ENABLE_ARM64 | |||
| if (is_optimize) { | |||
| matmul_func(gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, | |||
| tmp_input_sum, bias_data, conv_param->conv_quant_arg_.left_shift_, | |||
| conv_param->conv_quant_arg_.right_shift_, conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], | |||
| conv_param->conv_quant_arg_.out_act_max_[0], per_channel); | |||
| } else { | |||
| MatmulInt8Neon64(gemm_input, packed_weight, gemm_output, UP_ROUND(real_cal_num, C4NUM), | |||
| UP_ROUND(out_channel, C4NUM), unit_size, tmp_input_sum, bias_data, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.left_shift_, | |||
| conv_param->conv_quant_arg_.right_shift_, real_cal_num, out_channel, out_channel, per_channel); | |||
| } | |||
| #else | |||
| MatMulInt8_8x8_r( | |||
| gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, tmp_input_sum, | |||
| bias_data, conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], per_channel); | |||
| #endif | |||
| } | |||
| } | |||
| } | |||
| @@ -28,30 +28,13 @@ | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before, | |||
| int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset); | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum, | |||
| ConvParameter *conv_param); | |||
| void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum, | |||
| ConvParameter *conv_param, GEMM_FUNC gemm_func); | |||
| // int8 conv common | |||
| void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, const int32_t *bias_data, | |||
| int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param); | |||
| void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int8_t *packed_weight, | |||
| const int32_t *bias_data, int8_t *output_data, int32_t *filter_zp, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func); | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func, bool is_optimize); | |||
| // int8 convolution 1x1 | |||
| void Conv1x1PreOptPeroc(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, size_t input_channel, | |||
| @@ -260,93 +260,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa | |||
| } // tile num loop | |||
| } | |||
// im2col packing for the common int8 convolution (NHWC input). For each of the
// real_cal_num output pixels starting at block_index, gathers the kernel_h x kernel_w
// receptive field into packed_input using the C4-blocked layout consumed by
// IndirectGemmInt8, while accumulating the sum of all packed input values
// (input_accumulator). Out-of-image taps and channel-padding lanes are counted at the
// input zero point so the accumulated sum matches what the GEMM actually multiplies.
// When the filter is asymmetrically quantized, input_sum is filled with
// accumulator * filter_zp (per output channel, or a single per-layer value).
void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
                        int32_t *input_sum, ConvParameter *conv_param) {
  // input format : nhwc
  int tile_num = conv_param->tile_num_;
  QuantArg *filter_arg = conv_param->conv_quant_arg_.filter_quant_args_;
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int ic4_minus = in_channel / C4NUM;   // full C4 channel blocks
  int ic4 = UP_DIV(in_channel, C4NUM);  // channel blocks incl. the padded remainder
  int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
  int out_w = conv_param->output_w_;
  for (int i = 0; i < real_cal_num; i++) {
    // Map the linear output index back to the top-left input coordinate of its window.
    int block_start = block_index + i;
    int input_h = block_start / out_w * stride_h - pad_h;
    int input_w = block_start % out_w * stride_w - pad_w;
    int input_cal_num_offset = i * C4NUM * C4NUM;
    int32_t input_accumulator = 0;
    for (int j = 0; j < kernel_h; j++) {
      int input_y = input_h + j * dilation_h;
      if (input_y < 0 || input_y >= in_h) {
        // Entire kernel row falls outside the image: the packed buffer was pre-filled
        // with input_zp, so just account for it in the accumulator.
        input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_ * kernel_w;
        continue;
      }
      int input_y_stride = input_y * in_w * in_channel;
      for (int n = 0; n < kernel_w; n++) {
        int input_x = input_w + n * dilation_w;
        if (input_x < 0 || input_x >= in_w) {
          // Single out-of-image tap: count its zero-point contribution.
          input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
          continue;
        }
        int input_x_stride = input_y_stride + input_x * in_channel;
        // Destination offset inside the C4-blocked (plane, channel) packing.
        int plane_c4_block = (j * kernel_w + n) / C4NUM;
        int plane_c4_res = (j * kernel_w + n) % C4NUM;
        int input_plane_offset =
          plane_c4_block * tile_num * C4NUM * C4NUM * ic4 + plane_c4_res * C4NUM + input_cal_num_offset;
        // Copy full 4-channel groups and fold them into the accumulator.
        for (int m = 0; m < ic4_minus; m++) {
          int channel_block_stride = input_x_stride + m * C4NUM;
          int channel_block_offset = input_plane_offset + m * tile_num * C4NUM * C4NUM;
          (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
          (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
          (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
          (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
          input_accumulator += (packed_input + channel_block_offset)[0];
          input_accumulator += (packed_input + channel_block_offset)[1];
          input_accumulator += (packed_input + channel_block_offset)[2];
          input_accumulator += (packed_input + channel_block_offset)[3];
        }  // channel_block loop
        // Trailing channels that don't fill a whole C4 group.
        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
        for (int l = 0; l < ic_res; ++l) {
          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
          packed_input[channel_block_offset] = input_data[channel_block_stride];
          input_accumulator += (packed_input + channel_block_offset)[0];
        }
        // Padding lanes of the last C4 group hold input_zp — account for them too.
        for (int l = 0; l < (C4NUM - ic_res); l++) {
          input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
        }
      }  // kernel_w loop
    }    // kernel_h loop
    // Emit the filter-zero-point correction terms only for asymmetric quantization.
    if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
      continue;
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
      // Per-channel: one correction per output channel, stride oc4*C4NUM per tile.
      int cal_num_offset = i * oc4 * C4NUM;
      for (int l = 0; l < conv_param->output_channel_; ++l) {
        input_sum[cal_num_offset + l] = input_accumulator * filter_arg[l].zp_;
      }
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
      // Per-layer: a single correction per tile.
      input_sum[i] = input_accumulator * filter_arg[0].zp_;
    }
  }  // tile num loop
}
| void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num, | |||
| int block_index, int32_t *filter_zp, int32_t *input_sum, ConvParameter *conv_param, | |||
| bool per_channel) { | |||
| bool per_channel, bool is_optimize) { | |||
| // input format : nhwc | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| @@ -389,11 +305,29 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_ | |||
| } // kernel_h loop | |||
| } | |||
| } // tile num loop | |||
| if (per_channel) { | |||
| Conv1x1PreOptPeroc(matmul_input, packed_input, input_sum, kernel_plane * in_channel, conv_param->output_channel_, | |||
| real_cal_num, filter_zp, C8NUM * C8NUM); | |||
| int deep = kernel_plane * in_channel; | |||
| if (is_optimize) { | |||
| if (per_channel) { | |||
| Conv1x1PreOptPeroc(matmul_input, packed_input, input_sum, deep, conv_param->output_channel_, real_cal_num, | |||
| filter_zp, C8NUM * C8NUM); | |||
| } else { | |||
| Conv1x1PreOptPert(matmul_input, packed_input, input_sum, deep, real_cal_num, conv_param); | |||
| } | |||
| } else { | |||
| Conv1x1PreOptPert(matmul_input, packed_input, input_sum, kernel_plane * in_channel, real_cal_num, conv_param); | |||
| RowMajor2Row16x4MajorInt8(matmul_input, packed_input, real_cal_num, deep); | |||
| if (per_channel) { | |||
| #ifdef ENABLE_ARM32 | |||
| PackInputSum16x4PerChannelArm32(packed_input, input_sum, filter_zp, real_cal_num, deep, | |||
| conv_param->output_channel_); | |||
| #else | |||
| PackInputSum16x4PerChannel(packed_input, input_sum, filter_zp, real_cal_num, deep, conv_param->output_channel_); | |||
| #endif | |||
| } else { | |||
| size_t hw4 = UP_ROUND(real_cal_num, C4NUM); | |||
| size_t ic16 = UP_ROUND(deep, C16NUM); | |||
| PackInputSum16x4PerLayer(packed_input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, | |||
| ic16); | |||
| } | |||
| } | |||
| } | |||
| @@ -32,12 +32,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa | |||
| void PackHWCToWHC(const float *src, float *dst, int height, int width, int channel); | |||
| void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index, | |||
| int32_t *input_sum, ConvParameter *conv_param); | |||
| void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num, | |||
| int block_index, int32_t *filter_zp, int32_t *input_sum, ConvParameter *conv_param, | |||
| bool per_channel); | |||
| bool per_channel, bool is_optimize); | |||
| void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16); | |||
| @@ -53,6 +53,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| } | |||
| PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||
| weight_tensor->Batch()); | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t))); | |||
| if (bias_data_ == nullptr) { | |||
| @@ -78,6 +78,10 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| MS_LOG(ERROR) << "winograd filter transfrom failed."; | |||
| return ret; | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| // init bias | |||
| bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t)); | |||
| @@ -33,9 +33,8 @@ using mindspore::schema::PrimitiveType_Conv2D; | |||
| namespace mindspore::kernel { | |||
| void ConvolutionInt8CPUKernel::CheckSupportOptimize() { | |||
| tile_num_ = 8; | |||
| matmul_func_ = MatMulInt8_8x8_r; | |||
| #ifdef ENABLE_ARM32 | |||
| tile_num_ = 2; | |||
| tile_num_ = 4; | |||
| support_optimize_ = false; | |||
| #endif | |||
| @@ -48,138 +47,43 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() { | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| tile_num_ = 4; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| } else { | |||
| tile_num_ = 4; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } | |||
| #endif | |||
| conv_param_->tile_num_ = tile_num_; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitWeightBias() { | |||
| int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| int kernel_h = filter_tensor->Height(); | |||
| int kernel_w = filter_tensor->Width(); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| conv_param_->input_channel_ = input_channel; | |||
| conv_param_->output_channel_ = output_channel; | |||
| int ic4 = UP_DIV(input_channel, C4NUM); | |||
| int oc4 = UP_DIV(output_channel, C4NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int plane_c4 = UP_DIV(kernel_plane, C4NUM); | |||
| int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * plane_c4 * C4NUM; | |||
| auto filter_arg = conv_param_->conv_quant_arg_.filter_quant_args_; | |||
| int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| // init weight | |||
| auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->MutableData()); | |||
| packed_weight_ = reinterpret_cast<int8_t *>(malloc(pack_weight_size)); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_weight_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, pack_weight_size); | |||
| auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel)); | |||
| if (weight_sum == nullptr) { | |||
| MS_LOG(ERROR) << "malloc weight_sum failed."; | |||
| return RET_ERROR; | |||
| } | |||
| for (int i = 0; i < output_channel; i++) { | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_; | |||
| } else { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_; | |||
| } | |||
| } | |||
| PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum); | |||
| // init bias | |||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(oc4 * C4NUM * sizeof(int32_t))); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_data_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t)); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||
| memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t)); | |||
| } else { | |||
| MS_ASSERT(in_tensors_.size() == kInputSize1); | |||
| } | |||
| auto *bias_data = reinterpret_cast<int32_t *>(bias_data_); | |||
| int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| for (int i = 0; i < output_channel; i++) { | |||
| bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp; | |||
| } | |||
| int up_round_deep; | |||
| int up_round_oc; | |||
| #ifdef ENABLE_ARM32 | |||
| up_round_oc = UP_ROUND(output_channel, C2NUM); | |||
| up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM); | |||
| #else | |||
| if (support_optimize_) { | |||
| up_round_oc = UP_ROUND(output_channel, C8NUM); | |||
| up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM); | |||
| } else { | |||
| for (int i = 0; i < output_channel; i++) { | |||
| bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp; | |||
| } | |||
| up_round_oc = UP_ROUND(output_channel, C4NUM); | |||
| up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM); | |||
| } | |||
| free(weight_sum); | |||
| size_t input_sum_size; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = oc4 * C4NUM * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } else { | |||
| input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } | |||
| input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size)); | |||
| if (input_sum_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(input_sum_, 0, input_sum_size); | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitTmpBuffer() { | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| int plane_c4 = UP_DIV(kernel_plane, C4NUM); | |||
| int unit_size = plane_c4 * C4NUM * ic4 * C4NUM; | |||
| packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(unit_size * thread_count_ * tile_num_)); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t); | |||
| tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size)); | |||
| if (tmp_dst_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp_dst_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| tmp_out_ = | |||
| reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_)); | |||
| if (tmp_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| int kernel_h = filter_tensor->Height(); | |||
| int kernel_w = filter_tensor->Width(); | |||
| conv_param_->input_channel_ = input_channel; | |||
| conv_param_->output_channel_ = output_channel; | |||
| int oc8 = UP_DIV(output_channel, C8NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM); | |||
| int pack_weight_size = oc8 * C8NUM * up_round_deep; | |||
| #endif | |||
| int pack_weight_size = up_round_oc * up_round_deep; | |||
| int bias_size = up_round_oc * sizeof(int32_t); | |||
| int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| // init weight | |||
| @@ -190,15 +94,23 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, pack_weight_size); | |||
| RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| #ifdef ENABLE_ARM32 | |||
| RowMajor2Row2x16MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| #else | |||
| if (support_optimize_) { | |||
| RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| } else { | |||
| RowMajor2Row16x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| } | |||
| #endif | |||
| // init bias | |||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(oc8 * C8NUM * sizeof(int32_t))); | |||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(bias_size)); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_data_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, oc8 * C8NUM * sizeof(int32_t)); | |||
| memset(bias_data_, 0, bias_size); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||
| memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t)); | |||
| @@ -225,7 +137,7 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| size_t input_sum_size; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = oc8 * C8NUM * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| input_sum_size = up_round_oc * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } else { | |||
| input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } | |||
| @@ -241,14 +153,19 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| int tmp_unit = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM); | |||
| int tmp_size; | |||
| if (support_optimize_) { | |||
| tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM); | |||
| } else { | |||
| tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C16NUM); | |||
| } | |||
| matmul_packed_input_ = reinterpret_cast<int8_t *>( | |||
| ctx_->allocator->Malloc(thread_count_ * tile_num_ * kernel_plane * conv_param_->input_channel_)); | |||
| if (matmul_packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc matmul_packed_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_unit * thread_count_ * tile_num_)); | |||
| packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_size * thread_count_ * tile_num_)); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_input_ failed."; | |||
| return RET_ERROR; | |||
| @@ -263,26 +180,13 @@ int ConvolutionInt8CPUKernel::Init() { | |||
| MS_LOG(ERROR) << "Set quant param failed."; | |||
| return ret; | |||
| } | |||
| // init for opt | |||
| if (support_optimize_) { | |||
| ret = InitWeightBiasOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Initialization for optimized int8 conv failed."; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| ret = SetIfAsymmetric(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set if per asymmetric failed."; | |||
| return ret; | |||
| } | |||
| // init for situation that not support sdot | |||
| ret = InitWeightBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ret = InitWeightBiasOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Initialization for optimized int8 conv failed."; | |||
| return RET_ERROR; | |||
| } | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| @@ -308,14 +212,9 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) { | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData()); | |||
| auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData()); | |||
| if (support_optimize_) { | |||
| ConvInt8Opt(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_, | |||
| reinterpret_cast<int32_t *>(bias_data_), output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_, | |||
| matmul_func_); | |||
| } else { | |||
| ConvInt8(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, | |||
| output_addr, input_sum_, task_id, conv_param_); | |||
| } | |||
| ConvInt8Opt(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_, | |||
| reinterpret_cast<int32_t *>(bias_data_), output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_, | |||
| matmul_func_, support_optimize_); | |||
| return RET_OK; | |||
| } | |||
| @@ -330,18 +229,10 @@ int ConvolutionInt8Impl(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionInt8CPUKernel::Run() { | |||
| if (support_optimize_) { | |||
| auto ret = InitTmpBufferOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto ret = InitTmpBufferOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionInt8Impl, this, thread_count_); | |||
| @@ -369,18 +260,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> & | |||
| int dilation_w = conv_param->dilation_w_; | |||
| kernel::LiteKernel *kernel; | |||
| if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| #elif ENABLE_ARM32 | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| #else | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| #endif | |||
| } else if (kernel_h == 1 && kernel_w == 1) { | |||
| kernel = new (std::nothrow) kernel::Convolution1x1Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| @@ -53,8 +53,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| void CheckSupportOptimize(); | |||
| int InitWeightBiasOpt(); | |||
| int InitTmpBufferOpt(); | |||
| int InitWeightBias(); | |||
| int InitTmpBuffer(); | |||
| private: | |||
| void FreeTmpBuffer() { | |||
| @@ -169,68 +169,6 @@ TEST_F(TestPack, PackInputFp16) { | |||
| } | |||
| #endif | |||
| TEST_F(TestPack, PackInputUint8) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| int in_batch = conv_param->input_batch_; | |||
| int in_channel = conv_param->input_channel_; | |||
| int in_h = conv_param->input_h_; | |||
| int in_w = conv_param->input_w_; | |||
| int out_h = conv_param->output_h_; | |||
| int out_w = conv_param->output_w_; | |||
| int thread_count = 1; | |||
| int tile_n = 8; | |||
| int output_count = out_h * out_w; | |||
| int output_tile_count = UP_DIV(output_count, tile_n); | |||
| int inchannel_block = 4; | |||
| int channel_block = UP_DIV(in_channel, inchannel_block); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * channel_block * inchannel_block; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| // input | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/conv/convuint8_input_1_28_28_3.bin"; | |||
| auto input_data = reinterpret_cast<uint8_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto int8_input = reinterpret_cast<int8_t *>(malloc(input_size)); | |||
| for (int i = 0; i < input_size; i++) { | |||
| int8_input[i] = (int8_t)(input_data[i] - 128); | |||
| } | |||
| auto packed_input = reinterpret_cast<int8_t *>(malloc(in_batch * packed_input_size)); | |||
| memset(packed_input, 0, in_batch * packed_input_size); | |||
| int32_t *input_sum = reinterpret_cast<int32_t *>(malloc(tile_n * thread_count * sizeof(int32_t))); | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * in_channel * in_h * in_w; | |||
| int gemm_in_batch_offset = b * packed_input_size; | |||
| for (int thread_id = 0; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - tile_n) : tile_n; | |||
| int8_t *gemm_input = | |||
| reinterpret_cast<int8_t *>(packed_input) + thread_id * unit_size * tile_n + gemm_in_batch_offset; | |||
| memset(input_sum, 0, tile_n * thread_count * sizeof(int32_t)); | |||
| Im2ColPackUnitInt8(int8_input + in_batch_offset, gemm_input, real_cal_num, start_index, input_sum, conv_param); | |||
| } | |||
| } | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << static_cast<int>(packed_input[i]) << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| delete input_data; | |||
| delete conv_param; | |||
| free(int8_input); | |||
| free(packed_input); | |||
| free(input_sum); | |||
| MS_LOG(INFO) << "TestPackInputUint8 passed"; | |||
| } | |||
| TEST_F(TestPack, PackWeightUint8) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||