2. Free redundant memory in fp16 kernels (tags/v1.1.0)
| @@ -19,193 +19,6 @@ | |||
| #include "nnacl/winograd_transform.h" | |||
| #include "nnacl/int8/common_func.h" | |||
// Indirect int8 GEMM for common convolution: accumulates int8 src x weight products into
// tmp_dst (int32), then requantizes (gemmlowp-style multiplier/shift), adds the output
// zero point, clamps to [act_min, act_max] and stores the int8 result in dst.
// On ARM the whole pipeline is delegated to a hand-written assembly kernel.
//   dst            - int8 output, tile-major: [tile][output_channel]
//   tmp_dst        - int32 scratch accumulator, same layout as dst (caller-zeroed)
//   src            - packed int8 input tiles (C4-blocked; see offset math below)
//   weight         - packed int8 filter (C4-blocked over oc, plane and ic)
//   bias           - int32 per-output-channel bias
//   input_sum      - precomputed per-tile (or per-tile-per-oc) input sums, used to
//                    correct for the filter zero point when quantization is asymmetric
void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                      int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
                      ConvParameter *conv_param) {
  // Requantization parameters; index [0] is used for per-layer quantization,
  // index [oc] when FILTER_PER_CHANNEL is set.
  int32_t *shift_before = conv_param->conv_quant_arg_.left_shift_;
  int32_t *shift_after = conv_param->conv_quant_arg_.right_shift_;
  int32_t *out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
  int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
  int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
  int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
  int oc4 = UP_DIV(output_channel, C4NUM);
#ifdef ENABLE_ARM64
  // Asymmetric/per-channel flags are passed through to the asm kernel as size_t booleans.
  size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
  size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
  IndirectGemmInt8_4x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
                       output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
                       shift_before, shift_after, asymmetric, per_channel, oc4 * C4NUM * sizeof(int32_t));
#elif ENABLE_ARM32
  size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
  size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
  IndirectGemmInt8_2x4(dst, src, weight, bias, UP_DIV(kernel_plane, C4NUM), ic4, output_channel,
                       output_channel * sizeof(int8_t), input_sum, act_min, act_max, out_zp, out_multiplier,
                       shift_before, shift_after, asymmetric, per_channel, oc4 * C4NUM * sizeof(int32_t));
#else
  // Portable C fallback. src/weight are C4-blocked: both the kernel plane and the input
  // channels are padded to multiples of C4NUM, hence the block/res offset decomposition.
  int tile_num = conv_param->tile_num_;
  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
  for (int oc = 0; oc < output_channel; oc++) {
    int oc4_block = oc / C4NUM;
    int oc4_res = oc % C4NUM;
    int weight_oc4_offset = oc4_block * C4NUM * plane_c4 * C4NUM * ic4 * C4NUM + oc4_res * C4NUM * C4NUM;
    int dst_oc_offset = oc;
    for (int n = 0; n < tile_num; n++) {
      int src_tile_offset = n * C4NUM * C4NUM;
      int dst_tile_offset = dst_oc_offset + n * output_channel;
      for (int b = 0; b < kernel_plane; b++) {
        int plane_c4_block = b / C4NUM;
        int plane_c4_res = b % C4NUM;
        int src_plane_offset = src_tile_offset + plane_c4_block * tile_num * C4NUM * ic4 * C4NUM + plane_c4_res * C4NUM;
        int weight_plane_offset =
          weight_oc4_offset + plane_c4_block * C4NUM * C4NUM * ic4 * C4NUM + plane_c4_res * C4NUM;
        for (int i = 0; i < ic4; i++) {
          int src_ic4_offset = src_plane_offset + i * tile_num * C4NUM * C4NUM;
          int weight_ic4_offset = weight_plane_offset + i * C4NUM * C4NUM * C4NUM;
          for (int j = 0; j < C4NUM; j++) {
            int weight_ic_offset = weight_ic4_offset + j;
            // int8 x int8 products accumulated into the int32 scratch buffer.
            tmp_dst[dst_tile_offset] += weight[weight_ic_offset] * src[src_ic4_offset + j];
          }  // in c4num loop
        }    // ic4 loop
      }      // kernel_plane loop
      // Requantization: four branches for the (asymmetric x per_channel) combinations.
      // When the filter is asymmetric, input_sum compensates for the filter zero point
      // (per tile, or per tile-and-channel when per-channel quantization is on).
      if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
          (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // symmetric + per-channel: use per-oc multiplier/shifts.
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
          -shift_after[oc]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      } else if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                 !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // symmetric + per-layer: single multiplier/shift at index 0.
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
          -shift_after[0]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                 !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // asymmetric + per-layer: subtract the per-tile input sum before requantizing.
        tmp_dst[dst_tile_offset] -= input_sum[n];
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
          -shift_after[0]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                 (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
        // asymmetric + per-channel: input_sum is indexed per (tile, oc), stride oc4*C4NUM.
        tmp_dst[dst_tile_offset] -= input_sum[n * oc4 * C4NUM + oc];
        int result = tmp_dst[dst_tile_offset] + bias[oc];
        result = RoundingDivideByPOT(
          SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
          -shift_after[oc]);
        result += out_zp;
        result = result > act_min ? result : act_min;
        result = result < act_max ? result : act_max;
        dst[dst_tile_offset] = (int8_t)result;
      }
    }  // tile_num loop
  }    // output_channel loop
#endif
}
// Optimized variant of IndirectGemmInt8: when an assembly GEMM (gemm_func, loaded at
// runtime on aarch64) is available it is used directly; otherwise a portable C fallback
// runs. The fallback uses a slightly different packing than IndirectGemmInt8: the kernel
// plane is NOT C4-padded here (compare the offset math), only channels are C4-blocked.
// Requantization and activation clamping are identical to IndirectGemmInt8.
void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias,
                         int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum,
                         ConvParameter *conv_param, GEMM_FUNC gemm_func) {
  // Quantization parameters; [0] for per-layer, [oc] for per-channel quantization.
  int32_t *shift_before = conv_param->conv_quant_arg_.left_shift_;
  int32_t *shift_after = conv_param->conv_quant_arg_.right_shift_;
  int32_t *out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_;
  int32_t out_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
  int32_t act_min = conv_param->conv_quant_arg_.out_act_min_[0];
  int32_t act_max = conv_param->conv_quant_arg_.out_act_max_[0];
  int oc4 = UP_DIV(output_channel, C4NUM);
  if (gemm_func != NULL) {
#ifdef __aarch64__
    size_t asymmetric = conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC;
    size_t per_channel = conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL;
    gemm_func(dst, src, weight, bias, kernel_plane, ic4, output_channel, output_channel * sizeof(int8_t), input_sum,
              act_min, act_max, out_zp, out_multiplier, shift_before, shift_after, asymmetric, per_channel,
              oc4 * C4NUM * sizeof(int32_t));
#endif
  } else {
    // Portable C fallback (also taken on aarch64 if the runtime kernel failed to load,
    // in which case the caller passes gemm_func == NULL).
    int tile_num = conv_param->tile_num_;
    for (int oc = 0; oc < output_channel; oc++) {
      int oc4_block = oc / C4NUM;
      int oc4_res = oc % C4NUM;
      // Note: no plane_c4 factor here — the plane dimension is unpadded in this packing.
      int weight_oc4_offset = oc4_block * C4NUM * kernel_plane * ic4 * C4NUM + oc4_res * C4NUM;
      int dst_oc_offset = oc;
      for (int n = 0; n < tile_num; n++) {
        int src_tile_offset = n * C4NUM;
        int dst_tile_offset = dst_oc_offset + n * output_channel;
        for (int b = 0; b < kernel_plane; b++) {
          int src_plane_offset = src_tile_offset + b * tile_num * ic4 * C4NUM;
          int weight_plane_offset = weight_oc4_offset + b * C4NUM * ic4 * C4NUM;
          for (int i = 0; i < ic4; i++) {
            int src_ic4_offset = src_plane_offset + i * tile_num * C4NUM;
            int weight_ic4_offset = weight_plane_offset + i * C4NUM * C4NUM;
            for (int j = 0; j < C4NUM; j++) {
              int weight_ic_offset = weight_ic4_offset + j;
              tmp_dst[dst_tile_offset] += weight[weight_ic_offset] * src[src_ic4_offset + j];
            }  // in c4num loop
          }    // ic4 loop
        }      // kernel_plane loop
        // Same 4-way (asymmetric x per_channel) requantization as IndirectGemmInt8.
        if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
            (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // symmetric + per-channel.
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
            -shift_after[oc]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        } else if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                   !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // symmetric + per-layer.
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
            -shift_after[0]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                   !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // asymmetric + per-layer: per-tile input-sum correction.
          tmp_dst[dst_tile_offset] -= input_sum[n];
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[0]), out_multiplier[0]),
            -shift_after[0]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
                   (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
          // asymmetric + per-channel: per-(tile, oc) input-sum correction.
          tmp_dst[dst_tile_offset] -= input_sum[n * oc4 * C4NUM + oc];
          int result = tmp_dst[dst_tile_offset] + bias[oc];
          result = RoundingDivideByPOT(
            SaturatingRoundingDoublingHighMul(result * (1 << (unsigned int)shift_before[oc]), out_multiplier[oc]),
            -shift_after[oc]);
          result += out_zp;
          result = result > act_min ? result : act_min;
          result = result < act_max ? result : act_max;
          dst[dst_tile_offset] = (int8_t)result;
        }
      }  // tile_num loop
    }    // output_channel loop
  }
}
| void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, int oc, int ic8, size_t real_cal_num) { | |||
| int oc4 = UP_DIV(oc, C4NUM); | |||
| #ifdef ENABLE_ARM | |||
| @@ -249,73 +62,9 @@ void Conv3x3Int8Gemm(int32_t *dst, const int16_t *src, const int16_t *weight, in | |||
| #endif | |||
| } | |||
// int8 conv common
// Driver for the common int8 convolution: per batch, splits the output plane into
// tiles of tile_n pixels, im2col-packs each tile (accumulating input sums for the
// asymmetric-quantization correction), then runs IndirectGemmInt8 on it. Full tiles
// write straight into output_data; the final partial tile goes through tmp_out and
// is memcpy'd so the GEMM can safely write a full tile's worth of columns.
// Threading: tiles are distributed round-robin over threads via
// (task_id, thread_count); all scratch buffers are indexed by task_id.
void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, const int32_t *bias_data,
              int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id,
              ConvParameter *conv_param) {
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int in_batch = conv_param->input_batch_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;
  int out_channel = conv_param->output_channel_;
  int oc4 = UP_DIV(out_channel, C4NUM);
  int32_t input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
  int tile_n = conv_param->tile_num_;
  int thread_count = conv_param->thread_num_;
  int output_count = out_h * out_w;
  int output_tile_count = UP_DIV(output_count, tile_n);
  int ic4 = UP_DIV(in_channel, C4NUM);
  int kernel_plane = kernel_h * kernel_w;
  int plane_block = UP_DIV(kernel_plane, C4NUM);
  // Size of one packed im2col column: C4-padded plane x C4-padded channels.
  int unit_size = plane_block * C4NUM * ic4 * C4NUM;
  // Per-channel quantization needs one input sum per (tile, oc); per-layer needs one per tile.
  int input_sum_offset;
  if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
    input_sum_offset = tile_n * oc4 * C4NUM;
  } else {
    input_sum_offset = tile_n;
  }
  for (int b = 0; b < in_batch; b++) {
    int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
    int out_batch_offset = b * out_channel * out_h * out_w;
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * tile_n;
      int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
      int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
      // clear tmp buffer before compute
      // Padding lanes are filled with the input zero point so they contribute zero
      // after the zero-point correction.
      memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
      int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
      size_t tmp_dst_size = tile_n * conv_param->output_channel_ * sizeof(int32_t);
      int tmp_dst_offset = task_id * tile_n * conv_param->output_channel_;
      memset(tmp_dst + tmp_dst_offset, 0, tmp_dst_size);
      Im2ColPackUnitInt8(input_data + in_batch_offset, gemm_input, real_cal_num, start_index, tmp_input_sum,
                         conv_param);
      if (real_cal_num == tile_n) {
        // Full tile: GEMM writes directly into the output tensor.
        int8_t *gemm_output = output_data + out_offset;
        IndirectGemmInt8(gemm_output, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane,
                         out_channel, tmp_input_sum, conv_param);
      } else {
        // res part
        // Partial last tile: compute into tmp_out, then copy only the valid rows.
        int8_t *tmp_out_ptr = tmp_out + task_id * tile_n * out_channel;
        IndirectGemmInt8(tmp_out_ptr, tmp_dst + tmp_dst_offset, gemm_input, packed_weight, bias_data, ic4, kernel_plane,
                         out_channel, tmp_input_sum, conv_param);
        memcpy(output_data + out_offset, tmp_out_ptr, real_cal_num * out_channel);
      }
    }
  }
}
| void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int8_t *packed_weight, | |||
| const int32_t *bias_data, int8_t *output_data, int32_t *filter_zp, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func) { | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func, bool is_optimize) { | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| int in_batch = conv_param->input_batch_; | |||
| @@ -325,18 +74,29 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, | |||
| int out_h = conv_param->output_h_; | |||
| int out_w = conv_param->output_w_; | |||
| int out_channel = conv_param->output_channel_; | |||
| int oc8 = UP_DIV(out_channel, C8NUM); | |||
| int tile_n = conv_param->tile_num_; | |||
| int thread_count = conv_param->thread_num_; | |||
| int output_count = out_h * out_w; | |||
| int output_tile_count = UP_DIV(output_count, tile_n); | |||
| int ic4 = UP_DIV(in_channel, C4NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = UP_ROUND(kernel_plane * in_channel, C4NUM); | |||
| int unit_size; | |||
| int input_sum_offset; | |||
| int up_round_oc; | |||
| #ifdef ENABLE_ARM32 | |||
| up_round_oc = UP_ROUND(out_channel, C2NUM); | |||
| unit_size = UP_ROUND(kernel_plane * in_channel, C16NUM); | |||
| #else | |||
| if (is_optimize) { | |||
| up_round_oc = UP_ROUND(out_channel, C8NUM); | |||
| unit_size = UP_ROUND(kernel_plane * in_channel, C4NUM); | |||
| } else { | |||
| up_round_oc = UP_ROUND(out_channel, C4NUM); | |||
| unit_size = UP_ROUND(kernel_plane * in_channel, C16NUM); | |||
| } | |||
| #endif | |||
| bool per_channel; | |||
| if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_offset = tile_n * oc8 * C8NUM; | |||
| input_sum_offset = tile_n * up_round_oc; | |||
| per_channel = true; | |||
| } else { | |||
| input_sum_offset = tile_n; | |||
| @@ -344,7 +104,7 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, | |||
| } | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * ic4 * C4NUM * in_h * in_w; | |||
| int in_batch_offset = b * in_channel * in_h * in_w; | |||
| int out_batch_offset = b * out_channel * out_h * out_w; | |||
| for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| @@ -354,15 +114,38 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, | |||
| int8_t *matmul = matmul_input + task_id * kernel_plane * in_channel * tile_n; | |||
| memset(matmul, conv_param->conv_quant_arg_.input_quant_args_[0].zp_, kernel_plane * in_channel * tile_n); | |||
| Im2ColPackUnitInt8Opt(input_data + in_batch_offset, gemm_input, matmul, real_cal_num, start_index, filter_zp, | |||
| tmp_input_sum, conv_param, per_channel); | |||
| tmp_input_sum, conv_param, per_channel, is_optimize); | |||
| int out_offset = thread_id * tile_n * out_channel + out_batch_offset; | |||
| int8_t *gemm_output = output_data + out_offset; | |||
| matmul_func(gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, | |||
| tmp_input_sum, bias_data, conv_param->conv_quant_arg_.left_shift_, | |||
| conv_param->conv_quant_arg_.right_shift_, conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], | |||
| conv_param->conv_quant_arg_.out_act_max_[0], per_channel); | |||
| #ifdef ENABLE_ARM32 | |||
| MatmulInt8Neon32( | |||
| gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, tmp_input_sum, bias_data, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, out_channel, per_channel); | |||
| #elif ENABLE_ARM64 | |||
| if (is_optimize) { | |||
| matmul_func(gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, | |||
| tmp_input_sum, bias_data, conv_param->conv_quant_arg_.left_shift_, | |||
| conv_param->conv_quant_arg_.right_shift_, conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, conv_param->conv_quant_arg_.out_act_min_[0], | |||
| conv_param->conv_quant_arg_.out_act_max_[0], per_channel); | |||
| } else { | |||
| MatmulInt8Neon64(gemm_input, packed_weight, gemm_output, UP_ROUND(real_cal_num, C4NUM), | |||
| UP_ROUND(out_channel, C4NUM), unit_size, tmp_input_sum, bias_data, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.left_shift_, | |||
| conv_param->conv_quant_arg_.right_shift_, real_cal_num, out_channel, out_channel, per_channel); | |||
| } | |||
| #else | |||
| MatMulInt8_8x8_r( | |||
| gemm_input, packed_weight, gemm_output, real_cal_num, out_channel, unit_size, out_channel, tmp_input_sum, | |||
| bias_data, conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], per_channel); | |||
| #endif | |||
| } | |||
| } | |||
| } | |||
| @@ -28,30 +28,13 @@ | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| size_t act_max, size_t out_zp, int32_t *out_multiplier, int32_t *shift_before, | |||
| int32_t *shift_after, size_t asymmetric, size_t per_channel, size_t per_channel_offset); | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| void IndirectGemmInt8(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum, | |||
| ConvParameter *conv_param); | |||
| void IndirectGemmInt8Opt(int8_t *dst, int32_t *tmp_dst, const int8_t *src, const int8_t *weight, const int32_t *bias, | |||
| int ic4, size_t kernel_plane, size_t output_channel, const int32_t *input_sum, | |||
| ConvParameter *conv_param, GEMM_FUNC gemm_func); | |||
| // int8 conv common | |||
| void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, const int32_t *bias_data, | |||
| int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param); | |||
| void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int8_t *packed_weight, | |||
| const int32_t *bias_data, int8_t *output_data, int32_t *filter_zp, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func); | |||
| ConvParameter *conv_param, MATMUL_OPT_R_FUNC matmul_func, bool is_optimize); | |||
| // int8 convolution 1x1 | |||
| void Conv1x1PreOptPeroc(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum, size_t input_channel, | |||
| @@ -260,93 +260,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa | |||
| } // tile num loop | |||
| } | |||
// im2col packing for the common int8 convolution (NHWC input). For each of the
// real_cal_num output pixels starting at block_index, gathers the kernel_h x kernel_w
// receptive field into packed_input using the C4-blocked layout consumed by
// IndirectGemmInt8, while accumulating the sum of all packed input values
// (input_accumulator). Out-of-image taps and channel-padding lanes are counted at the
// input zero point so the accumulated sum matches what the GEMM actually multiplies.
// When the filter is asymmetrically quantized, input_sum is filled with
// accumulator * filter_zp (per output channel, or a single per-layer value).
void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
                        int32_t *input_sum, ConvParameter *conv_param) {
  // input format : nhwc
  int tile_num = conv_param->tile_num_;
  QuantArg *filter_arg = conv_param->conv_quant_arg_.filter_quant_args_;
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
  int in_h = conv_param->input_h_;
  int in_w = conv_param->input_w_;
  int ic4_minus = in_channel / C4NUM;   // full C4 channel blocks
  int ic4 = UP_DIV(in_channel, C4NUM);  // channel blocks incl. the padded remainder
  int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
  int out_w = conv_param->output_w_;
  for (int i = 0; i < real_cal_num; i++) {
    // Map the linear output index back to the top-left input coordinate of its window.
    int block_start = block_index + i;
    int input_h = block_start / out_w * stride_h - pad_h;
    int input_w = block_start % out_w * stride_w - pad_w;
    int input_cal_num_offset = i * C4NUM * C4NUM;
    int32_t input_accumulator = 0;
    for (int j = 0; j < kernel_h; j++) {
      int input_y = input_h + j * dilation_h;
      if (input_y < 0 || input_y >= in_h) {
        // Entire kernel row falls outside the image: the packed buffer was pre-filled
        // with input_zp, so just account for it in the accumulator.
        input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_ * kernel_w;
        continue;
      }
      int input_y_stride = input_y * in_w * in_channel;
      for (int n = 0; n < kernel_w; n++) {
        int input_x = input_w + n * dilation_w;
        if (input_x < 0 || input_x >= in_w) {
          // Single out-of-image tap: count its zero-point contribution.
          input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
          continue;
        }
        int input_x_stride = input_y_stride + input_x * in_channel;
        // Destination offset inside the C4-blocked (plane, channel) packing.
        int plane_c4_block = (j * kernel_w + n) / C4NUM;
        int plane_c4_res = (j * kernel_w + n) % C4NUM;
        int input_plane_offset =
          plane_c4_block * tile_num * C4NUM * C4NUM * ic4 + plane_c4_res * C4NUM + input_cal_num_offset;
        // Copy full 4-channel groups and fold them into the accumulator.
        for (int m = 0; m < ic4_minus; m++) {
          int channel_block_stride = input_x_stride + m * C4NUM;
          int channel_block_offset = input_plane_offset + m * tile_num * C4NUM * C4NUM;
          (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
          (packed_input + channel_block_offset)[1] = (input_data + channel_block_stride)[1];
          (packed_input + channel_block_offset)[2] = (input_data + channel_block_stride)[2];
          (packed_input + channel_block_offset)[3] = (input_data + channel_block_stride)[3];
          input_accumulator += (packed_input + channel_block_offset)[0];
          input_accumulator += (packed_input + channel_block_offset)[1];
          input_accumulator += (packed_input + channel_block_offset)[2];
          input_accumulator += (packed_input + channel_block_offset)[3];
        }  // channel_block loop
        // Trailing channels that don't fill a whole C4 group.
        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
        for (int l = 0; l < ic_res; ++l) {
          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
          packed_input[channel_block_offset] = input_data[channel_block_stride];
          input_accumulator += (packed_input + channel_block_offset)[0];
        }
        // Padding lanes of the last C4 group hold input_zp — account for them too.
        for (int l = 0; l < (C4NUM - ic_res); l++) {
          input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
        }
      }  // kernel_w loop
    }    // kernel_h loop
    // Emit the filter-zero-point correction terms only for asymmetric quantization.
    if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
      continue;
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
      // Per-channel: one correction per output channel, stride oc4*C4NUM per tile.
      int cal_num_offset = i * oc4 * C4NUM;
      for (int l = 0; l < conv_param->output_channel_; ++l) {
        input_sum[cal_num_offset + l] = input_accumulator * filter_arg[l].zp_;
      }
    } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
               !(conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
      // Per-layer: a single correction per tile.
      input_sum[i] = input_accumulator * filter_arg[0].zp_;
    }
  }  // tile num loop
}
| void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num, | |||
| int block_index, int32_t *filter_zp, int32_t *input_sum, ConvParameter *conv_param, | |||
| bool per_channel) { | |||
| bool per_channel, bool is_optimize) { | |||
| // input format : nhwc | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| @@ -389,11 +305,29 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_ | |||
| } // kernel_h loop | |||
| } | |||
| } // tile num loop | |||
| if (per_channel) { | |||
| Conv1x1PreOptPeroc(matmul_input, packed_input, input_sum, kernel_plane * in_channel, conv_param->output_channel_, | |||
| real_cal_num, filter_zp, C8NUM * C8NUM); | |||
| int deep = kernel_plane * in_channel; | |||
| if (is_optimize) { | |||
| if (per_channel) { | |||
| Conv1x1PreOptPeroc(matmul_input, packed_input, input_sum, deep, conv_param->output_channel_, real_cal_num, | |||
| filter_zp, C8NUM * C8NUM); | |||
| } else { | |||
| Conv1x1PreOptPert(matmul_input, packed_input, input_sum, deep, real_cal_num, conv_param); | |||
| } | |||
| } else { | |||
| Conv1x1PreOptPert(matmul_input, packed_input, input_sum, kernel_plane * in_channel, real_cal_num, conv_param); | |||
| RowMajor2Row16x4MajorInt8(matmul_input, packed_input, real_cal_num, deep); | |||
| if (per_channel) { | |||
| #ifdef ENABLE_ARM32 | |||
| PackInputSum16x4PerChannelArm32(packed_input, input_sum, filter_zp, real_cal_num, deep, | |||
| conv_param->output_channel_); | |||
| #else | |||
| PackInputSum16x4PerChannel(packed_input, input_sum, filter_zp, real_cal_num, deep, conv_param->output_channel_); | |||
| #endif | |||
| } else { | |||
| size_t hw4 = UP_ROUND(real_cal_num, C4NUM); | |||
| size_t ic16 = UP_ROUND(deep, C16NUM); | |||
| PackInputSum16x4PerLayer(packed_input, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, | |||
| ic16); | |||
| } | |||
| } | |||
| } | |||
| @@ -32,12 +32,9 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa | |||
| void PackHWCToWHC(const float *src, float *dst, int height, int width, int channel); | |||
| void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index, | |||
| int32_t *input_sum, ConvParameter *conv_param); | |||
| void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int8_t *matmul_input, int real_cal_num, | |||
| int block_index, int32_t *filter_zp, int32_t *input_sum, ConvParameter *conv_param, | |||
| bool per_channel); | |||
| bool per_channel, bool is_optimize); | |||
| void PackInputSum16x4PerLayer(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16); | |||
| @@ -53,6 +53,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() { | |||
| } | |||
| PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(), | |||
| weight_tensor->Batch()); | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t))); | |||
| if (bias_data_ == nullptr) { | |||
| @@ -78,6 +78,10 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() { | |||
| MS_LOG(ERROR) << "winograd filter transfrom failed."; | |||
| return ret; | |||
| } | |||
| if (fp16_weight_ != nullptr) { | |||
| free(fp16_weight_); | |||
| fp16_weight_ = nullptr; | |||
| } | |||
| // init bias | |||
| bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t)); | |||
| @@ -33,9 +33,8 @@ using mindspore::schema::PrimitiveType_Conv2D; | |||
| namespace mindspore::kernel { | |||
| void ConvolutionInt8CPUKernel::CheckSupportOptimize() { | |||
| tile_num_ = 8; | |||
| matmul_func_ = MatMulInt8_8x8_r; | |||
| #ifdef ENABLE_ARM32 | |||
| tile_num_ = 2; | |||
| tile_num_ = 4; | |||
| support_optimize_ = false; | |||
| #endif | |||
| @@ -48,138 +47,43 @@ void ConvolutionInt8CPUKernel::CheckSupportOptimize() { | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| tile_num_ = 4; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| } else { | |||
| tile_num_ = 4; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } | |||
| #endif | |||
| conv_param_->tile_num_ = tile_num_; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitWeightBias() { | |||
| int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| int kernel_h = filter_tensor->Height(); | |||
| int kernel_w = filter_tensor->Width(); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| conv_param_->input_channel_ = input_channel; | |||
| conv_param_->output_channel_ = output_channel; | |||
| int ic4 = UP_DIV(input_channel, C4NUM); | |||
| int oc4 = UP_DIV(output_channel, C4NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int plane_c4 = UP_DIV(kernel_plane, C4NUM); | |||
| int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * plane_c4 * C4NUM; | |||
| auto filter_arg = conv_param_->conv_quant_arg_.filter_quant_args_; | |||
| int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| // init weight | |||
| auto origin_weight = reinterpret_cast<int8_t *>(in_tensors_.at(kWeightIndex)->MutableData()); | |||
| packed_weight_ = reinterpret_cast<int8_t *>(malloc(pack_weight_size)); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_weight_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, pack_weight_size); | |||
| auto *weight_sum = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * output_channel)); | |||
| if (weight_sum == nullptr) { | |||
| MS_LOG(ERROR) << "malloc weight_sum failed."; | |||
| return RET_ERROR; | |||
| } | |||
| for (int i = 0; i < output_channel; i++) { | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_; | |||
| } else { | |||
| weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_; | |||
| } | |||
| } | |||
| PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum); | |||
| // init bias | |||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(oc4 * C4NUM * sizeof(int32_t))); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_data_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, oc4 * C4NUM * sizeof(int32_t)); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||
| memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t)); | |||
| } else { | |||
| MS_ASSERT(in_tensors_.size() == kInputSize1); | |||
| } | |||
| auto *bias_data = reinterpret_cast<int32_t *>(bias_data_); | |||
| int c4_kernel_plane_size = kernel_plane * ic4 * C4NUM; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| for (int i = 0; i < output_channel; i++) { | |||
| bias_data[i] += filter_arg[i].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp; | |||
| } | |||
| int up_round_deep; | |||
| int up_round_oc; | |||
| #ifdef ENABLE_ARM32 | |||
| up_round_oc = UP_ROUND(output_channel, C2NUM); | |||
| up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM); | |||
| #else | |||
| if (support_optimize_) { | |||
| up_round_oc = UP_ROUND(output_channel, C8NUM); | |||
| up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM); | |||
| } else { | |||
| for (int i = 0; i < output_channel; i++) { | |||
| bias_data[i] += filter_arg[0].zp_ * input_zp * c4_kernel_plane_size - weight_sum[i] * input_zp; | |||
| } | |||
| up_round_oc = UP_ROUND(output_channel, C4NUM); | |||
| up_round_deep = UP_ROUND(kernel_plane * input_channel, C16NUM); | |||
| } | |||
| free(weight_sum); | |||
| size_t input_sum_size; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = oc4 * C4NUM * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } else { | |||
| input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } | |||
| input_sum_ = reinterpret_cast<int32_t *>(malloc(input_sum_size)); | |||
| if (input_sum_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(input_sum_, 0, input_sum_size); | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitTmpBuffer() { | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM); | |||
| int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| int plane_c4 = UP_DIV(kernel_plane, C4NUM); | |||
| int unit_size = plane_c4 * C4NUM * ic4 * C4NUM; | |||
| packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(unit_size * thread_count_ * tile_num_)); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t); | |||
| tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size)); | |||
| if (tmp_dst_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp_dst_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| tmp_out_ = | |||
| reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(thread_count_ * tile_num_ * conv_param_->output_channel_)); | |||
| if (tmp_out_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc tmp_out_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| int kernel_h = filter_tensor->Height(); | |||
| int kernel_w = filter_tensor->Width(); | |||
| conv_param_->input_channel_ = input_channel; | |||
| conv_param_->output_channel_ = output_channel; | |||
| int oc8 = UP_DIV(output_channel, C8NUM); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int up_round_deep = UP_ROUND(kernel_plane * input_channel, C4NUM); | |||
| int pack_weight_size = oc8 * C8NUM * up_round_deep; | |||
| #endif | |||
| int pack_weight_size = up_round_oc * up_round_deep; | |||
| int bias_size = up_round_oc * sizeof(int32_t); | |||
| int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| // init weight | |||
| @@ -190,15 +94,23 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, pack_weight_size); | |||
| RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| #ifdef ENABLE_ARM32 | |||
| RowMajor2Row2x16MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| #else | |||
| if (support_optimize_) { | |||
| RowMajor2Row8x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| } else { | |||
| RowMajor2Row16x4MajorInt8(origin_weight, packed_weight_, output_channel, input_channel * kernel_plane); | |||
| } | |||
| #endif | |||
| // init bias | |||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(oc8 * C8NUM * sizeof(int32_t))); | |||
| bias_data_ = reinterpret_cast<int32_t *>(malloc(bias_size)); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc bias_data_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, oc8 * C8NUM * sizeof(int32_t)); | |||
| memset(bias_data_, 0, bias_size); | |||
| if (in_tensors_.size() == kInputSize2) { | |||
| auto ori_bias = reinterpret_cast<int32_t *>(in_tensors_.at(kBiasIndex)->MutableData()); | |||
| memcpy(bias_data_, ori_bias, output_channel * sizeof(int32_t)); | |||
| @@ -225,7 +137,7 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| size_t input_sum_size; | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| input_sum_size = oc8 * C8NUM * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| input_sum_size = up_round_oc * tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } else { | |||
| input_sum_size = tile_num_ * thread_count_ * sizeof(int32_t); | |||
| } | |||
| @@ -241,14 +153,19 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() { | |||
| int ConvolutionInt8CPUKernel::InitTmpBufferOpt() { | |||
| MS_ASSERT(ctx_->allocator != nullptr); | |||
| int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_; | |||
| int tmp_unit = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM); | |||
| int tmp_size; | |||
| if (support_optimize_) { | |||
| tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C4NUM); | |||
| } else { | |||
| tmp_size = UP_ROUND(kernel_plane * conv_param_->input_channel_, C16NUM); | |||
| } | |||
| matmul_packed_input_ = reinterpret_cast<int8_t *>( | |||
| ctx_->allocator->Malloc(thread_count_ * tile_num_ * kernel_plane * conv_param_->input_channel_)); | |||
| if (matmul_packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc matmul_packed_input_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_unit * thread_count_ * tile_num_)); | |||
| packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(tmp_size * thread_count_ * tile_num_)); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc packed_input_ failed."; | |||
| return RET_ERROR; | |||
| @@ -263,26 +180,13 @@ int ConvolutionInt8CPUKernel::Init() { | |||
| MS_LOG(ERROR) << "Set quant param failed."; | |||
| return ret; | |||
| } | |||
| // init for opt | |||
| if (support_optimize_) { | |||
| ret = InitWeightBiasOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Initialization for optimized int8 conv failed."; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| ret = SetIfAsymmetric(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set if per asymmetric failed."; | |||
| return ret; | |||
| } | |||
| // init for situation that not support sdot | |||
| ret = InitWeightBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||
| return RET_ERROR; | |||
| } | |||
| ret = InitWeightBiasOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Initialization for optimized int8 conv failed."; | |||
| return RET_ERROR; | |||
| } | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| @@ -308,14 +212,9 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) { | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData()); | |||
| auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData()); | |||
| if (support_optimize_) { | |||
| ConvInt8Opt(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_, | |||
| reinterpret_cast<int32_t *>(bias_data_), output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_, | |||
| matmul_func_); | |||
| } else { | |||
| ConvInt8(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, | |||
| output_addr, input_sum_, task_id, conv_param_); | |||
| } | |||
| ConvInt8Opt(ori_input_data, packed_input_, matmul_packed_input_, packed_weight_, | |||
| reinterpret_cast<int32_t *>(bias_data_), output_addr, filter_zp_ptr_, input_sum_, task_id, conv_param_, | |||
| matmul_func_, support_optimize_); | |||
| return RET_OK; | |||
| } | |||
| @@ -330,18 +229,10 @@ int ConvolutionInt8Impl(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionInt8CPUKernel::Run() { | |||
| if (support_optimize_) { | |||
| auto ret = InitTmpBufferOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto ret = InitTmpBufferOpt(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| return RET_ERROR; | |||
| } | |||
| int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionInt8Impl, this, thread_count_); | |||
| @@ -369,18 +260,7 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> & | |||
| int dilation_w = conv_param->dilation_w_; | |||
| kernel::LiteKernel *kernel; | |||
| if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| #elif ENABLE_ARM32 | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| #else | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| #endif | |||
| } else if (kernel_h == 1 && kernel_w == 1) { | |||
| kernel = new (std::nothrow) kernel::Convolution1x1Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| @@ -53,8 +53,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| void CheckSupportOptimize(); | |||
| int InitWeightBiasOpt(); | |||
| int InitTmpBufferOpt(); | |||
| int InitWeightBias(); | |||
| int InitTmpBuffer(); | |||
| private: | |||
| void FreeTmpBuffer() { | |||
| @@ -169,68 +169,6 @@ TEST_F(TestPack, PackInputFp16) { | |||
| } | |||
| #endif | |||
| TEST_F(TestPack, PackInputUint8) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||
| int kernel_h = conv_param->kernel_h_; | |||
| int kernel_w = conv_param->kernel_w_; | |||
| int in_batch = conv_param->input_batch_; | |||
| int in_channel = conv_param->input_channel_; | |||
| int in_h = conv_param->input_h_; | |||
| int in_w = conv_param->input_w_; | |||
| int out_h = conv_param->output_h_; | |||
| int out_w = conv_param->output_w_; | |||
| int thread_count = 1; | |||
| int tile_n = 8; | |||
| int output_count = out_h * out_w; | |||
| int output_tile_count = UP_DIV(output_count, tile_n); | |||
| int inchannel_block = 4; | |||
| int channel_block = UP_DIV(in_channel, inchannel_block); | |||
| int kernel_plane = kernel_h * kernel_w; | |||
| int unit_size = kernel_plane * channel_block * inchannel_block; | |||
| int packed_input_size = output_tile_count * tile_n * unit_size; | |||
| // input | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/conv/convuint8_input_1_28_28_3.bin"; | |||
| auto input_data = reinterpret_cast<uint8_t *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| auto int8_input = reinterpret_cast<int8_t *>(malloc(input_size)); | |||
| for (int i = 0; i < input_size; i++) { | |||
| int8_input[i] = (int8_t)(input_data[i] - 128); | |||
| } | |||
| auto packed_input = reinterpret_cast<int8_t *>(malloc(in_batch * packed_input_size)); | |||
| memset(packed_input, 0, in_batch * packed_input_size); | |||
| int32_t *input_sum = reinterpret_cast<int32_t *>(malloc(tile_n * thread_count * sizeof(int32_t))); | |||
| for (int b = 0; b < in_batch; b++) { | |||
| int in_batch_offset = b * in_channel * in_h * in_w; | |||
| int gemm_in_batch_offset = b * packed_input_size; | |||
| for (int thread_id = 0; thread_id < output_tile_count; thread_id += thread_count) { | |||
| int start_index = thread_id * tile_n; | |||
| int real_cal_num = (output_count - start_index) < tile_n ? (output_count - tile_n) : tile_n; | |||
| int8_t *gemm_input = | |||
| reinterpret_cast<int8_t *>(packed_input) + thread_id * unit_size * tile_n + gemm_in_batch_offset; | |||
| memset(input_sum, 0, tile_n * thread_count * sizeof(int32_t)); | |||
| Im2ColPackUnitInt8(int8_input + in_batch_offset, gemm_input, real_cal_num, start_index, input_sum, conv_param); | |||
| } | |||
| } | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << static_cast<int>(packed_input[i]) << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| delete input_data; | |||
| delete conv_param; | |||
| free(int8_input); | |||
| free(packed_input); | |||
| free(input_sum); | |||
| MS_LOG(INFO) << "TestPackInputUint8 passed"; | |||
| } | |||
| TEST_F(TestPack, PackWeightUint8) { | |||
| auto conv_param = new ConvParameter; | |||
| InitConvParamPack(conv_param); | |||