@@ -221,7 +221,6 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
   bool relu = conv_param->act_type_ == ActType_Relu;
   bool relu6 = conv_param->act_type_ == ActType_Relu6;
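A note on the NNACL conventions used throughout this patch: C4NUM is the 4-lane SIMD channel block and UP_DIV is ceiling division, so ic4 above is the number of 4-channel blocks covering in_channel, and unit_size is the length of one channel-padded im2col column. A minimal standalone sketch of that arithmetic (the macro bodies are assumptions matching their use here, not copied from the repo):

#include <stdio.h>

#define C4NUM 4                               /* SIMD channel-block width */
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))  /* ceiling division */

int main(void) {
  int in_channel = 17, kernel_h = 3, kernel_w = 3;
  int ic4 = UP_DIV(in_channel, C4NUM);                /* 5 blocks cover 17 channels */
  int unit_size = kernel_h * kernel_w * ic4 * C4NUM;  /* one im2col column, padded to 20 channels */
  printf("ic4=%d unit_size=%d\n", ic4, unit_size);    /* prints ic4=5 unit_size=180 */
  return 0;
}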
@@ -232,13 +231,14 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
   size_t output_offset = out_channel * sizeof(float);
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
+    int in_batch_offset = b * in_channel * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * TILE_NUM;
       int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM;
-      float *gemm_input = packed_input + thread_id * unit_size * TILE_NUM + gemm_in_batch_offset;
+      float *gemm_input = packed_input + task_id * unit_size * TILE_NUM;
+      size_t packed_input_size = unit_size * TILE_NUM * sizeof(float);
+      memset(gemm_input, 0, packed_input_size);
       Im2ColPackUnitFp32(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);
       int out_offset = thread_id * TILE_NUM * out_channel + out_batch_offset;
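The change above replaces the batch-indexed packed buffer (gemm_in_batch_offset) with one tile-sized slice per worker, selected by task_id and cleared with memset before every tile is packed. A sketch of the indexing assumption behind it (a hypothetical helper, not part of the patch):

#include <stddef.h>

/* Each of the thread_count workers owns a disjoint unit_size * tile_num slice,
 * so successive tiles handled by the same worker reuse (and must first clear)
 * the same memory; slices never overlap because task_id is unique per worker. */
float *worker_tile_slot(float *packed_input, int task_id, int unit_size, int tile_num) {
  return packed_input + (size_t)task_id * unit_size * tile_num;
}

Because tiles handled by the same worker reuse one slice, the buffer only needs thread_count slices in total, which is what the ConvolutionCPUKernel::InitTmpBuffer() change later in this patch allocates.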
@@ -291,7 +291,7 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   // step 1 : filter transform (pre-processed offline)
   // step 2 : input transform (online)
   for (int b = 0; b < in_batch; b++) {
-    int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = b * in_channel * conv_param->input_h_ * conv_param->input_w_;
     int out_batch_offset = b * out_channel * conv_param->output_w_ * conv_param->output_h_;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
       int out_tile_index = thread_id * tile_num;
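This and the matching in_batch_offset edits below switch the kernels from a channel-padded NHWC4 input (per-batch stride ic4 * C4NUM * H * W) to the raw NHWC tensor (per-batch stride in_channel * H * W), so the separate PackNHWCToNHWC4 pass can be dropped. The two strides, under the same macro assumptions as the sketch above:

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

/* Per-batch element stride when channels are padded up to a multiple of 4. */
int nhwc4_batch_stride(int channel, int h, int w) {
  return UP_DIV(channel, C4NUM) * C4NUM * h * w;
}

/* Per-batch element stride of the unpadded tensor as the user provides it. */
int nhwc_batch_stride(int channel, int h, int w) {
  return channel * h * w;
}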
@@ -322,144 +322,9 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
   }
 }
-
-void UnPackWinogradOutput(const float *src, float *dst, int batch, int height, int width, int channel,
-                          int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          vst1q_f32(dst + dst_c4_offset, vld1q_f32(src + src_c4_offset));
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            dst[dst_c4_offset + i] = src[src_c4_offset + i];
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          dst[dst_c4_res_offset] = src[src_c4_res_offset];
-        }
-      }
-    }
-  }
-}
-
-void UnPackWinogradReluOutput(const float *src, float *dst, int batch, int height, int width, int channel,
-                              int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
-          float32x4_t zero = vdupq_n_f32(0);
-          input_ptr = vmaxq_f32(zero, input_ptr);
-          vst1q_f32(dst + dst_c4_offset, input_ptr);
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            float input_data = src[src_c4_offset + i];
-            input_data = input_data < 0 ? 0 : input_data;
-            dst[dst_c4_offset + i] = input_data;
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          float input_data = src[src_c4_res_offset];
-          input_data = input_data < 0 ? 0 : input_data;
-          dst[dst_c4_res_offset] = input_data;
-        }
-      }
-    }
-  }
-}
-
-void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int height, int width, int channel,
-                               int output_unit) {
-  int out_h_block_num = UP_DIV(height, output_unit);
-  int out_w_block_num = UP_DIV(width, output_unit);
-  int c4 = UP_DIV(channel, C4NUM);
-  int c4_block = C4NUM * out_h_block_num * output_unit * out_w_block_num * output_unit;
-  for (int b = 0; b < batch; b++) {
-    int src_batch_offset = b * c4 * c4_block;
-    int dst_batch_offset = b * height * width * channel;
-    for (int h = 0; h < height; h++) {
-      int src_h_offset = src_batch_offset + C4NUM * (h * out_w_block_num * output_unit);
-      int dst_h_offset = dst_batch_offset + h * width * channel;
-      for (int w = 0; w < width; w++) {
-        int src_w_offset = src_h_offset + w * C4NUM;
-        int dst_w_offset = dst_h_offset + w * channel;
-        for (int c = 0; c < c4 - 1; c++) {
-          int src_c4_offset = src_w_offset + c * c4_block;
-          int dst_c4_offset = dst_w_offset + c * C4NUM;
-#ifdef ENABLE_NEON
-          float32x4_t input_ptr = vld1q_f32(src + src_c4_offset);
-          float32x4_t zero = vdupq_n_f32(0);
-          float32x4_t six = vdupq_n_f32(6);
-          input_ptr = vmaxq_f32(zero, input_ptr);
-          input_ptr = vminq_f32(six, input_ptr);
-          vst1q_f32(dst + dst_c4_offset, input_ptr);
-#else
-          for (int i = 0; i < C4NUM; ++i) {
-            float input_data = src[src_c4_offset + i];
-            input_data = input_data < 0 ? 0 : input_data;
-            input_data = input_data > 6 ? 6 : input_data;
-            dst[dst_c4_offset + i] = input_data;
-          }
-#endif
-        }
-        int c_res = channel - (c4 - 1) * C4NUM;
-        int src_c_res_offset = (c4 - 1) * c4_block;
-        int dst_c_res_offset = (c4 - 1) * C4NUM;
-        for (int c = 0; c < c_res; c++) {
-          int src_c4_res_offset = src_w_offset + src_c_res_offset + c;
-          int dst_c4_res_offset = dst_w_offset + dst_c_res_offset + c;
-          float input_data = src[src_c4_res_offset];
-          input_data = input_data < 0 ? 0 : input_data;
-          input_data = input_data > 6 ? 6 : input_data;
-          dst[dst_c4_res_offset] = input_data;
-        }
-      }
-    }
-  }
-}
 
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
-                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func) {
+                 int task_id, ConvParameter *conv_param) {
   int thread_count = conv_param->thread_num_;
   int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
   int output_channel = conv_param->output_channel_;
@@ -488,7 +353,7 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
   int input_batch = conv_param->input_batch_;
   for (int batch = 0; batch < input_batch; batch++) {
-    int in_batch_offset = batch * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
+    int in_batch_offset = batch * conv_param->input_channel_ * conv_param->input_h_ * conv_param->input_w_;
     int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
@@ -67,7 +67,7 @@ void UnPackWinogradRelu6Output(const float *src, float *dst, int batch, int heig
 
 // fp32 conv3x3
 void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_data, TmpBufferAddress *buffer_list,
-                 int task_id, ConvParameter *conv_param, GEMM_FUNC_FP32 gemm_func);
+                 int task_id, ConvParameter *conv_param);
 #ifdef __cplusplus
 }
 #endif
@@ -273,7 +273,6 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   int kernel_plane = kernel_h * kernel_w;
   int plane_block = UP_DIV(kernel_plane, C4NUM);
   int unit_size = plane_block * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;
   int input_sum_offset;
   if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_offset = tile_n * oc4 * C4NUM;
@@ -284,12 +283,11 @@ void ConvInt8(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight, c
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
       int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
-      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
+      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
@@ -336,7 +334,6 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   int ic4 = UP_DIV(in_channel, C4NUM);
   int kernel_plane = kernel_h * kernel_w;
   int unit_size = kernel_plane * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_n * unit_size;
   int input_sum_offset;
   if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
     input_sum_offset = tile_n * oc4 * C4NUM;
@@ -347,12 +344,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight
   for (int b = 0; b < in_batch; b++) {
     int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
     int out_batch_offset = b * out_channel * out_h * out_w;
-    int gemm_in_batch_offset = b * packed_input_size;
     for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
       int start_index = thread_id * tile_n;
       int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
       int32_t *tmp_input_sum = input_sum + task_id * input_sum_offset;
-      int8_t *gemm_input = packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset;
+      int8_t *gemm_input = packed_input + task_id * unit_size * tile_n;
       // clear tmp buffer before compute
       memset(gemm_input, (int8_t)input_zp, unit_size * tile_n);
       int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
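Unlike the fp32 path, which clears its slice to zero, both int8 paths fill the slice with (int8_t)input_zp: under asymmetric quantization the real value 0.0f is encoded as the zero point, so padded positions must hold input_zp to act as zeros. A hedged sketch of that encoding (not the library's quantizer):

#include <stdint.h>

/* Affine quantization: real = scale * (q - zp), hence quantize(0.0f) == zp
 * for any scale. This is why gemm_input is memset to (int8_t)input_zp. */
int8_t quantize(float real, float scale, int32_t zp) {
  int32_t q = (int32_t)(real / scale) + zp;
  if (q < -128) q = -128;   /* clamp to the int8 range */
  if (q > 127) q = 127;
  return (int8_t)q;
}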
@@ -297,24 +297,24 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
   int out_w = conv_param->output_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   memset(packed_input, 0, kernel_h * kernel_w * ic4 * C4NUM * TILE_NUM * sizeof(float));
   for (int i = 0; i < real_cal_num; i++) {
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM;
+    int input_stride = (input_h * in_w + input_w) * in_channel;
     int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
     int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
     int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
     int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
     for (int j = kh_s; j < kh_e; j++) {
-      int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride;
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
       for (int n = kw_s; n < kw_e; n++) {
-        int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * C8NUM * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * C8NUM * C4NUM;
 #ifdef ENABLE_NEON
@@ -325,9 +325,15 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
           }
 #endif
         }  // channel_block loop
-      }  // kernel_w loop
-    }    // kernel_h loop
-  }      // tile num loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * C8NUM * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
+  }      // tile num loop
 }
 
 void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index,
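The ic4_minus/ic_res pair introduced here is what makes reading the unpadded NHWC source safe: ic4_minus = in_channel / C4NUM full blocks are copied four floats at a time, and the 0-3 leftover channels are copied individually so the packer never reads past a pixel's last channel. A standalone sketch of the split:

/* Copy `channel` floats: vectorizable 4-blocks first, then the <4 tail.
 * Illustrative stand-in for the block/tail split inside the im2col packers. */
void copy_channels(const float *src, float *dst, int channel) {
  int c4_minus = channel / 4;  /* full 4-channel blocks */
  for (int m = 0; m < c4_minus; m++) {
    for (int i = 0; i < 4; i++) {  /* NEON does this as one vld1q/vst1q pair */
      dst[m * 4 + i] = src[m * 4 + i];
    }
  }
  int res = channel - c4_minus * 4;  /* 0..3 remaining channels */
  for (int l = 0; l < res; ++l) {
    dst[c4_minus * 4 + l] = src[c4_minus * 4 + l];
  }
}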
@@ -346,6 +352,7 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int out_w = conv_param->output_w_;
@@ -362,19 +369,19 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
         input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_ * kernel_w;
         continue;
       }
-      int input_y_stride = input_y * in_w * ic4 * C4NUM;
+      int input_y_stride = input_y * in_w * in_channel;
       for (int n = 0; n < kernel_w; n++) {
         int input_x = input_w + n * dilation_w;
         if (input_x < 0 || input_x >= in_w) {
           input_accumulator += ic4 * C4NUM * conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
           continue;
         }
-        int input_x_stride = input_y_stride + input_x * ic4 * C4NUM;
+        int input_x_stride = input_y_stride + input_x * in_channel;
         int plane_c4_block = (j * kernel_w + n) / C4NUM;
         int plane_c4_res = (j * kernel_w + n) % C4NUM;
         int input_plane_offset =
           plane_c4_block * tile_num * C4NUM * C4NUM * ic4 + plane_c4_res * C4NUM + input_cal_num_offset;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * tile_num * C4NUM * C4NUM;
           (packed_input + channel_block_offset)[0] = (input_data + channel_block_stride)[0];
@@ -386,8 +393,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
           input_accumulator += (packed_input + channel_block_offset)[2];
           input_accumulator += (packed_input + channel_block_offset)[3];
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+          input_accumulator += (packed_input + channel_block_offset)[0];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
     if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
       continue;
     } else if ((conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC) &&
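For context on input_accumulator: the packer records the sum of the packed inputs per tile column so the GEMM can apply the filter-zero-point correction afterwards. Expanding the asymmetric int8 product shows why only the input sum must be collected at pack time (a sketch of the algebra, not the library's GEMM):

#include <stdint.h>

/* sum((x - x_zp) * (w - w_zp))
 *   = sum(x*w) - w_zp*sum(x) - x_zp*sum(w) + n*x_zp*w_zp
 * sum(w) and n*x_zp*w_zp are known offline; sum(x) is the per-tile
 * input_accumulator; only the raw sum(x*w) runs in the hot loop. */
int32_t dot_with_zero_points(const int8_t *x, const int8_t *w, int n, int32_t x_zp, int32_t w_zp) {
  int32_t raw = 0, sum_x = 0, sum_w = 0;
  for (int i = 0; i < n; i++) {
    raw += (int32_t)x[i] * (int32_t)w[i];
    sum_x += x[i];
    sum_w += w[i];
  }
  return raw - w_zp * sum_x - x_zp * sum_w + n * x_zp * w_zp;
}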
@@ -419,6 +433,7 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
   int in_channel = conv_param->input_channel_;
   int in_h = conv_param->input_h_;
   int in_w = conv_param->input_w_;
+  int ic4_minus = in_channel / C4NUM;
   int ic4 = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
   int out_w = conv_param->output_w_;
@@ -428,26 +443,29 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
     int block_start = block_index + i;
     int input_h = block_start / out_w * stride_h - pad_h;
     int input_w = block_start % out_w * stride_w - pad_w;
-    for (int j = 0; j < kernel_h; j++) {
-      int input_y = input_h + j * dilation_h;
-      if (input_y < 0 || input_y >= in_h) {
-        continue;
-      }
-      int input_y_stride = input_y * in_w * ic4 * C4NUM;
-      for (int n = 0; n < kernel_w; n++) {
-        int input_x = input_w + n * dilation_w;
-        if (input_x < 0 || input_x >= in_w) {
-          continue;
-        }
-        int input_x_stride = input_y_stride + input_x * ic4 * C4NUM;
+    int input_stride = input_h * in_w * in_channel + input_w * in_channel;
+    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
+    int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
+    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
+    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
+    for (int j = kh_s; j < kh_e; j++) {
+      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
+      for (int n = kw_s; n < kw_e; n++) {
+        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
         int input_plane_offset = (j * kernel_w + n) * tile_num * C4NUM * ic4 + i * C4NUM;
-        for (int m = 0; m < ic4; m++) {
+        for (int m = 0; m < ic4_minus; m++) {
           int channel_block_stride = input_x_stride + m * C4NUM;
           int channel_block_offset = input_plane_offset + m * tile_num * C4NUM;
           memcpy(packed_input + channel_block_offset, input_data + channel_block_stride, 4);
         }  // channel_block loop
-      }    // kernel_w loop
-    }      // kernel_h loop
+        int ic_res = conv_param->input_channel_ - ic4_minus * C4NUM;
+        for (int l = 0; l < ic_res; ++l) {
+          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
+          int channel_block_offset = input_plane_offset + ic4_minus * tile_num * C4NUM + l;
+          packed_input[channel_block_offset] = input_data[channel_block_stride];
+        }
+      }  // kernel_w loop
+    }    // kernel_h loop
     int32_t input_accumulator = 0;
     for (int j = 0; j < block_size; j++) {
       int block_offset = j * tile_num * ic4 * C4NUM + i * C4NUM;
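The kh_s/kh_e/kw_s/kw_e rewrite above replaces per-tap bounds checks with a precomputed tap range: tap j reads row input_h + j * dilation_h, which is non-negative once j >= ceil(-input_h / dilation_h) and stays below in_h while j < ceil((in_h - input_h) / dilation_h), with UP_DIV supplying the ceilings and MSMAX/MSMIN clamping to [0, kernel_h). A small self-check of that equivalence:

#include <assert.h>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define MSMAX(a, b) ((a) > (b) ? (a) : (b))
#define MSMIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int in_h = 8, kernel_h = 5;
  for (int input_h = -4; input_h <= 8; input_h++) {       /* top coordinate, including padding */
    for (int dilation_h = 1; dilation_h <= 3; dilation_h++) {
      int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
      int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
      for (int j = 0; j < kernel_h; j++) {
        int y = input_h + j * dilation_h;
        int in_bounds = (y >= 0 && y < in_h);
        assert(in_bounds == (j >= kh_s && j < kh_e));     /* ranges agree tap by tap */
      }
    }
  }
  return 0;
}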
@@ -41,30 +41,48 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
     int interval_x_e = src_x_e < input_w ? input_unit : (input_w - src_x_s);
     int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
-    int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
+    int src_plane_offset = in_channel * (src_y_s * input_w + src_x_s);
     int dst_plane_offset = c * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
       memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
-      // get real input block with padding
+      int real_c = in_channel - ic * C4NUM;
+      real_c = real_c > C4NUM ? C4NUM : real_c;
       int src_ic4_offset = src_plane_offset + ic * C4NUM;
-      for (int interval = interval_y_s; interval < interval_y_e; interval++) {
-        int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * ic4 * C4NUM;
-        int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
-        for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
-          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
-          int dst_x_offset = dst_y_offset + j * C4NUM;
-          float *src_addr = (float *)(input_data) + src_x_offset;
-          float *dst_addr = tmp_data + dst_x_offset;
+      // get real input block with padding
+      if (real_c == C4NUM) {
+        for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * in_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+          for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+            int src_x_offset = src_y_offset + j * in_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
-          vst1q_f32(dst_addr, vld1q_f32(src_addr));
+            vst1q_f32(dst_addr, vld1q_f32(src_addr));
 #else
-          for (int k = 0; k < C4NUM; k++) {
-            dst_addr[k] = src_addr[k];
-          }
+            for (int k = 0; k < C4NUM; k++) {
+              dst_addr[k] = src_addr[k];
+            }
 #endif
-        }
-      }
+          }  // interval x loop
+        }    // interval y loop
+      } else {
+        for (int interval = interval_y_s; interval < interval_y_e; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_w + interval_x_s) * in_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + interval_x_s * C4NUM;
+          for (int j = 0; j < (interval_x_e - interval_x_s); j++) {
+            int src_x_offset = src_y_offset + j * in_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
+            for (int k = 0; k < real_c; k++) {
+              dst_addr[k] = src_addr[k];
+            }
+          }  // interval x loop
+        }    // interval y loop
+      }
       // input transform
 #ifdef ENABLE_ARM32
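The transform now branches on real_c, the number of valid channels in the current 4-block: full blocks take the NEON path (one 4-float load/store), partial blocks copy only real_c floats so the unpadded NHWC row is never over-read. A condensed sketch of the dispatch (an illustration, not the exact kernel):

#ifdef ENABLE_NEON
#include <arm_neon.h>
#endif
#define C4NUM 4

/* Copy one 4-channel block from an unpadded NHWC source into a C4-padded,
 * zero-initialized destination; real_c in [1, 4] is how many channels exist. */
static void copy_c4_block(const float *src, float *dst, int real_c) {
  if (real_c == C4NUM) {
#ifdef ENABLE_NEON
    vst1q_f32(dst, vld1q_f32(src));  /* full block: one 128-bit load/store */
#else
    for (int k = 0; k < C4NUM; k++) {
      dst[k] = src[k];
    }
#endif
  } else {
    for (int k = 0; k < real_c; k++) {  /* partial block: scalar, no over-read */
      dst[k] = src[k];
    }
  }
}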
@@ -314,29 +332,47 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
     int real_y_start = origin_y > 0 ? 0 : -origin_y;
     int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y);
-    int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x);
+    int src_plane_offset = input_channel * (origin_y * input_width + origin_x);
     int dst_plane_offset = cal_id * C4NUM * ic4;
     for (int ic = 0; ic < ic4; ic++) {
       // clear tmp buffer
       memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
+      int real_c = input_channel - ic * C4NUM;
+      real_c = real_c > C4NUM ? C4NUM : real_c;
       // get real input block with padding
       int src_ic4_offset = src_plane_offset + ic * C4NUM;
-      for (int interval = real_y_start; interval < real_y_end; interval++) {
-        int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * ic4 * C4NUM;
-        int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
-        for (int j = 0; j < (real_x_end - real_x_start); j++) {
-          int src_x_offset = src_y_offset + j * ic4 * C4NUM;
-          int dst_x_offset = dst_y_offset + j * C4NUM;
-          float *src_addr = (float *)(input_data) + src_x_offset;
-          float *dst_addr = tmp_data + dst_x_offset;
+      if (real_c == C4NUM) {
+        for (int interval = real_y_start; interval < real_y_end; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * input_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
+          for (int j = 0; j < (real_x_end - real_x_start); j++) {
+            int src_x_offset = src_y_offset + j * input_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
 #ifdef ENABLE_NEON
-          vst1q_f32(dst_addr, vld1q_f32(src_addr));
+            vst1q_f32(dst_addr, vld1q_f32(src_addr));
 #else
-          for (int k = 0; k < C4NUM; k++) {
-            (dst_addr + k)[0] = (src_addr + k)[0];
-          }
+            for (int k = 0; k < C4NUM; k++) {
+              dst_addr[k] = src_addr[k];
+            }
 #endif
-        }
-      }
+          }
+        }
+      } else {
+        for (int interval = real_y_start; interval < real_y_end; interval++) {
+          int src_y_offset = src_ic4_offset + (interval * input_width + real_x_start) * input_channel;
+          int dst_y_offset = interval * input_unit * C4NUM + real_x_start * C4NUM;
+          for (int j = 0; j < (real_x_end - real_x_start); j++) {
+            int src_x_offset = src_y_offset + j * input_channel;
+            int dst_x_offset = dst_y_offset + j * C4NUM;
+            float *src_addr = (float *)(input_data) + src_x_offset;
+            float *dst_addr = tmp_data + dst_x_offset;
+            for (int k = 0; k < real_c; k++) {
+              dst_addr[k] = src_addr[k];
+            }
+          }
+        }
+      }
@@ -84,21 +84,8 @@ int ConvolutionCPUKernel::InitTmpBuffer() {
   MS_ASSERT(ctx_->allocator != nullptr);
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  MS_ASSERT(nullptr != ctx_->allocator);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
-
-  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, TILE_NUM);
-  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
-  packed_input_ =
-    reinterpret_cast<float *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size * sizeof(float)));
+  int unit_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * ic4 * C4NUM * TILE_NUM * thread_count_;
+  packed_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(unit_size * sizeof(float)));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed input failed.";
     return RET_ERROR;
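With this change the packed-input allocation scales with the thread count instead of the whole output plane. Plugging illustrative numbers into both formulas (a 1x224x224x16 input, 3x3 kernel, TILE_NUM = 12, 4 threads; these values are assumptions, not from the patch) shows the size of the saving:

#include <stdio.h>

#define C4NUM 4
#define TILE_NUM 12
#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

int main(void) {
  int batch = 1, out_h = 224, out_w = 224, in_channel = 16;
  int kernel_h = 3, kernel_w = 3, thread_count = 4;
  int ic4 = UP_DIV(in_channel, C4NUM);
  int unit_size = kernel_h * kernel_w * ic4 * C4NUM;
  int output_tile_count = UP_DIV(out_h * out_w, TILE_NUM);
  size_t old_size = (size_t)batch * output_tile_count * TILE_NUM * unit_size * sizeof(float);
  size_t new_size = (size_t)thread_count * TILE_NUM * unit_size * sizeof(float);
  /* old: ~28.9 MB, new: ~27.6 KB; tiles are now packed just in time per
   * worker instead of all at once for the whole output plane. */
  printf("old=%zu bytes new=%zu bytes\n", old_size, new_size);
  return 0;
}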
@@ -158,9 +145,11 @@ int ConvolutionCPUKernel::RunImpl(int task_id) {
     MS_LOG(ERROR) << "gemm_func is nullptr.";
     return RET_ERROR;
   }
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
   auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->MutableData());
-  ConvFp32(reinterpret_cast<float *>(nhwc4_input_), packed_input_, packed_weight_,
-           reinterpret_cast<float *>(bias_data_), tmp_output_block_, output_addr, task_id, conv_param_, gemm_func_);
+  ConvFp32(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), tmp_output_block_,
+           output_addr, task_id, conv_param_, gemm_func_);
   return RET_OK;
 }
@@ -187,11 +176,6 @@ int ConvolutionCPUKernel::Run() {
     return RET_ERROR;
   }
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv error error_code[" << error_code << "]";
@@ -51,10 +51,6 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
       ctx_->allocator->Free(tmp_output_block_);
       tmp_output_block_ = nullptr;
     }
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;
@@ -100,13 +100,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
 #else
   const int tile_num = 12;
 #endif
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_ERROR;
-  }
-
   size_t tile_buffer_size = thread_count_ * tile_num * C16NUM * ic4 * C4NUM * sizeof(float);
   tile_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
@@ -152,16 +145,6 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
   return RET_OK;
 }
-
-void Convolution3x3CPUKernel::ConfigInputOutput() {
-  auto output_tensor = out_tensors_.at(kOutputIndex);
-  output_tensor->SetFormat(schema::Format::Format_NHWC);
-  // #ifdef ENABLE_ARM32
-  //   gemm_func_ = IndirectGemmFp32_8x4;
-  // #else
-  gemm_func_ = IndirectGemmFp32_8x8;
-  // #endif
-}
 
 int Convolution3x3CPUKernel::Init() {
   auto ret = InitWeightBias();
   if (ret != RET_OK) {
@@ -171,7 +154,6 @@ int Convolution3x3CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
-  ConfigInputOutput();
   return ReSize();
 }
@@ -191,12 +173,10 @@ int Convolution3x3CPUKernel::ReSize() {
 }
 
 int Convolution3x3CPUKernel::RunImpl(int task_id) {
-  if (gemm_func_ == nullptr) {
-    MS_LOG(ERROR) << "gemm_func is nullptr.";
-    return RET_ERROR;
-  }
-  Conv3x3Fp32(reinterpret_cast<float *>(nhwc4_input_), transformed_filter_addr_, reinterpret_cast<float *>(bias_data_),
-              tmp_buffer_address_list_, task_id, conv_param_, gemm_func_);
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
+  Conv3x3Fp32(ori_input_data, transformed_filter_addr_, reinterpret_cast<float *>(bias_data_), tmp_buffer_address_list_,
+              task_id, conv_param_);
   return RET_OK;
 }
@@ -245,10 +225,6 @@ int Convolution3x3CPUKernel::Run() {
     MS_LOG(ERROR) << "Init tmp buffer failed.ret: " << ret;
     return RET_ERROR;
   }
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   int error_code = ParallelLaunch(this->context_->thread_pool_, Convolution3x3Impl, this, thread_count_);
   if (error_code != RET_OK) {
@@ -260,6 +236,7 @@ int Convolution3x3CPUKernel::Run() {
   ret = PostProcess();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Post process failed.";
+    FreeTmpBuffer();
     return ret;
   }
   FreeTmpBuffer();
@@ -40,15 +40,10 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   int RunImpl(int task_id);
   int InitWeightBias();
   int InitTmpBuffer();
-  void ConfigInputOutput();
   int PostProcess();
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (tile_buffer_ != nullptr) {
       ctx_->allocator->Free(tile_buffer_);
       tile_buffer_ = nullptr;
@@ -78,7 +73,6 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
   float *col_buffer_ = nullptr;
   float *nc4hw4_out_ = nullptr;
   TmpBufferAddress tmp_buffer_address_list_[5];
-  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 
 void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
 }  // namespace mindspore::kernel
@@ -184,14 +184,6 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
 #endif
   MS_ASSERT(ctx_->allocator != nullptr);
-  size_t nhwc4_input_size =
-    ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float);
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
-    return RET_MEMORY_FAILED;
-  }
-
   size_t tile_buffer_size = thread_count_ * tile_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
   trans_input_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tile_buffer_size));
   if (trans_input_ == nullptr) {
@@ -298,9 +290,11 @@ int ConvolutionWinogradCPUKernel::ReSize() {
 }
 
 int ConvolutionWinogradCPUKernel::RunImpl(int task_id) {
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<float *>(input_tensor->MutableData());
   auto output_data = reinterpret_cast<float *>(out_tensors_.front()->MutableData());
-  ConvWinogardFp32(reinterpret_cast<float *>(nhwc4_input_), trans_weight_, reinterpret_cast<const float *>(bias_data_),
-                   output_data, tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
+  ConvWinogardFp32(ori_input_data, trans_weight_, reinterpret_cast<const float *>(bias_data_), output_data,
+                   tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
   return RET_OK;
 }
@@ -314,30 +308,6 @@ int ConvolutionWinogradImpl(void *cdata, int task_id) {
   return RET_OK;
 }
 
-int ConvolutionWinogradCPUKernel::PostProcess() {
-  auto out_tensor = out_tensors_.front();
-  auto out_data = reinterpret_cast<float *>(out_tensor->MutableData());
-  auto act_type = conv_param_->act_type_;
-  switch (act_type) {
-    case ActType_No:
-      UnPackWinogradOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                           conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    case ActType_Relu:
-      UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                               conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    case ActType_Relu6:
-      UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
-                                conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupport activation type.";
-      return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int ConvolutionWinogradCPUKernel::Run() {
   auto prepare_ret = Prepare();
   if (prepare_ret != RET_OK) {
@@ -351,11 +321,6 @@ int ConvolutionWinogradCPUKernel::Run() {
     return RET_ERROR;
   }
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionWinogradImpl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]";
@@ -45,15 +45,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
   int InitWeightBias();
   int InitTmpBuffer();
   int ConfigInputOutput();
-  int PostProcess();
   int WinogradFilterTransform(const float *weight_data, float *matrix_g, float *matrix_gt, int oc_block);
 
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (trans_input_ != nullptr) {
       ctx_->allocator->Free(trans_input_);
       trans_input_ = nullptr;
@@ -137,25 +137,15 @@ int ConvolutionInt8CPUKernel::InitTmpBuffer() {
   MS_ASSERT(ctx_->allocator != nullptr);
   int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
   int output_count = conv_param_->output_h_ * conv_param_->output_w_;
-  int output_tile_count = UP_DIV(output_count, tile_num_);
   int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
   int plane_c4 = UP_DIV(kernel_plane, C4NUM);
   int unit_size = plane_c4 * C4NUM * ic4 * C4NUM;
-  int packed_input_size = output_tile_count * tile_num_ * unit_size;
-  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(conv_param_->input_batch_ * packed_input_size));
+  packed_input_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(unit_size * thread_count_ * tile_num_));
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "malloc packed_input_ failed.";
     return RET_ERROR;
   }
-  size_t nhwc4_input_size = ic4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_;
-  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
-  if (nhwc4_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc nhwc4 input failed.";
-    return RET_ERROR;
-  }
   size_t tmp_dst_size = thread_count_ * tile_num_ * conv_param_->output_channel_ * sizeof(int32_t);
   tmp_dst_ = reinterpret_cast<int32_t *>(ctx_->allocator->Malloc(tmp_dst_size));
   if (tmp_dst_ == nullptr) {
@@ -322,15 +312,15 @@ int ConvolutionInt8CPUKernel::ReSize() {
 }
 
 int ConvolutionInt8CPUKernel::RunImpl(int task_id) {
+  auto input_tensor = in_tensors_.at(kInputIndex);
+  auto ori_input_data = reinterpret_cast<int8_t *>(input_tensor->MutableData());
   auto output_addr = reinterpret_cast<int8_t *>(out_tensors_.at(kOutputIndex)->MutableData());
   if (support_optimize_) {
-    ConvInt8Opt(reinterpret_cast<int8_t *>(nhwc4_input_), packed_input_, packed_weight_,
-                reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, output_addr, input_sum_, task_id,
-                conv_param_, gemm_func_);
+    ConvInt8Opt(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_,
+                tmp_out_, output_addr, input_sum_, task_id, conv_param_, gemm_func_);
   } else {
-    ConvInt8(reinterpret_cast<int8_t *>(nhwc4_input_), packed_input_, packed_weight_,
-             reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_, output_addr, input_sum_, task_id,
-             conv_param_);
+    ConvInt8(ori_input_data, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_), tmp_dst_, tmp_out_,
+             output_addr, input_sum_, task_id, conv_param_);
   }
   return RET_OK;
 }
@@ -358,11 +348,6 @@ int ConvolutionInt8CPUKernel::Run() {
     return RET_ERROR;
   }
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto ori_input_data = input_tensor->MutableData();
-  PackNHWCToNHWC4Int8(ori_input_data, nhwc4_input_, conv_param_->input_batch_,
-                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
-
   int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionInt8Impl, this, thread_count_);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]";
@@ -55,10 +55,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
  private:
   void FreeTmpBuffer() {
-    if (nhwc4_input_ != nullptr) {
-      ctx_->allocator->Free(nhwc4_input_);
-      nhwc4_input_ = nullptr;
-    }
     if (packed_input_ != nullptr) {
       ctx_->allocator->Free(packed_input_);
       packed_input_ = nullptr;