optimize fp16 common conv preprocess

5 years ago · 06142a330b
--- a/mindspore/lite/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_fp16.c
@@ -344,7 +344,6 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
  int channel_block = UP_DIV(in_channel, C4NUM);
  int kernel_plane = kernel_h * kernel_w;
  int unit_size = kernel_plane * channel_block * C4NUM;
  int packed_input_size = output_tile_count * tile_n * unit_size;

  // we accumulate 4 channels per time for input blocks
  int ic4 = UP_DIV(in_channel, C4NUM);
@@ -355,11 +354,10 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
  for (int b = 0; b < in_batch; b++) {
    int in_batch_offset = b * ic4 * C4NUM * in_h * in_w;
    int out_batch_offset = b * out_channel * out_h * out_w;
    int gemm_in_batch_offset = b * packed_input_size;
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * tile_n;
      int real_cal_num = (output_count - start_index) < tile_n ? (output_count - start_index) : tile_n;
      float16_t *gemm_input = (float16_t *)(packed_input + thread_id * unit_size * tile_n + gemm_in_batch_offset);
      float16_t *gemm_input = (float16_t *)(packed_input + task_id * unit_size * tile_n);
      Im2ColPackUnitFp16(input_data + in_batch_offset, conv_param, gemm_input, real_cal_num, start_index);

      int out_offset = thread_id * tile_n * out_channel + out_batch_offset;
--- a/mindspore/lite/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.c
@@ -55,23 +55,24 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
  int in_w = conv_param->input_w_;
  int out_w = conv_param->output_w_;
  int ic4 = UP_DIV(in_channel, 4);
  int ic4_minus = in_channel / 4;
  memset(packed_input, 0, kernel_w * kernel_h * ic4 * C4NUM * 16 * sizeof(float16_t));

  for (int i = 0; i < real_cal_num; i++) {
    int block_start = block_index + i;
    int input_h = block_start / out_w * stride_h - pad_h;
    int input_w = block_start % out_w * stride_w - pad_w;
    int input_stride = input_h * in_w * ic4 * C4NUM + input_w * ic4 * C4NUM;
    int input_stride = (input_h * in_w + input_w) * in_channel;
    int kh_s = MSMAX(0, UP_DIV(-input_h, dilation_h));
    int kh_e = MSMIN(kernel_h, UP_DIV(in_h - input_h, dilation_h));
    int kw_s = MSMAX(0, UP_DIV(-input_w, dilation_w));
    int kw_e = MSMIN(kernel_w, UP_DIV(in_w - input_w, dilation_w));
    for (int j = kh_s; j < kh_e; j++) {
      int input_y_stride = j * dilation_h * in_w * ic4 * C4NUM + input_stride;
      int input_y_stride = j * dilation_h * in_w * in_channel + input_stride;
      for (int n = kw_s; n < kw_e; n++) {
        int input_x_stride = input_y_stride + n * dilation_w * ic4 * C4NUM;
        int input_x_stride = input_y_stride + n * dilation_w * in_channel;
        int input_plane_offset = (j * kernel_w + n) * 16 * C4NUM * ic4 + i * C4NUM;
        for (int m = 0; m < ic4; m++) {
        for (int m = 0; m < ic4_minus; m++) {
          int channel_block_stride = input_x_stride + m * C4NUM;
          int channel_block_offset = input_plane_offset + m * 16 * C4NUM;
 #ifdef ENABLE_ARM64
@@ -82,9 +83,15 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
          }
 #endif
        }  // channel_block loop
      }    // kernel_w loop
    }      // kernel_h loop
  }        // tile num loop
        int ic_res = in_channel - ic4_minus * C4NUM;
        for (int l = 0; l < ic_res; ++l) {
          int channel_block_stride = input_x_stride + ic4_minus * C4NUM + l;
          int channel_block_offset = input_plane_offset + ic4_minus * 16 * C4NUM + l;
          packed_input[channel_block_offset] = input_data[channel_block_stride];
        }
      }  // kernel_w loop
    }    // kernel_h loop
  }      // tile num loop
 }

 void PackWeightFp16(float16_t *weight_data, ConvParameter *conv_param, float16_t *packed_weight) {
@@ -334,7 +341,8 @@ void PackNHWCToNCHWFp16(const void *src, void *dst, int batches, int plane, int
          "st1 {v27.8h}, [x11], %[dstStride]\n"
          "st1 {v31.8h}, [x10], %[dstStride]\n"
          :
          : [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          :
          [ dst_ptr ] "r"(dst_ptr), [ src_ptr ] "r"(src_ptr), [ srcStride ] "r"(srcStride), [ dstStride ] "r"(dstStride)
          : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
            "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
            "v30", "v31");
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -78,6 +78,7 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
  int plane_c4 = UP_DIV(kernel_plane, C4NUM);
  int pack_weight_size = oc4 * C4NUM * ic4 * C4NUM * plane_c4 * C4NUM;
  int block_size = pack_weight_size / oc4;
  QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_;

  for (int m = 0; m < kernel_plane; m++) {
    int kernel_plane_stride = m * in_channel;
@@ -101,7 +102,13 @@ void PackWeightInt8(int8_t *weight_data, ConvParameter *conv_param, int8_t *pack
            int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
            int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM * C4NUM;
            *packed_data_ptr = origin_data_ptr[0];
            weight_sum[j * C4NUM + k] += (int32_t)packed_data_ptr[0];
            int32_t f_zp;
            if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
              f_zp = filter_args[j * C4NUM + k].zp_;
            } else {
              f_zp = filter_args[0].zp_;
            }
            weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp);
          }
        }  // kernel block loop
      }    // inchannel block loop
@@ -121,6 +128,7 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
  int pack_weight_size = oc4 * ic4 * C4NUM * C4NUM * kernel_plane;
  int unit_size = C4NUM * C4NUM;
  int block_size = pack_weight_size / oc4;
  QuantArg *filter_args = conv_param->conv_quant_arg_.filter_quant_args_;

  for (int m = 0; m < kernel_plane; m++) {
    int kernel_plane_stride = m * in_channel;
@@ -142,7 +150,13 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p
            int8_t *origin_data_ptr = weight_data + kernel_block_stride + k * kernel_plane * in_channel;
            int8_t *packed_data_ptr = packed_weight + packed_kernel_block_size + k * C4NUM;
            *packed_data_ptr = origin_data_ptr[0];
            weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0]);
            int32_t f_zp;
            if (conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL) {
              f_zp = filter_args[j * C4NUM + k].zp_;
            } else {
              f_zp = filter_args[0].zp_;
            }
            weight_sum[j * C4NUM + k] += (int32_t)(packed_data_ptr[0] - f_zp);
          }
        }  // kernel block loop
      }    // inchannel block loop
@@ -400,6 +414,9 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
          packed_input[channel_block_offset] = input_data[channel_block_stride];
          input_accumulator += (packed_input + channel_block_offset)[0];
        }
        for (int l = 0; l < (C4NUM - ic_res); l++) {
          input_accumulator += conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
        }
      }  // kernel_w loop
    }    // kernel_h loop
    if (!(conv_param->conv_quant_arg_.asymmetric_ & FILTER_ASYMMETRIC)) {
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -84,53 +84,29 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
 }

 // Allocates the scratch buffers used by the fp16 convolution through ctx_->allocator:
 // packed_input_, nhwc4_input_ and tmp_output_block_.
 // Returns RET_OK when every allocation succeeds. On the first failed allocation it logs an
 // error and returns RET_ERROR; buffers allocated earlier in this function are not released here.
 int ConvolutionFP16CPUKernel::InitTmpBuffer() {
  int in_batch = conv_param_->input_batch_;
  int in_channel = conv_param_->input_channel_;
  int out_channel = conv_param_->output_channel_;
  // Input channels rounded up to whole groups of C4NUM.
  int channel_block = UP_DIV(in_channel, C4NUM);
  // Number of output positions per tile; used for every size computation below.
  int cal_num = 16;
  int output_count = conv_param_->output_h_ * conv_param_->output_w_;
  int output_tile_count = UP_DIV(output_count, cal_num);
  int kernel_plane = conv_param_->kernel_h_ * conv_param_->kernel_w_;
  // Element count of one packed output position: kernel window times padded channel count.
  int unit_size = kernel_plane * channel_block * C4NUM;
  // NOTE(review): packed_input_size is declared twice in this listing. The two lines look like the
  // before/after variants of one change (sized per output tile vs. per thread) - confirm which applies.
  int packed_input_size = output_tile_count * cal_num * unit_size;
  int packed_input_size = thread_count_ * cal_num * unit_size;

  // NOTE(review): packed_input_ is likewise assigned twice (one size scaled by in_batch, one not);
  // presumably only one of the two allocations belongs to the current code - confirm.
  packed_input_ =
    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(in_batch * packed_input_size * sizeof(float16_t)));
  packed_input_ = reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(packed_input_size * sizeof(float16_t)));
  if (packed_input_ == nullptr) {
    MS_LOG(ERROR) << "malloc packed_input_ failed.";
    return RET_ERROR;
  }

  // Byte size of the whole input batch with channels padded up to a multiple of C4NUM.
  size_t nhwc4_input_size =
    channel_block * C4NUM * in_batch * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
  nhwc4_input_ = ctx_->allocator->Malloc(nhwc4_input_size);
  if (nhwc4_input_ == nullptr) {
    MS_LOG(ERROR) << "malloc nhwc4_input_ failed.";
    return RET_ERROR;
  }

  // One block of cal_num * out_channel elements per thread.
  tmp_output_block_ =
    reinterpret_cast<float16_t *>(ctx_->allocator->Malloc(thread_count_ * cal_num * out_channel * sizeof(float16_t)));
  if (tmp_output_block_ == nullptr) {
    MS_LOG(ERROR) << "malloc tmp_output_block_ failed.";
    return RET_ERROR;
  }

  return RET_OK;
 }

 // Looks up the layout-conversion function from the input tensor's current format to NHWC4
 // and stores it in convert_func_.
 // The function returns void, so a failed lookup is only logged: convert_func_ is left as
 // nullptr and the caller receives no error code.
 void ConvolutionFP16CPUKernel::ConfigInputOutput() {
  auto input_tensor = in_tensors_.at(kInputIndex);
  auto input_format = input_tensor->GetFormat();
  // Target layout is fixed to NHWC4.
  schema::Format execute_format = schema::Format::Format_NHWC4;
  convert_func_ = LayoutTransformFp16(input_format, execute_format);
  if (convert_func_ == nullptr) {
    MS_LOG(ERROR) << "layout convert func is nullptr.";
    return;
  }
 }

 int ConvolutionFP16CPUKernel::Init() {
  auto ret = InitWeightBias();
  if (ret != RET_OK) {
@@ -140,7 +116,6 @@ int ConvolutionFP16CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  ConfigInputOutput();
  return ReSize();
 }

@@ -160,8 +135,8 @@ int ConvolutionFP16CPUKernel::ReSize() {
 }

 // Runs ConvFp16 for the work slice identified by task_id, passing the kernel's packed weights,
 // bias and scratch buffers. Always returns RET_OK; ConvFp16's result, if any, is not inspected.
 int ConvolutionFP16CPUKernel::RunImpl(int task_id) {
  // NOTE(review): two ConvFp16 calls appear in this listing. The first reads from nhwc4_input_,
  // the second from execute_input_; they look like the before/after variants of one change
  // rather than two intended invocations - confirm which applies.
  ConvFp16(reinterpret_cast<float16_t *>(nhwc4_input_), packed_input_, packed_weight_,
           reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_, execute_output_, task_id, conv_param_);
  ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), tmp_output_block_,
           execute_output_, task_id, conv_param_);
  return RET_OK;
 }

@@ -194,12 +169,6 @@ int ConvolutionFP16CPUKernel::Run() {
    return RET_ERROR;
  }

  int in_batch = conv_param_->input_batch_;
  int in_h = conv_param_->input_h_;
  int in_w = conv_param_->input_w_;
  int in_channel = conv_param_->input_channel_;
  convert_func_(reinterpret_cast<void *>(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel);

  int error_code = ParallelLaunch(this->context_->thread_pool_, ConvolutionFp16Impl, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]";
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
@@ -46,14 +46,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  int RunImpl(int task_id);
  int InitWeightBias();
  int InitTmpBuffer();
  void ConfigInputOutput();

 private:
  void FreeTmpBuffer() {
    if (nhwc4_input_ != nullptr) {
      ctx_->allocator->Free(nhwc4_input_);
      nhwc4_input_ = nullptr;
    }
    if (packed_input_ != nullptr) {
      ctx_->allocator->Free(packed_input_);
      packed_input_ = nullptr;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -236,12 +236,8 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &
  if (kernel_h == 1 && kernel_w == 1) {
    kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  } else if (use_winograd) {
    if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) {
      kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
    } else {
      kernel = new (std::nothrow)
        kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
    }
    kernel =
      new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
  } else {
    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  }
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -89,7 +89,13 @@ int ConvolutionInt8CPUKernel::InitWeightBias() {
    MS_LOG(ERROR) << "malloc weight_sum failed.";
    return RET_ERROR;
  }
  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
  for (int i = 0; i < output_channel; i++) {
    if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
    } else {
      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
    }
  }
  PackWeightInt8(origin_weight, conv_param_, packed_weight_, weight_sum);

  // init bias
@@ -190,7 +196,13 @@ int ConvolutionInt8CPUKernel::InitWeightBiasOpt() {
    MS_LOG(ERROR) << "malloc weight_sum failed.";
    return RET_ERROR;
  }
  for (int i = 0; i < output_channel; i++) weight_sum[i] = 0;
  for (int i = 0; i < output_channel; i++) {
    if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) {
      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[i].zp_;
    } else {
      weight_sum[i] = ic4 * C4NUM * kernel_plane * filter_arg[0].zp_;
    }
  }
  PackWeightInt8Opt(origin_weight, conv_param_, packed_weight_, weight_sum);

  // init bias
@@ -261,14 +273,7 @@ int ConvolutionInt8CPUKernel::InitTmpBufferOpt() {
  return RET_OK;
 }

 // Marks the kernel's output tensor (out_tensors_ at kOutputIndex) as using the NHWC format.
 void ConvolutionInt8CPUKernel::ConfigInputOutput() {
  auto output_tensor = out_tensors_.at(kOutputIndex);
  output_tensor->SetFormat(schema::Format::Format_NHWC);
 }

 int ConvolutionInt8CPUKernel::Init() {
  // config input output
  ConfigInputOutput();
  CheckSupportOptimize();
  auto ret = SetQuantParam();
  if (ret != RET_OK) {
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.h
@@ -51,7 +51,6 @@ class ConvolutionInt8CPUKernel : public ConvolutionBaseCPUKernel {
  int InitTmpBufferOpt();
  int InitWeightBias();
  int InitTmpBuffer();
  void ConfigInputOutput();

 private:
  void FreeTmpBuffer() {