!7249 [MS][LITE][CPU] conv 1x1 init time optimization

Merge pull request !7249 from liuzhongkai/conv1x1_asmop
5 years ago · 634cdd3485
--- a/mindspore/lite/nnacl/fp32/matmul.c
+++ b/mindspore/lite/nnacl/fp32/matmul.c
@@ -220,68 +220,104 @@ void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col)
 void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) {
  size_t row8 = row / C8NUM * C8NUM;
  size_t col4 = col / C4NUM * C4NUM;
 #ifdef ENABLE_ARM64
  size_t col_skip = col / C8NUM * C8NUM;
  int skip_size = C8NUM;
 #else
  size_t col_skip = col / C4NUM * C4NUM;
  int skip_size = C4NUM;
 #endif
  float *src_r = src_ptr;
  float *dst_r = dst_ptr;
  size_t ri = 0;
  for (; ri < row8; ri += C8NUM) {
    size_t ci = 0;
    for (; ci < col4; ci += C4NUM) {
    for (; ci < col_skip; ci += skip_size) {
      float *src_c = src_r + ci;
      float *dst_c = dst_r + ci * C8NUM;
      /* 8x4 row-major to col-major */
 #ifdef ENABLE_ARM64
      /* 8x8 row-major to col-major */
      size_t stride = col * sizeof(float);
      asm volatile(
        "mov x10, %[src_c]\n"
        "mov x11, %[dst_c]\n"
        "ld1 {v0.4s}, [x10], %[stride]\n"
        "ld1 {v1.4s}, [x10], %[stride]\n"
        "ld1 {v2.4s}, [x10], %[stride]\n"
        "ld1 {v3.4s}, [x10], %[stride]\n"
        "zip1 v4.4s, v0.4s, v1.4s\n"
        "zip2 v5.4s, v0.4s, v1.4s\n"
        "zip1 v6.4s, v2.4s, v3.4s\n"
        "zip2 v7.4s, v2.4s, v3.4s\n"
        "ld1 {v8.4s},  [x10], %[stride]\n"
        "ld1 {v9.4s},  [x10], %[stride]\n"
        "ld1 {v10.4s}, [x10],  %[stride]\n"
        "ld1 {v11.4s}, [x10],  %[stride]\n"
        "trn1 v0.2d, v4.2d, v6.2d\n"
        "trn2 v1.2d, v4.2d, v6.2d\n"
        "trn1 v2.2d, v5.2d, v7.2d\n"
        "trn2 v3.2d, v5.2d, v7.2d\n"
        "zip1 v12.4s, v8.4s, v9.4s\n"
        "zip2 v13.4s, v8.4s, v9.4s\n"
        "zip1 v14.4s, v10.4s, v11.4s\n"
        "zip2 v15.4s, v10.4s, v11.4s\n"
        "trn1 v8.2d, v12.2d, v14.2d\n"
        "trn2 v9.2d, v12.2d, v14.2d\n"
        "trn1 v10.2d, v13.2d, v15.2d\n"
        "trn2 v11.2d, v13.2d, v15.2d\n"
        "st1 {v0.4s}, [x11],  #16\n"
        "st1 {v8.4s}, [x11],  #16\n"
        "st1 {v1.4s}, [x11],  #16\n"
        "st1 {v9.4s}, [x11],  #16\n"
        "st1 {v2.4s},  [x11],#16\n"
        "st1 {v10.4s}, [x11], #16\n"
        "st1 {v3.4s},  [x11],#16\n"
        "st1 {v11.4s}, [x11], #16\n"
        "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n"
        "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n"
        "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n"
        "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n"
        "zip1 v8.4s, v0.4s, v2.4s\n"
        "zip2 v9.4s, v0.4s, v2.4s\n"
        "zip1 v10.4s, v4.4s, v6.4s\n"
        "zip2 v11.4s, v4.4s, v6.4s\n"
        "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n"
        "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n"
        "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n"
        "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n"
        "zip1 v12.4s, v1.4s, v3.4s\n"
        "zip2 v13.4s, v1.4s, v3.4s\n"
        "zip1 v14.4s, v5.4s, v7.4s\n"
        "zip2 v15.4s, v5.4s, v7.4s\n"
        "trn1 v0.2d, v8.2d, v10.2d\n"
        "trn2 v1.2d, v8.2d, v10.2d\n"
        "trn1 v2.2d, v9.2d, v11.2d\n"
        "trn2 v3.2d, v9.2d, v11.2d\n"
        "zip1 v24.4s, v16.4s, v18.4s\n"
        "zip2 v25.4s, v16.4s, v18.4s\n"
        "zip1 v26.4s, v20.4s, v22.4s\n"
        "zip2 v27.4s, v20.4s, v22.4s\n"
        "trn1 v4.2d, v12.2d, v14.2d\n"
        "trn2 v5.2d, v12.2d, v14.2d\n"
        "trn1 v6.2d, v13.2d, v15.2d\n"
        "trn2 v7.2d, v13.2d, v15.2d\n"
        "zip1 v28.4s, v17.4s, v19.4s\n"
        "zip2 v29.4s, v17.4s, v19.4s\n"
        "zip1 v30.4s, v21.4s, v23.4s\n"
        "zip2 v31.4s, v21.4s, v23.4s\n"
        "trn1 v16.2d, v24.2d, v26.2d\n"
        "trn2 v17.2d, v24.2d, v26.2d\n"
        "trn1 v18.2d, v25.2d, v27.2d\n"
        "trn2 v19.2d, v25.2d, v27.2d\n"
        "trn1 v20.2d, v28.2d, v30.2d\n"
        "trn2 v21.2d, v28.2d, v30.2d\n"
        "trn1 v22.2d, v29.2d, v31.2d\n"
        "trn2 v23.2d, v29.2d, v31.2d\n"
        "st1 {v0.4s}, [x11], #16\n"
        "st1 {v16.4s}, [x11], #16\n"
        "st1 {v1.4s}, [x11], #16\n"
        "st1 {v17.4s}, [x11], #16\n"
        "st1 {v2.4s}, [x11], #16\n"
        "st1 {v18.4s}, [x11], #16\n"
        "st1 {v3.4s}, [x11], #16\n"
        "st1 {v19.4s}, [x11], #16\n"
        "st1 {v4.4s}, [x11], #16\n"
        "st1 {v20.4s}, [x11], #16\n"
        "st1 {v5.4s}, [x11], #16\n"
        "st1 {v21.4s}, [x11], #16\n"
        "st1 {v6.4s}, [x11], #16\n"
        "st1 {v22.4s}, [x11], #16\n"
        "st1 {v7.4s}, [x11], #16\n"
        "st1 {v23.4s}, [x11], #16\n"
        :
        : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
        : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
          "v15");
          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
          "v30", "v31");
 #elif ENABLE_ARM32
      /* 8x4 row-major to col-major */
      size_t stride = col * sizeof(float);
      asm volatile(
        "mov r10, %[src_c]\n"
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -85,14 +85,14 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
  auto input_channel = weight_tensor->Channel();
  auto output_channel = weight_tensor->Batch();
  size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
  bias_data_ = malloc(size);
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
    return RET_ERROR;
  }
  memset(bias_data_, 0, size);
  if (in_tensors_.size() == 3) {
    size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
    size_t weight_size = output_channel * sizeof(float16_t);
    bias_data_ = malloc(size);
    if (bias_data_ == nullptr) {
      MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
      return RET_ERROR;
    }
    auto bias_tensor = in_tensors_.at(kBiasIndex);
    if (bias_tensor->data_type() == kNumberTypeFloat16) {
      memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t));
@@ -100,15 +100,17 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
      Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()), reinterpret_cast<float16_t *>(bias_data_),
                       output_channel);
    }
    memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
  }
  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
  size_t size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
  size_t down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float16_t);
  weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size));
  if (weight_ptr_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
    return RET_ERROR;
  }
  memset(weight_ptr_, 0, size);
  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
  ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel,
                         weight_tensor->data_type() == kNumberTypeFloat16);
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
@@ -71,24 +71,26 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
  auto input_channel = filter_tensor->Channel();
  auto output_channel = filter_tensor->Batch();
  int size = UP_ROUND(output_channel, C8NUM) * sizeof(float);
  bias_data_ = malloc(size);
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
    return RET_ERROR;
  }
  memset(bias_data_, 0, size);
  if (in_tensors_.size() == 3) {
    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(float));
    int size = UP_ROUND(output_channel, C8NUM) * sizeof(float);
    int weight_size = output_channel * sizeof(float);
    bias_data_ = malloc(size);
    if (bias_data_ == nullptr) {
      MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
      return RET_ERROR;
    }
    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size);
    memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
  }
  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
  int size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
  int down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float);
  weight_ptr_ = reinterpret_cast<float *>(malloc(size));
  if (weight_ptr_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
    return RET_ERROR;
  }
  memset(weight_ptr_, 0, size);
  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
  RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
                     input_channel);
  return RET_OK;
@@ -141,10 +143,10 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id;
  MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
            output_ptr_ + task_id * thread_stride_, reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id,
            matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, cur_oc, matmul_param_->col_,
            OutType_Nhwc);
            output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
            matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
  return RET_OK;
 }
@@ -178,7 +180,6 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
  MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
            matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
            OutType_Nhwc);
  return RET_OK;
 }