From 8213de44347d29bcb8f38888de5def6e676bde93 Mon Sep 17 00:00:00 2001
From: liuzhongkai
Date: Tue, 13 Oct 2020 09:55:23 +0800
Subject: [PATCH] conv1x1 init time optimize

---
 mindspore/lite/nnacl/fp32/matmul.c          | 120 ++++++++++++------
 .../kernel/arm/fp16/convolution_1x1_fp16.cc |  20 +--
 .../kernel/arm/fp32/convolution_1x1.cc      |  29 +++--
 3 files changed, 104 insertions(+), 65 deletions(-)

diff --git a/mindspore/lite/nnacl/fp32/matmul.c b/mindspore/lite/nnacl/fp32/matmul.c
index ed0eb1c633..dd2a7a77ec 100644
--- a/mindspore/lite/nnacl/fp32/matmul.c
+++ b/mindspore/lite/nnacl/fp32/matmul.c
@@ -220,68 +220,104 @@ void RowMajor2Col12Major(float *src_ptr, float *dst_ptr, size_t row, size_t col)
 void RowMajor2Col8Major(float *src_ptr, float *dst_ptr, size_t row, size_t col) {
   size_t row8 = row / C8NUM * C8NUM;
-  size_t col4 = col / C4NUM * C4NUM;
+#ifdef ENABLE_ARM64
+  size_t col_skip = col / C8NUM * C8NUM;
+  int skip_size = C8NUM;
+#else
+  size_t col_skip = col / C4NUM * C4NUM;
+  int skip_size = C4NUM;
+#endif
   float *src_r = src_ptr;
   float *dst_r = dst_ptr;
 
   size_t ri = 0;
   for (; ri < row8; ri += C8NUM) {
     size_t ci = 0;
-    for (; ci < col4; ci += C4NUM) {
+    for (; ci < col_skip; ci += skip_size) {
       float *src_c = src_r + ci;
       float *dst_c = dst_r + ci * C8NUM;
-      /* 8x4 row-major to col-major */
 #ifdef ENABLE_ARM64
+      /* 8x8 row-major to col-major */
       size_t stride = col * sizeof(float);
       asm volatile(
         "mov x10, %[src_c]\n"
         "mov x11, %[dst_c]\n"
 
-        "ld1 {v0.4s}, [x10], %[stride]\n"
-        "ld1 {v1.4s}, [x10], %[stride]\n"
-        "ld1 {v2.4s}, [x10], %[stride]\n"
-        "ld1 {v3.4s}, [x10], %[stride]\n"
-
-        "zip1 v4.4s, v0.4s, v1.4s\n"
-        "zip2 v5.4s, v0.4s, v1.4s\n"
-        "zip1 v6.4s, v2.4s, v3.4s\n"
-        "zip2 v7.4s, v2.4s, v3.4s\n"
-
-        "ld1 {v8.4s}, [x10], %[stride]\n"
-        "ld1 {v9.4s}, [x10], %[stride]\n"
-        "ld1 {v10.4s}, [x10], %[stride]\n"
-        "ld1 {v11.4s}, [x10], %[stride]\n"
-
-        "trn1 v0.2d, v4.2d, v6.2d\n"
-        "trn2 v1.2d, v4.2d, v6.2d\n"
-        "trn1 v2.2d, v5.2d, v7.2d\n"
-        "trn2 v3.2d, v5.2d, v7.2d\n"
-
-        "zip1 v12.4s, v8.4s, v9.4s\n"
-        "zip2 v13.4s, v8.4s, v9.4s\n"
-        "zip1 v14.4s, v10.4s, v11.4s\n"
-        "zip2 v15.4s, v10.4s, v11.4s\n"
-
-        "trn1 v8.2d, v12.2d, v14.2d\n"
-        "trn2 v9.2d, v12.2d, v14.2d\n"
-        "trn1 v10.2d, v13.2d, v15.2d\n"
-        "trn2 v11.2d, v13.2d, v15.2d\n"
-
-        "st1 {v0.4s}, [x11], #16\n"
-        "st1 {v8.4s}, [x11], #16\n"
-        "st1 {v1.4s}, [x11], #16\n"
-        "st1 {v9.4s}, [x11], #16\n"
-        "st1 {v2.4s}, [x11],#16\n"
-        "st1 {v10.4s}, [x11], #16\n"
-        "st1 {v3.4s}, [x11],#16\n"
-        "st1 {v11.4s}, [x11], #16\n"
+        "ld1 {v0.4s, v1.4s}, [x10], %[stride]\n"
+        "ld1 {v2.4s, v3.4s}, [x10], %[stride]\n"
+        "ld1 {v4.4s, v5.4s}, [x10], %[stride]\n"
+        "ld1 {v6.4s, v7.4s}, [x10], %[stride]\n"
+
+        "zip1 v8.4s, v0.4s, v2.4s\n"
+        "zip2 v9.4s, v0.4s, v2.4s\n"
+        "zip1 v10.4s, v4.4s, v6.4s\n"
+        "zip2 v11.4s, v4.4s, v6.4s\n"
+
+        "ld1 {v16.4s, v17.4s}, [x10], %[stride]\n"
+        "ld1 {v18.4s, v19.4s}, [x10], %[stride]\n"
+        "ld1 {v20.4s, v21.4s}, [x10], %[stride]\n"
+        "ld1 {v22.4s, v23.4s}, [x10], %[stride]\n"
+
+        "zip1 v12.4s, v1.4s, v3.4s\n"
+        "zip2 v13.4s, v1.4s, v3.4s\n"
+        "zip1 v14.4s, v5.4s, v7.4s\n"
+        "zip2 v15.4s, v5.4s, v7.4s\n"
+
+        "trn1 v0.2d, v8.2d, v10.2d\n"
+        "trn2 v1.2d, v8.2d, v10.2d\n"
+        "trn1 v2.2d, v9.2d, v11.2d\n"
+        "trn2 v3.2d, v9.2d, v11.2d\n"
+
+        "zip1 v24.4s, v16.4s, v18.4s\n"
+        "zip2 v25.4s, v16.4s, v18.4s\n"
+        "zip1 v26.4s, v20.4s, v22.4s\n"
+        "zip2 v27.4s, v20.4s, v22.4s\n"
+
+        "trn1 v4.2d, v12.2d, v14.2d\n"
+        "trn2 v5.2d, v12.2d, v14.2d\n"
+        "trn1 v6.2d, v13.2d, v15.2d\n"
+        "trn2 v7.2d, v13.2d, v15.2d\n"
+
+        "zip1 v28.4s, v17.4s, v19.4s\n"
+        "zip2 v29.4s, v17.4s, v19.4s\n"
+        "zip1 v30.4s, v21.4s, v23.4s\n"
+        "zip2 v31.4s, v21.4s, v23.4s\n"
+
+        "trn1 v16.2d, v24.2d, v26.2d\n"
+        "trn2 v17.2d, v24.2d, v26.2d\n"
+        "trn1 v18.2d, v25.2d, v27.2d\n"
+        "trn2 v19.2d, v25.2d, v27.2d\n"
+
+        "trn1 v20.2d, v28.2d, v30.2d\n"
+        "trn2 v21.2d, v28.2d, v30.2d\n"
+        "trn1 v22.2d, v29.2d, v31.2d\n"
+        "trn2 v23.2d, v29.2d, v31.2d\n"
+
+        "st1 {v0.4s}, [x11], #16\n"
+        "st1 {v16.4s}, [x11], #16\n"
+        "st1 {v1.4s}, [x11], #16\n"
+        "st1 {v17.4s}, [x11], #16\n"
+        "st1 {v2.4s}, [x11], #16\n"
+        "st1 {v18.4s}, [x11], #16\n"
+        "st1 {v3.4s}, [x11], #16\n"
+        "st1 {v19.4s}, [x11], #16\n"
+        "st1 {v4.4s}, [x11], #16\n"
+        "st1 {v20.4s}, [x11], #16\n"
+        "st1 {v5.4s}, [x11], #16\n"
+        "st1 {v21.4s}, [x11], #16\n"
+        "st1 {v6.4s}, [x11], #16\n"
+        "st1 {v22.4s}, [x11], #16\n"
+        "st1 {v7.4s}, [x11], #16\n"
+        "st1 {v23.4s}, [x11], #16\n"
 
         :
         : [ dst_c ] "r"(dst_c), [ src_c ] "r"(src_c), [ stride ] "r"(stride)
         : "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
-          "v15");
+          "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31");
 #elif ENABLE_ARM32
+      /* 8x4 row-major to col-major */
       size_t stride = col * sizeof(float);
       asm volatile(
         "mov r10, %[src_c]\n"
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 0948bc4e63..e5f62f8a46 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -85,14 +85,14 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
   auto input_channel = weight_tensor->Channel();
   auto output_channel = weight_tensor->Batch();
 
-  size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
-  bias_data_ = malloc(size);
-  if (bias_data_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
-    return RET_ERROR;
-  }
-  memset(bias_data_, 0, size);
   if (in_tensors_.size() == 3) {
+    size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+    size_t weight_size = output_channel * sizeof(float16_t);
+    bias_data_ = malloc(size);
+    if (bias_data_ == nullptr) {
+      MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
+      return RET_ERROR;
+    }
     auto bias_tensor = in_tensors_.at(kBiasIndex);
     if (bias_tensor->data_type() == kNumberTypeFloat16) {
       memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t));
@@ -100,15 +100,17 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
       Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()),
                        reinterpret_cast<float16_t *>(bias_data_), output_channel);
     }
+    memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+  size_t size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+  size_t down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float16_t);
   weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
     return RET_ERROR;
   }
-  memset(weight_ptr_, 0, size);
+  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
   ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel,
                          weight_tensor->data_type() == kNumberTypeFloat16);
   return RET_OK;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
index 4da5295f5a..d4478d4702 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
@@ -71,24 +71,26 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
   auto input_channel = filter_tensor->Channel();
   auto output_channel = filter_tensor->Batch();
 
-  int size = UP_ROUND(output_channel, C8NUM) * sizeof(float);
-  bias_data_ = malloc(size);
-  if (bias_data_ == nullptr) {
-    MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
-    return RET_ERROR;
-  }
-  memset(bias_data_, 0, size);
   if (in_tensors_.size() == 3) {
-    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), output_channel * sizeof(float));
+    int size = UP_ROUND(output_channel, C8NUM) * sizeof(float);
+    int weight_size = output_channel * sizeof(float);
+    bias_data_ = malloc(size);
+    if (bias_data_ == nullptr) {
+      MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
+      return RET_ERROR;
+    }
+    memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size);
+    memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
   }
 
-  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
+  int size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
+  int down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float);
   weight_ptr_ = reinterpret_cast<float *>(malloc(size));
   if (weight_ptr_ == nullptr) {
    MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
     return RET_ERROR;
   }
-  memset(weight_ptr_, 0, size);
+  memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
   RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
                      input_channel);
   return RET_OK;
@@ -141,10 +143,10 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) {
   if (cur_oc <= 0) {
     return RET_OK;
   }
+  auto bias = (bias_data_ == nullptr) ? nullptr : reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id;
   MatMulOpt(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
-            output_ptr_ + task_id * thread_stride_, reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id,
-            matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, cur_oc, matmul_param_->col_,
-            OutType_Nhwc);
+            output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
+            matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
   return RET_OK;
 }
 
@@ -178,7 +180,6 @@ int Convolution1x1CPUKernel::DoConv1x1Hw(int task_id) {
 
   MatMulOpt(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float *>(bias_data_),
             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
             OutType_Nhwc);
-
   return RET_OK;
 }
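
Both kernels get their init-time win from the same pattern: the packed weight buffer is padded up to a multiple of C8NUM output channels, and only the padding tail past down_size needs pre-zeroing, because the packing routine (RowMajor2Col8Major / ColMajor2Row8MajorFp16) overwrites the leading down_size bytes anyway. Below is a minimal standalone C sketch of that pattern, assuming UP_ROUND and DOWN_DIV carry their usual nnacl round-up/round-down definitions; AllocPackedWeight is a hypothetical helper for illustration, not part of the patch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Assumed to mirror the nnacl macros: round up / truncate to a multiple. */
#define UP_ROUND(x, y) (((x) + (y) - 1) / (y) * (y))
#define DOWN_DIV(x, y) ((x) / (y))
#define C8NUM 8

/* Allocate a packed conv1x1 weight buffer, zeroing only the padded tail.
 * The first down_size bytes hold complete 8-channel blocks that the packing
 * step fully overwrites, so memset'ing them up front (as the old code did)
 * is wasted work; only the final partial block needs pre-zeroing. */
static float *AllocPackedWeight(size_t input_channel, size_t output_channel) {
  size_t size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float);
  size_t down_size = input_channel * DOWN_DIV(output_channel, C8NUM) * C8NUM * sizeof(float);
  float *weight = (float *)malloc(size);
  if (weight == NULL) {
    return NULL;
  }
  memset((char *)weight + down_size, 0, size - down_size);
  return weight;
}

int main(void) {
  size_t ic = 16, oc = 21; /* 21 output channels pad up to 24 */
  float *w = AllocPackedWeight(ic, oc);
  if (w == NULL) {
    return 1;
  }
  /* The old code zeroed ic * 24 floats; the patch zeroes only ic * (24 - 16). */
  printf("zeroed %zu of %zu floats\n", ic * (UP_ROUND(oc, C8NUM) - DOWN_DIV(oc, C8NUM) * C8NUM),
         ic * UP_ROUND(oc, C8NUM));
  free(w);
  return 0;
}

The bias change follows the same reasoning: the buffer is allocated only when a bias tensor actually exists, the live output_channel entries are copied in, and only the padding between weight_size and the rounded-up size is memset.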