From 0fac817a2d49698aee7badbc9ee2772e755f8c6c Mon Sep 17 00:00:00 2001
From: ling <lingqiaomin.huawei.com>
Date: Sat, 22 Aug 2020 11:31:57 +0800
Subject: [PATCH] [MS][LITE][Develop]Fp16 conv1x1 bug

---
 mindspore/lite/nnacl/fp16/matmul_fp16.c       | 26 ++++++++++---
 mindspore/lite/nnacl/fp16/matmul_fp16.h       |  2 +-
 .../kernel/arm/fp16/convolution_1x1_fp16.cc   | 38 ++++++++++++-------
 .../kernel/arm/fp16/convolution_1x1_fp16.h    |  6 +--
 .../kernel/arm/fp16/convolution_base_fp16.cc  |  9 +++++
 .../kernel/arm/fp16/convolution_base_fp16.h   |  2 +-
 .../kernel/arm/fp16/convolution_fp16.cc       |  3 +-
 7 files changed, 59 insertions(+), 27 deletions(-)
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c
index 7d0b785fd2..3181feb978 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -15,14 +15,28 @@
  */
 
 #include "nnacl/fp16/matmul_fp16.h"
-void ColMajor2Row8MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) {
-  for (int r = 0; r < row; r++) {
-    for (int c = 0; c < col; c++) {
-      int cd8 = c / 8;
-      int cm8 = c % 8;
-      dst_ptr[cd8 * 8 * row + r * 8 + cm8] = src_ptr[c * row + r];
+
+// Repacks a column-major row x col matrix (element (r, c) read from src_ptr[c * row + r]) into
+// 8-column blocks: element (r, c) is written to dst_ptr[(c / 8) * 8 * row + r * 8 + c % 8].
+// src_ptr is read as float16_t when src_float16 is true, otherwise as float and cast to float16_t.
+// When col is not a multiple of 8 the unused lanes of the last block are not written here.
+// Counters are size_t to match row/col and keep the index arithmetic unsigned.
+void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16) {
+  if (src_float16) {
+    const float16_t *src = (const float16_t *)src_ptr;
+    for (size_t r = 0; r < row; r++) {
+      for (size_t c = 0; c < col; c++) {
+        dst_ptr[(c / 8) * 8 * row + r * 8 + c % 8] = src[c * row + r];
+      }
+    }
+  } else {
+    const float *src = (const float *)src_ptr;
+    for (size_t r = 0; r < row; r++) {
+      for (size_t c = 0; c < col; c++) {
+        dst_ptr[(c / 8) * 8 * row + r * 8 + c % 8] = (float16_t)src[c * row + r];
+      }
     }
   }
 }
 
 void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type,
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h
index fae70ba61d..0f9212cae8 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -32,7 +32,7 @@ extern "C" {
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                 int depth, int row, int col, int stride, bool write_nhwc);
 
-void ColMajor2Row8MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col);
+void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
 
 void RowMajor2Col16MajorFp16(float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col);
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 7f56320630..1bd9c81f36 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -74,31 +74,36 @@ int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
 }
 
 int Convolution1x1FP16CPUKernel::InitWeightBias() {
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute filter failed.";
-    return ret;
-  }
+  auto weight_tensor = in_tensors_.at(kWeightIndex);
+  auto input_channel = weight_tensor->Channel();
+  auto output_channel = weight_tensor->Batch();  // rounded up to C8NUM for both buffers below
 
-  bias_data_ = malloc(matmul_param_->col_8_ * sizeof(float16_t));
+  size_t size = UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+  bias_data_ = malloc(size);
   if (bias_data_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
     return RET_ERROR;
   }
-  memset(bias_data_, 0, matmul_param_->col_8_ * sizeof(float16_t));
+  memset(bias_data_, 0, size);  // bias stays all-zero when no bias input is supplied
   if (in_tensors_.size() == 3) {
-    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_[2]->Data()), reinterpret_cast<float16_t *>(bias_data_),
-                     conv_param_->output_channel_);
+    auto bias_tensor = in_tensors_.at(kBiasIndex);  // bias is optional: look it up only when present
+    if (bias_tensor->data_type() == kNumberTypeFloat16) {
+      memcpy(bias_data_, bias_tensor->Data(), output_channel * sizeof(float16_t));
+    } else {
+      Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->Data()), reinterpret_cast<float16_t *>(bias_data_),
+                       output_channel);
+    }
   }
 
-  weight_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float16_t)));
+  size = input_channel * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
+  weight_ptr_ = reinterpret_cast<float16_t *>(malloc(size));
   if (weight_ptr_ == nullptr) {
     MS_LOG(ERROR) << "Conv1x1 Malloc weight_ptr_ error!";
     return RET_ERROR;
   }
-  memset(weight_ptr_, 0, matmul_param_->deep_ * matmul_param_->col_8_ * sizeof(float16_t));
-  ColMajor2Row8MajorFp16(reinterpret_cast<float16_t *>(execute_weight_), weight_ptr_, matmul_param_->deep_,
-                         matmul_param_->col_);
+  memset(weight_ptr_, 0, size);  // lanes the repack below does not write stay zero
+  ColMajor2Row8MajorFp16(weight_tensor->Data(), weight_ptr_, input_channel, output_channel,
+                         weight_tensor->data_type() == kNumberTypeFloat16);
   return RET_OK;
 }
 
@@ -106,6 +111,13 @@ int Convolution1x1FP16CPUKernel::Init() {
   if (!InferShapeDone()) {
     return RET_OK;
   }
+
+  matmul_param_ = new (std::nothrow) MatMulParameter();
+  if (matmul_param_ == nullptr) {
+    MS_LOG(ERROR) << "Init matmul_param_ failed.";
+    return RET_ERROR;
+  }
+
   int ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index b2a43426b6..61133c0486 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -31,9 +31,7 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                               const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                               const mindspore::lite::PrimitiveC *primitive)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {
-    matmul_param_ = new MatMulParameter();
-  }
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~Convolution1x1FP16CPUKernel() override;
 
   int Init() override;
@@ -50,7 +48,7 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 
  private:
   bool pre_trans_input_ = false;
-  int thread_count_ = 0;
+  int thread_count_ = 1;
   int thread_stride_ = 0;
   float16_t *weight_ptr_ = nullptr;
   float16_t *input_ptr_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
index dcc110d068..fc264c1737 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
@@ -23,6 +23,14 @@
 #include "src/runtime/runtime_api.h"
 
 namespace mindspore::kernel {
+
+ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
+  // Release fp16_weight_; free() is a no-op on a null pointer, so no guard is needed.
+  // The member is reset afterwards so it does not keep a dangling address.
+  free(fp16_weight_);
+  fp16_weight_ = nullptr;
+}
+
 int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
   // ===================input====================//
   auto input_tensor = in_tensors_.at(kInputIndex);
@@ -65,6 +73,7 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() {
   } else {
     auto *origin_weight = reinterpret_cast<float16_t *>(in_tensors_.at(kWeightIndex)->Data());
     execute_weight_ = origin_weight;
+    fp16_weight_ = nullptr;
   }
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
index e507ab3d3e..5029c342c8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@@ -30,7 +30,7 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
                                const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                const mindspore::lite::PrimitiveC *primitive)
       : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
-  ~ConvolutionBaseFP16CPUKernel() override = default;
+  ~ConvolutionBaseFP16CPUKernel() override;
 
   int Init() override { return RET_OK; }
   int ReSize() override { return RET_OK; }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
index e4c1429b20..8a6f3baf05 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -244,8 +244,7 @@ kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::tensor::Ten
   if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
     kernel = new (std::nothrow) kernel::Convolution3x3FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
   } else if (kernel_h == 1 && kernel_w == 1) {
-    // kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
-    kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+    kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(opParameter, inputs, outputs, ctx, primitive);
   } else {
     bool use_winograd = false;
     int out_unit;