From d573a1180d017367fef7511372452b5808f1d20c Mon Sep 17 00:00:00 2001
From: lixian <lixian16@huawei.com>
Date: Fri, 9 Oct 2020 17:30:55 +0800
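Subject: [PATCH] fix fp16 matmul bug

MatmulFp16Opt.S: drop the st1 stores of v24-v31 that were repeated
after the existing stores, in the write path ahead of the WriteWino
label and in the one ahead of the Write8 label.

MatMulFp16: replace the `bool write_nhwc` argument with `int out_type`.
OutType_C8 dispatches to MatmulFp16Neon64; any other value is forwarded
to MatmulFp16Neon64Opt instead of the previous hard-coded 1.

ConvWinogardFp16 now calls MatMulFp16 with OutType_TileC8 in place of
MatMul16x8, and the remaining callers pass OutType_Nhwc (was true) or
OutType_C8 (was false).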

---
 mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S     | 10 ----------
 mindspore/lite/nnacl/fp16/conv_fp16.c                  |  4 ++--
 mindspore/lite/nnacl/fp16/matmul_fp16.c                |  8 ++++----
 mindspore/lite/nnacl/fp16/matmul_fp16.h                |  4 ++--
 .../runtime/kernel/arm/fp16/convolution_1x1_fp16.cc    |  5 +++--
 .../src/runtime/kernel/arm/fp16/deconvolution_fp16.cc  |  2 +-
 .../src/runtime/kernel/arm/fp16/fullconnection_fp16.cc |  2 +-
 .../lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc    |  2 +-
 8 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
index 80ae772e8d..20285677fe 100644
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
@@ -1195,8 +1195,6 @@ LoopRow:
             st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x19], #64
             st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
             st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
-            st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
-            st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
             add x11, x11, x16
             b WriteEnd
         WriteWino:
@@ -1217,14 +1215,6 @@ LoopRow:
             st1 {v29.8h}, [x11], x15
             st1 {v30.8h}, [x11], x15
             st1 {v31.8h}, [x11], x15
-            st1 {v24.8h}, [x11], x15
-            st1 {v25.8h}, [x11], x15
-            st1 {v26.8h}, [x11], x15
-            st1 {v27.8h}, [x11], x15
-            st1 {v28.8h}, [x11], x15
-            st1 {v29.8h}, [x11], x15
-            st1 {v30.8h}, [x11], x15
-            st1 {v31.8h}, [x11], x15
             b WriteEnd
         Write8:
             add x2, x2, #16
diff --git a/mindspore/lite/nnacl/fp16/conv_fp16.c b/mindspore/lite/nnacl/fp16/conv_fp16.c
index 91eab4bb15..b48cb5c656 100644
--- a/mindspore/lite/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_fp16.c
@@ -205,8 +205,8 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
       float16_t *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
       for (int i = 0; i < input_unit_square; ++i) {
         RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
-        MatMul16x8(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
-                   cal_num, oc8 * C8NUM, input_unit_square, false);
+        MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
+                   cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
       }
 
       // step 4 : output transform
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c
index beb62bb043..85d7998b94 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -104,11 +104,11 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 }
 
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-                int depth, int row, int col, int stride, bool write_nhwc) {
-  if (!write_nhwc) {
-    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
+                int depth, int row, int col, int stride, int out_type) {
+  if (out_type == OutType_C8) {
+    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, false);
   } else {
-    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, 1);
+    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, out_type);
   }
   return;
 }
diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h
index d7503fff61..306098096e 100644
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -33,7 +33,7 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
                 int deep, int row, int col, int stride, bool write_nhwc);
 
 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-                int depth, int row, int col, int stride, bool write_nhwc);
+                int depth, int row, int col, int stride, int out_type);
 
 void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);
 
@@ -43,7 +43,7 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
                       size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);
 
 void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
-                         size_t depth, size_t row, size_t col, size_t stride, int write_nhwc);
+                         size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);
 
 void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 5189512009..0948bc4e63 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -171,7 +171,7 @@ int Convolution1x1FP16CPUKernel::RunOc(int task_id) {
 
   MatMulFp16(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
              output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
-             matmul_param_->row_, cur_oc, matmul_param_->col_, true);
+             matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);
 
   return RET_OK;
 }
@@ -189,7 +189,8 @@ int Convolution1x1FP16CPUKernel::RunHw(int task_id) {
 
   float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
   MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_),
-             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true);
+             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+             OutType_Nhwc);
 
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
index c721e13a1d..81577e245d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -156,7 +156,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
   auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
   MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
              tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
-             false);
+             OutType_C8);
   DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                  reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
                  execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
index 7831681152..0a53a76faf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
@@ -137,7 +137,7 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = output_ptr_ + task_id * thread_stride_;
   MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-             true);
+             OutType_Nhwc);
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
index 5ba2e0bae1..c2a5a0d235 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
@@ -193,7 +193,7 @@ int MatmulFP16CPUKernel::RunImpl(int task_id) {
   auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = current_c_ + task_id * thread_stride_;
-  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
+  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);
 
   return RET_OK;
 }