!7059 [MS][LITE][Develop]fix fp16 matmul kernel write bug

Merge pull request !7059 from lixian/master
5 years ago · beb8bf5d65
--- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
+++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S
@@ -1195,8 +1195,6 @@ LoopRow:
            st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x19], #64
            st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
            st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
            st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
            st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
            add x11, x11, x16
            b WriteEnd
        WriteWino:
@@ -1217,14 +1215,6 @@ LoopRow:
            st1 {v29.8h}, [x11], x15
            st1 {v30.8h}, [x11], x15
            st1 {v31.8h}, [x11], x15
            st1 {v24.8h}, [x11], x15
            st1 {v25.8h}, [x11], x15
            st1 {v26.8h}, [x11], x15
            st1 {v27.8h}, [x11], x15
            st1 {v28.8h}, [x11], x15
            st1 {v29.8h}, [x11], x15
            st1 {v30.8h}, [x11], x15
            st1 {v31.8h}, [x11], x15
            b WriteEnd
        Write8:
            add x2, x2, #16
--- a/mindspore/lite/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_fp16.c
@@ -205,8 +205,8 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
      float16_t *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
      for (int i = 0; i < input_unit_square; ++i) {
        RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
        MatMul16x8(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
                   cal_num, oc8 * C8NUM, input_unit_square, false);
        MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
                   cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
      }

      // step 4 : output transform
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.c
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c
@@ -104,11 +104,11 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 }

 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                int depth, int row, int col, int stride, bool write_nhwc) {
  if (!write_nhwc) {
    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
                int depth, int row, int col, int stride, int out_type) {
  if (out_type == OutType_C8) {
    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, false);
  } else {
    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, 1);
    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, out_type);
  }
  return;
 }
--- a/mindspore/lite/nnacl/fp16/matmul_fp16.h
+++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h
@@ -33,7 +33,7 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
                int deep, int row, int col, int stride, bool write_nhwc);

 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
                int depth, int row, int col, int stride, bool write_nhwc);
                int depth, int row, int col, int stride, int out_type);

 void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);

@@ -43,7 +43,7 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
                      size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);

 void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
                         size_t depth, size_t row, size_t col, size_t stride, int write_nhwc);
                         size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);

 void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -171,7 +171,7 @@ int Convolution1x1FP16CPUKernel::RunOc(int task_id) {

  MatMulFp16(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
             output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
             matmul_param_->row_, cur_oc, matmul_param_->col_, true);
             matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);

  return RET_OK;
 }
@@ -189,7 +189,8 @@ int Convolution1x1FP16CPUKernel::RunHw(int task_id) {

  float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
  MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_),
             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true);
             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
             OutType_Nhwc);

  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -156,7 +156,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
  auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
  MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
             tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
             false);
             OutType_C8);
  DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                 reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
                 execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc
@@ -137,7 +137,7 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
  auto c = output_ptr_ + task_id * thread_stride_;
  MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
             true);
             OutType_Nhwc);
  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc
@@ -193,7 +193,7 @@ int MatmulFP16CPUKernel::RunImpl(int task_id) {
  auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
  auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
  auto c = current_c_ + task_id * thread_stride_;
  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);

  return RET_OK;
 }