From d573a1180d017367fef7511372452b5808f1d20c Mon Sep 17 00:00:00 2001 From: lixian Date: Fri, 9 Oct 2020 17:30:55 +0800 Subject: [PATCH] fix fp16 matmul bug --- mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S | 10 ---------- mindspore/lite/nnacl/fp16/conv_fp16.c | 4 ++-- mindspore/lite/nnacl/fp16/matmul_fp16.c | 8 ++++---- mindspore/lite/nnacl/fp16/matmul_fp16.h | 4 ++-- .../runtime/kernel/arm/fp16/convolution_1x1_fp16.cc | 5 +++-- .../src/runtime/kernel/arm/fp16/deconvolution_fp16.cc | 2 +- .../src/runtime/kernel/arm/fp16/fullconnection_fp16.cc | 2 +- .../lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc | 2 +- 8 files changed, 14 insertions(+), 23 deletions(-) diff --git a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S index 80ae772e8d..20285677fe 100644 --- a/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S +++ b/mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S @@ -1195,8 +1195,6 @@ LoopRow: st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x19], #64 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64 - st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64 - st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64 add x11, x11, x16 b WriteEnd WriteWino: @@ -1217,14 +1215,6 @@ LoopRow: st1 {v29.8h}, [x11], x15 st1 {v30.8h}, [x11], x15 st1 {v31.8h}, [x11], x15 - st1 {v24.8h}, [x11], x15 - st1 {v25.8h}, [x11], x15 - st1 {v26.8h}, [x11], x15 - st1 {v27.8h}, [x11], x15 - st1 {v28.8h}, [x11], x15 - st1 {v29.8h}, [x11], x15 - st1 {v30.8h}, [x11], x15 - st1 {v31.8h}, [x11], x15 b WriteEnd Write8: add x2, x2, #16 diff --git a/mindspore/lite/nnacl/fp16/conv_fp16.c b/mindspore/lite/nnacl/fp16/conv_fp16.c index 91eab4bb15..b48cb5c656 100644 --- a/mindspore/lite/nnacl/fp16/conv_fp16.c +++ b/mindspore/lite/nnacl/fp16/conv_fp16.c @@ -205,8 +205,8 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa float16_t *tmp_col_ptr = col_buffer + task_id * 
col_buffer_offset; for (int i = 0; i < input_unit_square; ++i) { RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel); - MatMul16x8(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel, - cal_num, oc8 * C8NUM, input_unit_square, false); + MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel, + cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8); } // step 4 : output transform diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.c b/mindspore/lite/nnacl/fp16/matmul_fp16.c index beb62bb043..85d7998b94 100644 --- a/mindspore/lite/nnacl/fp16/matmul_fp16.c +++ b/mindspore/lite/nnacl/fp16/matmul_fp16.c @@ -104,11 +104,11 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl } void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, - int depth, int row, int col, int stride, bool write_nhwc) { - if (!write_nhwc) { - MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc); + int depth, int row, int col, int stride, int out_type) { + if (out_type == OutType_C8) { + MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, false); } else { - MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, 1); + MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, out_type); } return; } diff --git a/mindspore/lite/nnacl/fp16/matmul_fp16.h b/mindspore/lite/nnacl/fp16/matmul_fp16.h index d7503fff61..306098096e 100644 --- a/mindspore/lite/nnacl/fp16/matmul_fp16.h +++ b/mindspore/lite/nnacl/fp16/matmul_fp16.h @@ -33,7 +33,7 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl int deep, int row, int col, int stride, bool write_nhwc); void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type, - 
int depth, int row, int col, int stride, bool write_nhwc); + int depth, int row, int col, int stride, int out_type); void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16); @@ -43,7 +43,7 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc); void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, - size_t depth, size_t row, size_t col, size_t stride, int write_nhwc); + size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc); void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc index 5189512009..0948bc4e63 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc @@ -171,7 +171,7 @@ int Convolution1x1FP16CPUKernel::RunOc(int task_id) { MatMulFp16(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_, output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_, - matmul_param_->row_, cur_oc, matmul_param_->col_, true); + matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc); return RET_OK; } @@ -189,7 +189,8 @@ int Convolution1x1FP16CPUKernel::RunHw(int task_id) { float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_; MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_), - matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true); + matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, + OutType_Nhwc); return RET_OK; } diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index c721e13a1d..81577e245d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -156,7 +156,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) { auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_; MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_, tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0, - false); + OutType_C8); DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_, reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM, execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc index 7831681152..0a53a76faf 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc @@ -137,7 +137,7 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) { auto bias = (bias_ptr_ == nullptr) ? 
nullptr : bias_ptr_ + thread_stride_ * task_id; auto c = output_ptr_ + task_id * thread_stride_; MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_, - true); + OutType_Nhwc); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc index 5ba2e0bae1..c2a5a0d235 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc @@ -193,7 +193,7 @@ int MatmulFP16CPUKernel::RunImpl(int task_id) { auto b = current_b_ + task_id * thread_stride_ * params_->deep_; auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id; auto c = current_c_ + task_id * thread_stride_; - MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true); + MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc); return RET_OK; }