
!7059 [MS][LITE][Develop]fix fp16 matmul kernel write bug

Merge pull request !7059 from lixian/master
tags/v1.1.0
mindspore-ci-bot · 5 years ago
commit beb8bf5d65
8 changed files with 14 additions and 23 deletions

1. mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S (+0, -10)
2. mindspore/lite/nnacl/fp16/conv_fp16.c (+2, -2)
3. mindspore/lite/nnacl/fp16/matmul_fp16.c (+4, -4)
4. mindspore/lite/nnacl/fp16/matmul_fp16.h (+2, -2)
5. mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc (+3, -2)
6. mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc (+1, -1)
7. mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc (+1, -1)
8. mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc (+1, -1)

mindspore/lite/nnacl/assembly/fp16/MatmulFp16Opt.S (+0, -10)

@@ -1195,8 +1195,6 @@ LoopRow:
     st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x19], #64
     st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
     st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
-    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x19], #64
-    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x19], #64
     add x11, x11, x16
     b WriteEnd
 WriteWino:
@@ -1217,14 +1215,6 @@ LoopRow:
     st1 {v29.8h}, [x11], x15
     st1 {v30.8h}, [x11], x15
     st1 {v31.8h}, [x11], x15
-    st1 {v24.8h}, [x11], x15
-    st1 {v25.8h}, [x11], x15
-    st1 {v26.8h}, [x11], x15
-    st1 {v27.8h}, [x11], x15
-    st1 {v28.8h}, [x11], x15
-    st1 {v29.8h}, [x11], x15
-    st1 {v30.8h}, [x11], x15
-    st1 {v31.8h}, [x11], x15
     b WriteEnd
 Write8:
     add x2, x2, #16
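The removed st1 instructions duplicate the store groups immediately above them, so this write path stored the trailing output vectors twice and, because each st1 post-increments its address register (#64 on x19, x15 on x11), advanced the destination pointer twice as far, presumably clobbering memory past the intended output block; that matches the "write bug" named in the title. A rough C analogue of the failure mode, for illustration only (buffer shape and names are made up, not MindSpore code):

/* Illustrative analogue only: a duplicated post-incrementing store advances
 * the destination pointer twice per row, so later rows land outside the
 * region the caller allocated. */
#include <stdio.h>
#include <string.h>

enum { ROWS = 4, COLS = 8 };

int main(void) {
  float dst[ROWS * COLS];
  float row[COLS] = {0};
  float *p = dst;
  for (int r = 0; r < ROWS; ++r) {
    memcpy(p, row, sizeof(row));
    p += COLS; /* correct: one store, one advance per row */
    /* the buggy kernel effectively repeated the store and the advance here,
     * pushing p past dst + ROWS * COLS on the final iterations */
  }
  printf("wrote %td of %d elements\n", p - dst, ROWS * COLS);
  return 0;
}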


mindspore/lite/nnacl/fp16/conv_fp16.c (+2, -2)

@@ -205,8 +205,8 @@ void ConvWinogardFp16(float16_t *input_data, float16_t *trans_weight, const floa
     float16_t *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
     for (int i = 0; i < input_unit_square; ++i) {
       RowMajor2Col16MajorFp16Opt(src_ptr + i * tile_num * in_channel, tmp_col_ptr, tile_num, in_channel);
-      MatMul16x8(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
-                 cal_num, oc8 * C8NUM, input_unit_square, false);
+      MatMulFp16(tmp_col_ptr, trans_weight + i * in_channel * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, in_channel,
+                 cal_num, oc8 * C8NUM, input_unit_square, OutType_TileC8);
     }

     // step 4 : output transform


mindspore/lite/nnacl/fp16/matmul_fp16.c (+4, -4)

@@ -104,11 +104,11 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
 }

 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-                int depth, int row, int col, int stride, bool write_nhwc) {
-  if (!write_nhwc) {
-    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, write_nhwc);
+                int depth, int row, int col, int stride, int out_type) {
+  if (out_type == OutType_C8) {
+    MatmulFp16Neon64(a, b, c, bias, (int)act_type, depth, row, col, stride, false);
   } else {
-    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, 1);
+    MatmulFp16Neon64Opt(a, b, c, bias, (int)act_type, depth, row, col, stride, out_type);
   }
   return;
 }


mindspore/lite/nnacl/fp16/matmul_fp16.h (+2, -2)

@@ -33,7 +33,7 @@ void MatMul16x8(const float16_t *a, const float16_t *b, float16_t *dst, const fl
                 int deep, int row, int col, int stride, bool write_nhwc);

 void MatMulFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, ActType act_type,
-                int depth, int row, int col, int stride, bool write_nhwc);
+                int depth, int row, int col, int stride, int out_type);

 void ColMajor2Row8MajorFp16(void *src_ptr, float16_t *dst_ptr, size_t row, size_t col, bool src_float16);

@@ -43,7 +43,7 @@ void MatmulFp16Neon64(const float16_t *a, const float16_t *b, float16_t *c, cons
                       size_t depth, size_t row, size_t col, size_t stride, bool write_nhwc);

 void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type,
-                         size_t depth, size_t row, size_t col, size_t stride, int write_nhwc);
+                         size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc);

 void RowMajor2Col16MajorFp16(void *src, float16_t *dst, int row, int col, bool is_fp32_src);
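The header change above, together with the call-site diffs that follow, replaces the boolean write_nhwc argument of MatMulFp16 with an integer output-layout selector: the 1x1 convolution, fully connected and matmul kernels pass OutType_Nhwc where they previously passed true, the deconvolution kernel passes OutType_C8 where it passed false, and the Winograd convolution passes OutType_TileC8. The enum definition is not part of this pull request; the sketch below assumes plausible enumerator values and a hypothetical call, purely to show the shape of the migration.

/* Assumed selector shape; the real OutType enum is declared elsewhere in nnacl
 * and its numeric values are not visible in this pull request. */
typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType;

/* Hypothetical call site, before and after this change:
 *   old: MatMulFp16(a, b, c, bias, act, deep, row, col, stride, true);
 *   new: MatMulFp16(a, b, c, bias, act, deep, row, col, stride, OutType_Nhwc);
 * Inside MatMulFp16, OutType_C8 keeps the MatmulFp16Neon64 path and every other
 * value is forwarded to MatmulFp16Neon64Opt as its last argument. */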




mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc (+3, -2)

@@ -171,7 +171,7 @@ int Convolution1x1FP16CPUKernel::RunOc(int task_id) {

   MatMulFp16(pack_input_, weight_ptr_ + task_id * thread_stride_ * matmul_param_->deep_,
              output_ptr_ + task_id * thread_stride_, bias, matmul_param_->act_type_, matmul_param_->deep_,
-             matmul_param_->row_, cur_oc, matmul_param_->col_, true);
+             matmul_param_->row_, cur_oc, matmul_param_->col_, OutType_Nhwc);

   return RET_OK;
 }
@@ -189,7 +189,8 @@ int Convolution1x1FP16CPUKernel::RunHw(int task_id) {

   float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
   MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_),
-             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true);
+             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_,
+             OutType_Nhwc);

   return RET_OK;
 }


mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc (+1, -1)

@@ -156,7 +156,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
   auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
   MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
              tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
-             false);
+             OutType_C8);
   DeConvPostFp16(tmp_buf, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
                  reinterpret_cast<float16_t *>(bias_data_) + task_id * thread_stride_ * C8NUM,
                  execute_output_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);


mindspore/lite/src/runtime/kernel/arm/fp16/fullconnection_fp16.cc (+1, -1)

@@ -137,7 +137,7 @@ int FullconnectionFP16CPUKernel::RunImpl(int task_id) {
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = output_ptr_ + task_id * thread_stride_;
   MatMulFp16(a_pack_ptr_, b, c, bias, fc_param_->act_type_, fc_param_->deep_, fc_param_->row_, cur_oc, fc_param_->col_,
-             true);
+             OutType_Nhwc);
   return RET_OK;
 }




mindspore/lite/src/runtime/kernel/arm/fp16/matmul_fp16.cc (+1, -1)

@@ -193,7 +193,7 @@ int MatmulFP16CPUKernel::RunImpl(int task_id) {
   auto b = current_b_ + task_id * thread_stride_ * params_->deep_;
   auto bias = (bias_ptr_ == nullptr) ? nullptr : bias_ptr_ + thread_stride_ * task_id;
   auto c = current_c_ + task_id * thread_stride_;
-  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, true);
+  MatMulFp16(current_a_, b, c, bias, ActType_No, params_->deep_, params_->row_, cur_oc, params_->col_, OutType_Nhwc);

   return RET_OK;
 }

