From d50b324456732d1e0424bb330618ddf4bff69cd7 Mon Sep 17 00:00:00 2001
From: ling
Date: Fri, 25 Sep 2020 09:40:55 +0800
Subject: [PATCH] [MSLITE][Develop] fp16 conv1x1 parallel by hw
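
Conv1x1 fp16 previously parallelized only along the output-channel (oc)
axis in blocks of C8NUM, which can leave threads idle when a layer has
few output channels but a large spatial size. Add a second scheme that
splits along the output H*W (row) axis in blocks of C16NUM, the row
tile of the fp16 matmul. InitConv1x1Param() picks the hw split when
matmul row_ exceeds C16NUM * thread_num_ and row_ > col_; each hw
worker packs its own input slice with RowMajor2Col16MajorFp16Opt()
before calling MatMulFp16(), so input packing is parallelized as well.
The fp16-only Conv1x1InputPackFp16() is dropped in favour of the shared
Conv1x1InputPack() with sizeof(float16_t).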
---
 mindspore/lite/nnacl/fp16/pack_fp16.c        | 21 -----
 mindspore/lite/nnacl/fp16/pack_fp16.h        |  2 -
 .../kernel/arm/fp16/convolution_1x1_fp16.cc  | 79 +++++++++++++------
 .../kernel/arm/fp16/convolution_1x1_fp16.h   |  7 +-
 4 files changed, 59 insertions(+), 50 deletions(-)
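
Note (not part of the patch): a minimal, self-contained sketch of the
thread partitioning arithmetic added in InitConv1x1Param(). The names
row/col/thread_num stand in for matmul_param_->row_, matmul_param_->col_
and op_parameter_->thread_num_, the example values are made up, and
nnacl's MSMIN/UP_DIV macros are replaced by equivalents:

    #include <algorithm>
    #include <cstdio>

    constexpr int C8NUM = 8;    // oc tile of the fp16 matmul kernel
    constexpr int C16NUM = 16;  // row (hw) tile of the fp16 matmul kernel

    // mirrors nnacl's UP_DIV macro: ceiling division
    inline int UP_DIV(int x, int y) { return (x + y - 1) / y; }

    int main() {
      int row = 1024, col = 16, thread_num = 4;  // example: large hw, few oc
      bool by_hw = (row > C16NUM * thread_num) && (row > col);
      int count, stride;
      if (by_hw) {  // split output rows (h * w) in C16NUM blocks
        count = std::min(thread_num, UP_DIV(row, C16NUM));
        stride = UP_DIV(UP_DIV(row, C16NUM), count) * C16NUM;
      } else {      // split output channels in C8NUM blocks
        count = std::min(thread_num, UP_DIV(col, C8NUM));
        stride = UP_DIV(UP_DIV(col, C8NUM), count) * C8NUM;
      }
      // task_id t then covers min(stride, total - t * stride) units,
      // exactly as RunHw()/RunOc() compute their cur_hw_/cur_oc.
      // Here: by_hw=1, count=4, stride=256 rows per thread.
      printf("by_hw=%d count=%d stride=%d\n", by_hw, count, stride);
      return 0;
    }

With these shapes the oc split could keep only min(4, UP_DIV(16, 8)) = 2
threads busy, while the hw split gives all 4 threads 256 rows each.
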
diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.c b/mindspore/lite/nnacl/fp16/pack_fp16.c
index f0f965d5bf..b08f258415 100644
--- a/mindspore/lite/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.c
@@ -18,27 +18,6 @@
 #include <string.h>
 #include <stdlib.h>
 
-void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) {
-  /* support nhwc */
-  for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
-    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
-    if (src_h < 0 || src_h >= conv_param->input_h_) {
-      continue;
-    }
-    const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_;
-    float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_;
-    for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
-      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
-      if (src_w < 0 || src_w >= conv_param->input_w_) {
-        continue;
-      }
-      memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_,
-             conv_param->input_channel_ * sizeof(float16_t));
-    }
-  }
-  return;
-}
-
 void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float16_t *packed_input, int real_cal_num,
                         int block_index) {
   // input format : nhwc
diff --git a/mindspore/lite/nnacl/fp16/pack_fp16.h b/mindspore/lite/nnacl/fp16/pack_fp16.h
index 8569a2b338..759a0e04a7 100644
--- a/mindspore/lite/nnacl/fp16/pack_fp16.h
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.h
@@ -26,8 +26,6 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param);
-
 void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float16_t *packed_input, int real_cal_num,
                         int block_index);
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 192924ab40..5189512009 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -59,8 +59,15 @@ int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
   pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                       conv_param_->stride_w_ != 1);
 
-  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
-  thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM;
+  if ((matmul_param_->row_ > (C16NUM * op_parameter_->thread_num_)) && (matmul_param_->row_ > matmul_param_->col_)) {
+    multi_thread_by_hw_ = true;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->row_, C16NUM));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->row_, C16NUM), thread_count_) * C16NUM;
+  } else {
+    multi_thread_by_hw_ = false;
+    thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
+    thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C8NUM), thread_count_) * C8NUM;
+  }
 
   if (pre_trans_input_) {
     input_ptr_ = reinterpret_cast<float16_t *>(malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(float16_t)));
@@ -153,19 +160,7 @@ int Convolution1x1FP16CPUKernel::ReSize() {
   return RET_OK;
 }
 
-void Convolution1x1FP16CPUKernel::Pre1x1Trans(float16_t *src_input, float16_t *src_output) {
-  output_ptr_ = src_output;
-  if (pre_trans_input_) {
-    Conv1x1InputPackFp16(src_input, input_ptr_, conv_param_);
-  } else {
-    input_ptr_ = src_input;
-  }
-
-  RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
-  return;
-}
-
-int Convolution1x1FP16CPUKernel::RunImpl(int task_id) {
+int Convolution1x1FP16CPUKernel::RunOc(int task_id) {
   int cur_stride = matmul_param_->col_ - task_id * thread_stride_;
   int cur_oc = MSMIN(thread_stride_, cur_stride);
   if (cur_oc <= 0) {
@@ -181,9 +176,27 @@ int Convolution1x1FP16CPUKernel::RunImpl(int task_id) {
   return RET_OK;
 }
 
-static int Convolution1x1Fp16Impl(void *cdata, int task_id) {
+int Convolution1x1FP16CPUKernel::RunHw(int task_id) {
+  int res_stride = matmul_param_->row_ - task_id * thread_stride_;
+  int cur_hw_ = MSMIN(thread_stride_, res_stride);
+  if (cur_hw_ <= 0) {
+    return RET_OK;
+  }
+
+  float16_t *thread_input_ptr = input_ptr_ + task_id * thread_stride_ * matmul_param_->deep_;
+  float16_t *thread_pack_input = pack_input_ + task_id * thread_stride_ * matmul_param_->deep_;
+  RowMajor2Col16MajorFp16Opt(thread_input_ptr, thread_pack_input, cur_hw_, matmul_param_->deep_);
+
+  float16_t *thread_output_ptr = output_ptr_ + task_id * thread_stride_ * matmul_param_->col_;
+  MatMulFp16(thread_pack_input, weight_ptr_, thread_output_ptr, reinterpret_cast<float16_t *>(bias_data_),
+             matmul_param_->act_type_, matmul_param_->deep_, cur_hw_, matmul_param_->col_, matmul_param_->col_, true);
+
+  return RET_OK;
+}
+
+static int Convolution1x1Fp16RunOc(void *cdata, int task_id) {
   auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
-  auto error_code = conv->RunImpl(task_id);
+  auto error_code = conv->RunOc(task_id);
   if (error_code != RET_OK) {
     MS_LOG(ERROR) << "Convolution1x1 Fp16 Run error task_id[" << task_id << "] error_code[" << error_code << "]";
     return RET_ERROR;
@@ -191,6 +204,16 @@ static int Convolution1x1Fp16Impl(void *cdata, int task_id) {
   return RET_OK;
 }
 
+static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
+  auto conv = reinterpret_cast<Convolution1x1FP16CPUKernel *>(cdata);
+  auto error_code = conv->RunHw(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Convolution1x1 Fp16 Run hw error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 int Convolution1x1FP16CPUKernel::Run() {
   auto ret = Prepare();
   if (ret != RET_OK) {
@@ -212,14 +235,20 @@ int Convolution1x1FP16CPUKernel::Run() {
   }
 
   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-    Pre1x1Trans(
-      execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_,
-      execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_);
-
-    int error_code = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16Impl, this, thread_count_);
-    if (error_code != RET_OK) {
-      MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]";
-      return RET_ERROR;
+    output_ptr_ = execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_;
+    float16_t *batch_in =
+      execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
+    if (pre_trans_input_) {
+      Conv1x1InputPack(batch_in, input_ptr_, conv_param_, sizeof(float16_t));
+    } else {
+      input_ptr_ = batch_in;
+    }
+
+    if (multi_thread_by_hw_) {
+      ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_);
+    } else {
+      RowMajor2Col16MajorFp16Opt(input_ptr_, pack_input_, matmul_param_->row_, matmul_param_->deep_);
+      ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunOc, this, thread_count_);
     }
   }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index a7f9b4c627..318fa37e59 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -37,17 +37,20 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int Init() override;
   int ReSize() override;
   int Run() override;
-  int RunImpl(int task_id);
+
+ public:
+  int RunOc(int task_id);
+  int RunHw(int task_id);
 
  private:
   void FreeTmpBuffer();
   int InitConv1x1Param();
   int InitMatmulParam();
   int InitWeightBias();
-  void Pre1x1Trans(float16_t *src_input, float16_t *src_output);
 
  private:
   bool pre_trans_input_ = false;
+  bool multi_thread_by_hw_ = false;
   int thread_count_ = 1;
   int thread_stride_ = 0;
   float16_t *weight_ptr_ = nullptr;