From 06b366fd77a8198bb09a7347734263ae5a49d754 Mon Sep 17 00:00:00 2001 From: ling Date: Thu, 14 Jan 2021 15:20:42 +0800 Subject: [PATCH] conv fp16 cast delete --- .../src/runtime/kernel/arm/fp16/concat_fp16.h | 4 -- .../kernel/arm/fp16/convolution_1x1_fp16.cc | 13 +------ .../kernel/arm/fp16/convolution_base_fp16.cc | 38 ++----------------- .../kernel/arm/fp16/convolution_base_fp16.h | 4 -- .../arm/fp16/convolution_depthwise_fp16.cc | 10 +---- .../convolution_depthwise_slidewindow_fp16.cc | 12 ++---- .../kernel/arm/fp16/convolution_fp16.cc | 13 ++----- .../arm/fp16/convolution_winograd_fp16.cc | 12 +----- .../arm/fp16/deconvolution_depthwise_fp16.cc | 12 ++---- .../kernel/arm/fp16/deconvolution_fp16.cc | 3 -- .../arm/fp16/deconvolution_winograd_fp16.cc | 3 -- .../src/runtime/kernel/arm/fp16/scale_fp16.cc | 32 +++------------- .../src/runtime/kernel/arm/fp16/scale_fp16.h | 2 - .../src/runtime/kernel/arm/fp16/stack_fp16.cc | 1 - .../src/runtime/kernel/arm/fp16/stack_fp16.h | 2 - 15 files changed, 24 insertions(+), 137 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h index 368646276c..d05374468a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/concat_fp16.h @@ -36,13 +36,9 @@ class ConcatFp16CPUKernel : public LiteKernel { : LiteKernel(parameter, inputs, outputs, ctx, primitive) { concat_param_ = reinterpret_cast(op_parameter_); } - ~ConcatFp16CPUKernel() = default; - int Init() override; - int ReSize() override; - int Run() override; private: diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc index 0f93bdb4f0..d16052dd8b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc @@ -207,18 +207,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) { } int Convolution1x1FP16CPUKernel::Run() { - auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Get executor tensor failed."; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - return ret; - } + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); pack_input_ = reinterpret_cast( ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t))); if (pack_input_ == nullptr) { MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!"; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); return RET_MEMORY_FAILED; } @@ -232,6 +226,7 @@ int Convolution1x1FP16CPUKernel::Run() { input_ptr_ = batch_in; } + int ret = RET_ERROR; if (multi_thread_by_hw_) { ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_); } else { @@ -240,16 +235,12 @@ int Convolution1x1FP16CPUKernel::Run() { } if (ret != RET_OK) { MS_LOG(ERROR) << "ParallelLaunch failed."; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); ctx_->allocator->Free(pack_input_); pack_input_ = nullptr; return ret; } } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - ctx_->allocator->Free(pack_input_); pack_input_ = nullptr; return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc index 435aa8d518..6212d5b8e5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc @@ -33,19 +33,10 @@ ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() { } int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { - // ===================input====================// - auto input_tensor = in_tensors_.at(kInputIndex); - in_data_type_ = input_tensor->data_type(); - MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16); - - execute_input_ = ConvertInputFp32toFp16(input_tensor, context_); - - // ==================output====================// - auto out_tensor = out_tensors_.at(kOutputIndex); - out_data_type_ = out_tensor->data_type(); - MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16); - - execute_output_ = MallocOutputFp16(out_tensor, context_); + auto input_tensor = in_tensors_.at(0); + auto output_tensor = out_tensors_.at(0); + execute_input_ = reinterpret_cast(input_tensor->data_c()); + execute_output_ = reinterpret_cast(output_tensor->data_c()); return RET_OK; } @@ -78,25 +69,4 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() { } return RET_OK; } - -void ConvolutionBaseFP16CPUKernel::IfCastOutput() { - if (out_data_type_ == kNumberTypeFloat32) { - auto out_tensor = out_tensors_.at(kOutputIndex); - auto out_ele_num = out_tensor->ElementsNum(); - auto output_addr = reinterpret_cast(out_tensor->MutableData()); - Float16ToFloat32(execute_output_, output_addr, out_ele_num); - } -} - -void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() { - if (in_data_type_ == kNumberTypeFloat32) { - context_->allocator->Free(execute_input_); - execute_input_ = nullptr; - } - if (out_data_type_ == kNumberTypeFloat32) { - context_->allocator->Free(execute_output_); - execute_output_ = nullptr; - } -} - } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h index 972795cd12..5805f92cec 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h @@ -38,16 +38,12 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { int RunImpl(int task_id) { return mindspore::lite::RET_OK; } virtual int GetExecuteTensor(); virtual int GetExecuteFilter(); - virtual void IfCastOutput(); - void FreeTmpBuffer(); protected: float16_t *fp16_weight_ = nullptr; float16_t *execute_input_ = nullptr; float16_t *execute_weight_ = nullptr; float16_t *execute_output_ = nullptr; - TypeId in_data_type_; - TypeId out_data_type_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 1605dc5c27..04fa5aa80a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -114,19 +114,13 @@ static int ConvDwFp16Run(void *cdata, int task_id) { } int ConvolutionDepthwiseFp16CPUKernel::Run() { - auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Get Execute tensor failed."; - return ret; - } + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_); + auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); return ret; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc index 07caca2f2a..4179749623 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc @@ -149,13 +149,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { return ret; } - ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Get Execute tensor failed."; - FreePackedInputOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - return ret; - } + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); + if (need_align_) { PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); @@ -172,8 +167,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); + FreePackedInputOutput(); return ret; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index b8f289c63f..cb03977635 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -128,17 +128,11 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) { } int ConvolutionFP16CPUKernel::Run() { - auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Get Execute tensor failed."; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - return ret; - } + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - ret = InitTmpBuffer(); + auto ret = InitTmpBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init tmp buffer failed."; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); FreeTmpBuffer(); return RET_ERROR; } @@ -147,8 +141,7 @@ int ConvolutionFP16CPUKernel::Run() { if (ret != RET_OK) { MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]"; } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); + FreeTmpBuffer(); return ret; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc index 5706c4c29c..30a6d6f133 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -195,17 +195,11 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) { } int ConvolutionWinogradFP16CPUKernel::Run() { - auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Get Execute tensor failed."; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - return ret; - } + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - ret = InitTmpBuffer(); + auto ret = InitTmpBuffer(); if (ret != RET_OK) { MS_LOG(ERROR) << "Init tmp buffer failed."; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); FreeTmpBuffer(); return RET_ERROR; } @@ -215,8 +209,6 @@ int ConvolutionWinogradFP16CPUKernel::Run() { MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]"; } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); FreeTmpBuffer(); return ret; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index b9f9fb1012..9607a9c4e2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -162,13 +162,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { return RET_ERROR; } - ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Get Execute tensor failed."; - FreePackedInputOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - return ret; - } + ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); + if (need_align_) { PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); @@ -189,8 +184,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); + FreePackedInputOutput(); return ret; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index 520d1885d9..c65d64f26b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -189,7 +189,6 @@ int DeConvolutionFp16CPUKernel::Run() { int error_code = InitRunBuf(); if (error_code != RET_OK) { MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]"; - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); FreeRunBuf(); return RET_ERROR; } @@ -206,8 +205,6 @@ int DeConvolutionFp16CPUKernel::Run() { } } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); FreeRunBuf(); return error_code; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc index 44e13c2376..dd8faa8f2d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc @@ -405,9 +405,6 @@ int DeConvWinogradFp16CPUKernel::Run() { ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp16Run, this, thread_num_hw_); } - ConvolutionBaseFP16CPUKernel::IfCastOutput(); - ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); - return RET_OK; } } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc index 74f4d6e01a..9f86d14155 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.cc @@ -33,9 +33,6 @@ using mindspore::schema::PrimitiveType_Scale; namespace mindspore::kernel { int ScaleFp16CPUKernel::InitScaleOffset() { - auto input_tensor = in_tensors_.at(0); - malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32; - auto scale_tensor = in_tensors_.at(1); malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32; @@ -45,9 +42,6 @@ int ScaleFp16CPUKernel::InitScaleOffset() { auto offset_tensor = in_tensors_.at(2); malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32; } - - auto output_tensor = out_tensors_.at(0); - malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32; return RET_OK; } @@ -103,6 +97,11 @@ int ScaleFp16Run(void *cdata, int task_id) { } int ScaleFp16CPUKernel::Run() { + auto input_tensor = in_tensors_.at(0); + auto output_tensor = out_tensors_.at(0); + input_ = reinterpret_cast(input_tensor->MutableData()); + output_ = reinterpret_cast(output_tensor->MutableData()); + auto ret = InitScaleOffset(); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed."; @@ -123,20 +122,11 @@ int ScaleFp16CPUKernel::Run() { return RET_ERROR; } - // if output tensor is fp32, we need to transform - if (malloc_output_) { - auto out_tensor = out_tensors_.at(0); - Float16ToFloat32(output_, reinterpret_cast(out_tensor->MutableData()), out_tensor->ElementsNum()); - } FreeTmpBuffer(); return RET_OK; } int ScaleFp16CPUKernel::MallocAssignTmpBuffer() { - input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_); - if (input_ == nullptr) { - return RET_ERROR; - } scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_); if (scale_ == nullptr) { return RET_ERROR; @@ -155,18 +145,10 @@ int ScaleFp16CPUKernel::MallocAssignTmpBuffer() { } memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t)); } - output_ = MallocOutputFp16(out_tensors_.at(0), context_); - if (output_ == nullptr) { - return RET_ERROR; - } return RET_OK; } void ScaleFp16CPUKernel::FreeTmpBuffer() { - if (malloc_input_ && input_ != nullptr) { - context_->allocator->Free(input_); - input_ = nullptr; - } if (malloc_scale_ && scale_ != nullptr) { context_->allocator->Free(scale_); scale_ = nullptr; @@ -175,10 +157,6 @@ void ScaleFp16CPUKernel::FreeTmpBuffer() { context_->allocator->Free(offset_); offset_ = nullptr; } - if (malloc_output_ && output_ != nullptr) { - context_->allocator->Free(output_); - output_ = nullptr; - } } REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, LiteKernelCreator) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h index 26da3846d0..a54b95c017 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/scale_fp16.h @@ -43,10 +43,8 @@ class ScaleFp16CPUKernel : public ScaleCPUKernel { void FreeTmpBuffer(); private: - bool malloc_input_ = false; bool malloc_scale_ = false; bool malloc_offset_ = false; - bool malloc_output_ = false; float16_t *input_ = nullptr; float16_t *scale_ = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc index 8f34b522a8..2429b5aa82 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.cc @@ -29,7 +29,6 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Stack; namespace mindspore::kernel { - int StackFp16CPUKernel::Init() { if (!InferShapeDone()) { return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h index a6a19332f0..776a87d240 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/stack_fp16.h @@ -27,9 +27,7 @@ class StackFp16CPUKernel : public StackCPUKernel { const std::vector &outputs, const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {} - ~StackFp16CPUKernel() = default; - int Init() override; int Run() override;