From: @ling_qiao_min Reviewed-by: @zhang_xue_tong Signed-off-by: @zhang_xue_tong tags/v1.2.0-rc1
| @@ -36,13 +36,9 @@ class ConcatFp16CPUKernel : public LiteKernel { | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| concat_param_ = reinterpret_cast<ConcatParameter *>(op_parameter_); | |||
| } | |||
| ~ConcatFp16CPUKernel() = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| @@ -207,18 +207,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) { | |||
| } | |||
| int Convolution1x1FP16CPUKernel::Run() { | |||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get executor tensor failed."; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| pack_input_ = reinterpret_cast<float16_t *>( | |||
| ctx_->allocator->Malloc(matmul_param_->row_16_ * matmul_param_->deep_ * sizeof(float16_t))); | |||
| if (pack_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 Malloc pack_input_ error!"; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| @@ -232,6 +226,7 @@ int Convolution1x1FP16CPUKernel::Run() { | |||
| input_ptr_ = batch_in; | |||
| } | |||
| int ret = RET_ERROR; | |||
| if (multi_thread_by_hw_) { | |||
| ret = ParallelLaunch(this->context_->thread_pool_, Convolution1x1Fp16RunHw, this, thread_count_); | |||
| } else { | |||
| @@ -240,16 +235,12 @@ int Convolution1x1FP16CPUKernel::Run() { | |||
| } | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "ParallelLaunch failed."; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| ctx_->allocator->Free(pack_input_); | |||
| pack_input_ = nullptr; | |||
| return ret; | |||
| } | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| ctx_->allocator->Free(pack_input_); | |||
| pack_input_ = nullptr; | |||
| return RET_OK; | |||
| @@ -33,19 +33,10 @@ ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() { | |||
| } | |||
| int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() { | |||
| // ===================input====================// | |||
| auto input_tensor = in_tensors_.at(kInputIndex); | |||
| in_data_type_ = input_tensor->data_type(); | |||
| MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16); | |||
| execute_input_ = ConvertInputFp32toFp16(input_tensor, context_); | |||
| // ==================output====================// | |||
| auto out_tensor = out_tensors_.at(kOutputIndex); | |||
| out_data_type_ = out_tensor->data_type(); | |||
| MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16); | |||
| execute_output_ = MallocOutputFp16(out_tensor, context_); | |||
| auto input_tensor = in_tensors_.at(0); | |||
| auto output_tensor = out_tensors_.at(0); | |||
| execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c()); | |||
| execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c()); | |||
| return RET_OK; | |||
| } | |||
| @@ -78,25 +69,4 @@ int ConvolutionBaseFP16CPUKernel::GetExecuteFilter() { | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void ConvolutionBaseFP16CPUKernel::IfCastOutput() { | |||
| if (out_data_type_ == kNumberTypeFloat32) { | |||
| auto out_tensor = out_tensors_.at(kOutputIndex); | |||
| auto out_ele_num = out_tensor->ElementsNum(); | |||
| auto output_addr = reinterpret_cast<float *>(out_tensor->MutableData()); | |||
| Float16ToFloat32(execute_output_, output_addr, out_ele_num); | |||
| } | |||
| } | |||
| void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() { | |||
| if (in_data_type_ == kNumberTypeFloat32) { | |||
| context_->allocator->Free(execute_input_); | |||
| execute_input_ = nullptr; | |||
| } | |||
| if (out_data_type_ == kNumberTypeFloat32) { | |||
| context_->allocator->Free(execute_output_); | |||
| execute_output_ = nullptr; | |||
| } | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -38,16 +38,12 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel { | |||
| int RunImpl(int task_id) { return mindspore::lite::RET_OK; } | |||
| virtual int GetExecuteTensor(); | |||
| virtual int GetExecuteFilter(); | |||
| virtual void IfCastOutput(); | |||
| void FreeTmpBuffer(); | |||
| protected: | |||
| float16_t *fp16_weight_ = nullptr; | |||
| float16_t *execute_input_ = nullptr; | |||
| float16_t *execute_weight_ = nullptr; | |||
| float16_t *execute_output_ = nullptr; | |||
| TypeId in_data_type_; | |||
| TypeId out_data_type_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -114,19 +114,13 @@ static int ConvDwFp16Run(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionDepthwiseFp16CPUKernel::Run() { | |||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_); | |||
| auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| @@ -149,13 +149,8 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { | |||
| return ret; | |||
| } | |||
| ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| FreePackedInputOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (need_align_) { | |||
| PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| @@ -172,8 +167,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { | |||
| PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreePackedInputOutput(); | |||
| return ret; | |||
| } | |||
| @@ -128,17 +128,11 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionFP16CPUKernel::Run() { | |||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| ret = InitTmpBuffer(); | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreeTmpBuffer(); | |||
| return RET_ERROR; | |||
| } | |||
| @@ -147,8 +141,7 @@ int ConvolutionFP16CPUKernel::Run() { | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "conv fp16 error ret[" << ret << "]"; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| @@ -195,17 +195,11 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) { | |||
| } | |||
| int ConvolutionWinogradFP16CPUKernel::Run() { | |||
| auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| ret = InitTmpBuffer(); | |||
| auto ret = InitTmpBuffer(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init tmp buffer failed."; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreeTmpBuffer(); | |||
| return RET_ERROR; | |||
| } | |||
| @@ -215,8 +209,6 @@ int ConvolutionWinogradFP16CPUKernel::Run() { | |||
| MS_LOG(ERROR) << "conv winograd error error_code[" << ret << "]"; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| @@ -162,13 +162,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Get Execute tensor failed."; | |||
| FreePackedInputOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return ret; | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::GetExecuteTensor(); | |||
| if (need_align_) { | |||
| PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_, | |||
| conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); | |||
| @@ -189,8 +184,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { | |||
| PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_, | |||
| conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_); | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreePackedInputOutput(); | |||
| return ret; | |||
| } | |||
| @@ -189,7 +189,6 @@ int DeConvolutionFp16CPUKernel::Run() { | |||
| int error_code = InitRunBuf(); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "deconv fp16 InitRunBuf error! error_code[" << error_code << "]"; | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreeRunBuf(); | |||
| return RET_ERROR; | |||
| } | |||
| @@ -206,8 +205,6 @@ int DeConvolutionFp16CPUKernel::Run() { | |||
| } | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| FreeRunBuf(); | |||
| return error_code; | |||
| } | |||
| @@ -405,9 +405,6 @@ int DeConvWinogradFp16CPUKernel::Run() { | |||
| ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp16Run, this, thread_num_hw_); | |||
| } | |||
| ConvolutionBaseFP16CPUKernel::IfCastOutput(); | |||
| ConvolutionBaseFP16CPUKernel::FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -33,9 +33,6 @@ using mindspore::schema::PrimitiveType_Scale; | |||
| namespace mindspore::kernel { | |||
| int ScaleFp16CPUKernel::InitScaleOffset() { | |||
| auto input_tensor = in_tensors_.at(0); | |||
| malloc_input_ = input_tensor->data_type() == kNumberTypeFloat32; | |||
| auto scale_tensor = in_tensors_.at(1); | |||
| malloc_scale_ = scale_tensor->data_type() == kNumberTypeFloat32; | |||
| @@ -45,9 +42,6 @@ int ScaleFp16CPUKernel::InitScaleOffset() { | |||
| auto offset_tensor = in_tensors_.at(2); | |||
| malloc_offset_ = offset_tensor->data_type() == kNumberTypeFloat32; | |||
| } | |||
| auto output_tensor = out_tensors_.at(0); | |||
| malloc_output_ = output_tensor->data_type() == kNumberTypeFloat32; | |||
| return RET_OK; | |||
| } | |||
| @@ -103,6 +97,11 @@ int ScaleFp16Run(void *cdata, int task_id) { | |||
| } | |||
| int ScaleFp16CPUKernel::Run() { | |||
| auto input_tensor = in_tensors_.at(0); | |||
| auto output_tensor = out_tensors_.at(0); | |||
| input_ = reinterpret_cast<float16_t *>(input_tensor->MutableData()); | |||
| output_ = reinterpret_cast<float16_t *>(output_tensor->MutableData()); | |||
| auto ret = InitScaleOffset(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Scale fp16 InitScaleOffset failed."; | |||
| @@ -123,20 +122,11 @@ int ScaleFp16CPUKernel::Run() { | |||
| return RET_ERROR; | |||
| } | |||
| // if output tensor is fp32, we need to transform | |||
| if (malloc_output_) { | |||
| auto out_tensor = out_tensors_.at(0); | |||
| Float16ToFloat32(output_, reinterpret_cast<float *>(out_tensor->MutableData()), out_tensor->ElementsNum()); | |||
| } | |||
| FreeTmpBuffer(); | |||
| return RET_OK; | |||
| } | |||
| int ScaleFp16CPUKernel::MallocAssignTmpBuffer() { | |||
| input_ = ConvertInputFp32toFp16(in_tensors_.at(0), context_); | |||
| if (input_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| scale_ = ConvertInputFp32toFp16(in_tensors_.at(1), context_); | |||
| if (scale_ == nullptr) { | |||
| return RET_ERROR; | |||
| @@ -155,18 +145,10 @@ int ScaleFp16CPUKernel::MallocAssignTmpBuffer() { | |||
| } | |||
| memset(offset_, 0, in_tensors_.at(1)->ElementsNum() * sizeof(float16_t)); | |||
| } | |||
| output_ = MallocOutputFp16(out_tensors_.at(0), context_); | |||
| if (output_ == nullptr) { | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void ScaleFp16CPUKernel::FreeTmpBuffer() { | |||
| if (malloc_input_ && input_ != nullptr) { | |||
| context_->allocator->Free(input_); | |||
| input_ = nullptr; | |||
| } | |||
| if (malloc_scale_ && scale_ != nullptr) { | |||
| context_->allocator->Free(scale_); | |||
| scale_ = nullptr; | |||
| @@ -175,10 +157,6 @@ void ScaleFp16CPUKernel::FreeTmpBuffer() { | |||
| context_->allocator->Free(offset_); | |||
| offset_ = nullptr; | |||
| } | |||
| if (malloc_output_ && output_ != nullptr) { | |||
| context_->allocator->Free(output_); | |||
| output_ = nullptr; | |||
| } | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Scale, LiteKernelCreator<ScaleFp16CPUKernel>) | |||
| @@ -43,10 +43,8 @@ class ScaleFp16CPUKernel : public ScaleCPUKernel { | |||
| void FreeTmpBuffer(); | |||
| private: | |||
| bool malloc_input_ = false; | |||
| bool malloc_scale_ = false; | |||
| bool malloc_offset_ = false; | |||
| bool malloc_output_ = false; | |||
| float16_t *input_ = nullptr; | |||
| float16_t *scale_ = nullptr; | |||
| @@ -29,7 +29,6 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Stack; | |||
| namespace mindspore::kernel { | |||
| int StackFp16CPUKernel::Init() { | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| @@ -27,9 +27,7 @@ class StackFp16CPUKernel : public StackCPUKernel { | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : StackCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~StackFp16CPUKernel() = default; | |||
| int Init() override; | |||
| int Run() override; | |||