From 1b89036bea2faaca697312668e362469e5963e8c Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Sun, 16 Aug 2020 15:29:27 +0800 Subject: [PATCH] [MS][LITE] optimize arm cpu fp16 conv op: add common converter functions for input and output --- .../runtime/kernel/arm/fp16/common_fp16.cc | 46 +++++++++++++++++++ .../src/runtime/kernel/arm/fp16/common_fp16.h | 28 +++++++++++ .../kernel/arm/fp16/convolution_1x1_fp16.cc | 40 +--------------- .../kernel/arm/fp16/convolution_1x1_fp16.h | 7 --- .../kernel/arm/fp16/convolution_3x3_fp16.cc | 25 +--------- .../kernel/arm/fp16/convolution_3x3_fp16.h | 6 --- .../kernel/arm/fp16/convolution_base_fp16.cc | 41 ++++++++--------- .../kernel/arm/fp16/convolution_base_fp16.h | 8 ++-- .../arm/fp16/convolution_depthwise_fp16.cc | 27 ++++------- .../arm/fp16/convolution_depthwise_fp16.h | 6 +-- .../kernel/arm/fp16/convolution_fp16.cc | 24 +--------- .../kernel/arm/fp16/convolution_fp16.h | 6 --- .../kernel/arm/fp16/convolution_sw_fp16.cc | 24 +--------- .../kernel/arm/fp16/convolution_sw_fp16.h | 6 --- .../arm/fp16/convolution_winograd_fp16.cc | 24 +--------- .../arm/fp16/convolution_winograd_fp16.h | 6 --- .../arm/fp16/deconvolution_depthwise_fp16.cc | 18 +++++--- .../arm/fp16/deconvolution_depthwise_fp16.h | 6 +-- .../runtime/kernel/arm/nnacl/fp16/pack_fp16.c | 4 +- 19 files changed, 130 insertions(+), 222 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc new file mode 100644 index 0000000000..87f5446040 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+
+namespace mindspore::kernel {
+float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx) {
+  float16_t *fp16_data = nullptr;
+  auto data_type = input->data_type();
+  if (data_type == kNumberTypeFloat32) {
+    auto ele_num = input->ElementsNum();
+    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
+    auto ori_data = reinterpret_cast<float *>(input->Data());
+    Float32ToFloat16(ori_data, fp16_data, ele_num);
+  } else {
+    fp16_data = reinterpret_cast<float16_t *>(input->Data());
+  }
+  return fp16_data;
+}
+
+float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx) {
+  float16_t *fp16_data = nullptr;
+  auto data_type = output->data_type();
+  if (data_type == kNumberTypeFloat32) {
+    auto ele_num = output->ElementsNum();
+    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
+  } else {
+    fp16_data = reinterpret_cast<float16_t *>(output->Data());
+  }
+  return fp16_data;
+}
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
new file mode 100644
index 0000000000..9a177d0924
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
+
+#include "src/lite_kernel.h"
+
+namespace mindspore::kernel {
+float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx);
+
+float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx);
+
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 5d4e361f64..e0e00f5bcf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -98,28 +98,6 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
-int Convolution1x1FP16CPUKernel::InitBuffer() {
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(fp16_input_, 0, fp16_input_size);
-
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int Convolution1x1FP16CPUKernel::Init() {
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -136,11 +114,6 @@ int Convolution1x1FP16CPUKernel::Init() {
     MS_LOG(ERROR) << "Init conv1x1 param failed.";
     return ret;
   }
-  ret = InitBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init buffer failed.";
-    return ret;
-  }
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
@@ -150,12 +123,6 @@ int Convolution1x1FP16CPUKernel::Init() {
 }
 
 int Convolution1x1FP16CPUKernel::ReSize() {
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
   if (fp16_weight_ != nullptr) {
     free(fp16_weight_);
   }
@@ -181,12 +148,6 @@ int Convolution1x1FP16CPUKernel::ReSize() {
     MS_LOG(ERROR) << "Init conv1x1 param failed.";
     return ret;
   }
-  ret = InitBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init buffer failed.";
-    return ret;
-  }
-
   return RET_OK;
 }
@@ -253,6 +214,7 @@ int Convolution1x1FP16CPUKernel::Run() {
   }
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index d88e79755f..919fe41f36 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -35,15 +35,9 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
     matmul_param_ = new MatMulParameter();
   }
   ~Convolution1x1FP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
      free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (input_ptr_ != nullptr) {
       free(input_ptr_);
     }
@@ -57,7 +51,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int ReSize() override;
   int Run() override;
   int RunImpl(int task_id);
-  int InitBuffer();
   int InitConv1x1Param();
   int InitMatmulParam();
   int InitWeightBias();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
index fa129b3c81..b111624431 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -132,16 +132,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   }
   memset(tmp_out_, 0, tmp_out_size);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(fp16_input_, 0, fp16_input_size);
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -152,14 +142,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -207,12 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
   if (tmp_out_ != nullptr) {
     free(tmp_out_);
   }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
@@ -304,6 +280,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   }
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
index e0e7e516ad..d61ec538be 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
@@ -31,15 +31,9 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                               const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~Convolution3x3FP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (transformed_filter_addr_ != nullptr) {
       free(transformed_filter_addr_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
index a56c4d06f4..5d44fbb003 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
@@ -16,6 +16,7 @@
 
 #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
@@ -25,28 +26,17 @@ namespace mindspore::kernel {
 int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
   // ===================input====================//
   auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_data_type = input_tensor->data_type();
-  MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16);
-  if (input_data_type == kNumberTypeFloat32) {
-    auto input_ele_num = input_tensor->ElementsNum();
-    auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
-    Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num);
-    execute_input_ = fp16_input_;
-  } else {
-    auto ori_input_data = reinterpret_cast<float16_t *>(input_tensor->Data());
-    execute_input_ = ori_input_data;
-  }
+  in_data_type_ = input_tensor->data_type();
+  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
+
+  execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);
+
   // ==================output====================//
   auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto out_data_type = out_tensor->data_type();
-  MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16);
-  out_data_type_ = out_data_type;
-  if (out_data_type == kNumberTypeFloat32) {
-    execute_output_ = fp16_out_;
-  } else {
-    auto out_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
-    execute_output_ = out_ptr;
-  }
+  out_data_type_ = out_tensor->data_type();
+  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
+
+  execute_output_ = MallocOutputFp16(out_tensor, context_);
   return RET_OK;
 }
 
@@ -79,7 +69,16 @@ void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
     auto out_tensor = out_tensors_.at(kOutputIndex);
     auto out_ele_num = out_tensor->ElementsNum();
     auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
-    Float16ToFloat32(fp16_out_, output_addr, out_ele_num);
+    Float16ToFloat32(execute_output_, output_addr, out_ele_num);
+  }
+}
+
+void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
+  if (in_data_type_ == kNumberTypeFloat32) {
+    context_->allocator->Free(execute_input_);
+  }
+  if (out_data_type_ == kNumberTypeFloat32) {
+    context_->allocator->Free(execute_output_);
   }
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
index c4845a762c..e8d0f1eb81 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@@ -39,14 +39,14 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
   virtual int GetExecuteTensor();
   virtual int GetExecuteFilter();
   virtual void IfCastOutput();
+  void FreeTmpBuffer();
 
  protected:
-  float16_t *fp16_input_ = nullptr;
   float16_t *fp16_weight_ = nullptr;
-  float16_t *fp16_out_ = nullptr;
-  float16_t *execute_input_;
+  float16_t *execute_input_;   // allocated and freed via the ctx allocator when the input tensor is fp32
   float16_t *execute_weight_;
-  float16_t *execute_output_;
+  float16_t *execute_output_;  // allocated and freed via the ctx allocator when the output tensor is fp32
+  TypeId in_data_type_;
   TypeId out_data_type_;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index ed38b6b82e..b247e32e38 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -173,22 +173,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  float16_t *input_addr;
-  if (input_tensor->data_type() == kNumberTypeFloat32) {
-    input_addr =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t)));
-    if (input_addr == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
-    }
-    Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum());
-  } else {
-    input_addr = reinterpret_cast<float16_t *>(input_tensor->Data());
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get execute tensor failed.";
+    return ret;
   }
 
-  // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
+  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                       conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
@@ -197,13 +188,11 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_,
+  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                       conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
 
-  if (input_tensor->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_addr);
-  }
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index 6003efaf5f..00827510f3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
 #endif
 
 namespace mindspore::kernel {
-class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  public:
   ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                     const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                     const lite::Primitive *primitive)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
index 7cb95f8165..a3ea280767 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -103,15 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   }
   memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t));
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size =
-    in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -129,14 +120,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size =
-    out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -181,12 +164,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
 
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -242,6 +219,7 @@ int ConvolutionFP16CPUKernel::Run() {
     return RET_ERROR;
   }
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
index ad53277c88..0a4364235f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
@@ -30,15 +30,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                            const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (packed_input_ != nullptr) {
       free(packed_input_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
index 465ab99e20..516f001f92 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
@@ -106,15 +106,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
   int channel_block = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(out_channel, C4NUM);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size =
-    in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -133,14 +124,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size =
-    out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -186,12 +169,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
   delete slidingWindow_param_;
 
   auto ret = ConvolutionBaseCPUKernel::Init();
@@ -258,6 +235,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
                         conv_param_->output_channel_);
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
index ce9aa0b674..aa81475fd0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
@@ -29,15 +29,9 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                              const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionSWFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (packed_weight_ != nullptr) {
       free(packed_weight_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index a3866c14e3..495cc10319 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -187,15 +187,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int ic4 = UP_DIV(channel_in, C4NUM);
   int oc8 = UP_DIV(channel_out, C8NUM);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================trans_input_============================*/
   size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t);
   trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
@@ -222,14 +213,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
 
   /*=============================tmp_data_============================*/
   tmp_data_ =
@@ -327,12 +310,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
 
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -412,6 +389,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
     // do nothing
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index c5d2f0e3a0..8baacb5b53 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -34,15 +34,9 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                                    const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionWinogradFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (tmp_data_ != nullptr) {
       free(tmp_data_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
index 366232034d..a00a82aefb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -185,11 +185,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get execute tensor failed.";
+    return ret;
+  }
   // pack input: to nhwc8
-  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
-                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
@@ -197,9 +200,10 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
-                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
index b7d473490b..79fd014543 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ComputeStrides(int *shape, int *strides, int ndim);
 #endif
 
 namespace mindspore::kernel {
-class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 public:
   DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                       const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                       const lite::Primitive *primitive)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~DeconvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c
index dd84dcef22..b3872f6cc0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c
@@ -392,7 +392,7 @@ void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, i
     for (int i = 0; i < plane; i++) {
       float16_t *dst_plane = dst_batch + i * c8_channel;
       float16_t *src_plane = src_batch + i * channel;
-      memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
+      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
     }
   }
 }
@@ -405,7 +405,7 @@ void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, i
     for (int i = 0; i < plane; i++) {
       float16_t *src_plane = src_batch + i * c8_channel;
       float16_t *dst_plane = dst_batch + i * channel;
-      memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
+      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
    }
  }
}
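
After this change every fp16 conv kernel shares one buffer lifecycle: GetExecuteTensor() prepares fp16 views of the I/O tensors through the new helpers, the kernel computes on execute_input_/execute_output_, and IfCastOutput() plus FreeTmpBuffer() finish the output. A minimal sketch of that call pattern follows (SomeFp16Kernel is a hypothetical placeholder, not a kernel added by this patch):

    // Sketch only, assuming the ConvolutionBaseFP16CPUKernel members introduced above.
    int SomeFp16Kernel::Run() {
      // ConvertInputFp32toFp16 / MallocOutputFp16 take transient buffers from the
      // ctx allocator only when a tensor holds fp32 data; otherwise the returned
      // pointer aliases the tensor's own fp16 data.
      auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
      if (ret != RET_OK) {
        return ret;
      }
      // ... kernel-specific compute: read execute_input_, write execute_output_ ...
      ConvolutionBaseFP16CPUKernel::IfCastOutput();   // fp16 -> fp32 copy-back if the output tensor is fp32
      ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();  // frees only the allocator-owned buffers
      return RET_OK;
    }

Note the ordering: FreeTmpBuffer() must run after IfCastOutput(), since the copy-back reads execute_output_.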