From 1b89036bea2faaca697312668e362469e5963e8c Mon Sep 17 00:00:00 2001 From: yangruoqi713 Date: Sun, 16 Aug 2020 15:29:27 +0800 Subject: [PATCH] [MS][LITE] optimize arm cpu fp16 conv op: add common converter functions for input and output --- .../runtime/kernel/arm/fp16/common_fp16.cc | 46 +++++++++++++++++++ .../src/runtime/kernel/arm/fp16/common_fp16.h | 28 +++++++++++ .../kernel/arm/fp16/convolution_1x1_fp16.cc | 40 +--------------- .../kernel/arm/fp16/convolution_1x1_fp16.h | 7 --- .../kernel/arm/fp16/convolution_3x3_fp16.cc | 25 +--------- .../kernel/arm/fp16/convolution_3x3_fp16.h | 6 --- .../kernel/arm/fp16/convolution_base_fp16.cc | 41 ++++++++--------- .../kernel/arm/fp16/convolution_base_fp16.h | 8 ++-- .../arm/fp16/convolution_depthwise_fp16.cc | 27 ++++------- .../arm/fp16/convolution_depthwise_fp16.h | 6 +-- .../kernel/arm/fp16/convolution_fp16.cc | 24 +--------- .../kernel/arm/fp16/convolution_fp16.h | 6 --- .../kernel/arm/fp16/convolution_sw_fp16.cc | 24 +--------- .../kernel/arm/fp16/convolution_sw_fp16.h | 6 --- .../arm/fp16/convolution_winograd_fp16.cc | 24 +--------- .../arm/fp16/convolution_winograd_fp16.h | 6 --- .../arm/fp16/deconvolution_depthwise_fp16.cc | 18 +++++--- .../arm/fp16/deconvolution_depthwise_fp16.h | 6 +-- .../runtime/kernel/arm/nnacl/fp16/pack_fp16.c | 4 +- 19 files changed, 130 insertions(+), 222 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc create mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc new file mode 100644 index 0000000000..87f5446040 --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.cc @@ -0,0 +1,46 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
+#include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+
+namespace mindspore::kernel {
+float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx) {
+  float16_t *fp16_data = nullptr;
+  auto data_type = input->data_type();
+  if (data_type == kNumberTypeFloat32) {
+    auto ele_num = input->ElementsNum();
+    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
+    auto ori_data = reinterpret_cast<float *>(input->Data());
+    Float32ToFloat16(ori_data, fp16_data, ele_num);
+  } else {
+    fp16_data = reinterpret_cast<float16_t *>(input->Data());
+  }
+  return fp16_data;
+}
+
+float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx) {
+  float16_t *fp16_data = nullptr;
+  auto data_type = output->data_type();
+  if (data_type == kNumberTypeFloat32) {
+    auto ele_num = output->ElementsNum();
+    fp16_data = reinterpret_cast<float16_t *>(ctx->allocator->Malloc(ele_num * sizeof(float16_t)));
+  } else {
+    fp16_data = reinterpret_cast<float16_t *>(output->Data());
+  }
+  return fp16_data;
+}
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
new file mode 100644
index 0000000000..9a177d0924
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/common_fp16.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
+
+#include "src/lite_kernel.h"
+
+namespace mindspore::kernel {
+float16_t *ConvertInputFp32toFp16(lite::tensor::Tensor *input, const lite::Context *ctx);
+
+float16_t *MallocOutputFp16(lite::tensor::Tensor *output, const lite::Context *ctx);
+
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_COMMON_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 5d4e361f64..e0e00f5bcf 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -98,28 +98,6 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
-int Convolution1x1FP16CPUKernel::InitBuffer() {
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(fp16_input_, 0, fp16_input_size);
-
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
-  return RET_OK;
-}
-
 int Convolution1x1FP16CPUKernel::Init() {
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -136,11 +114,6 @@ int Convolution1x1FP16CPUKernel::Init() {
     MS_LOG(ERROR) << "Init conv1x1 param failed.";
     return ret;
   }
-  ret = InitBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init buffer failed.";
-    return ret;
-  }
   ret = InitWeightBias();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init weight bias failed.";
@@ -150,12 +123,6 @@ int Convolution1x1FP16CPUKernel::Init() {
 }
 
 int Convolution1x1FP16CPUKernel::ReSize() {
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
   if (fp16_weight_ != nullptr) {
     free(fp16_weight_);
   }
@@ -181,12 +148,6 @@ int Convolution1x1FP16CPUKernel::ReSize() {
     MS_LOG(ERROR) << "Init conv1x1 param failed.";
     return ret;
   }
-  ret = InitBuffer();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init buffer failed.";
-    return ret;
-  }
-
   return RET_OK;
 }
@@ -253,6 +214,7 @@ int Convolution1x1FP16CPUKernel::Run() {
   }
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index d88e79755f..919fe41f36 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -35,15 +35,9 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
     matmul_param_ = new MatMulParameter();
   }
   ~Convolution1x1FP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
      free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (input_ptr_ != nullptr) {
       free(input_ptr_);
     }
@@ -57,7 +51,6 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int ReSize() override;
   int Run() override;
   int RunImpl(int task_id);
-  int InitBuffer();
   int InitConv1x1Param();
   int InitMatmulParam();
   int InitWeightBias();
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
index fa129b3c81..b111624431 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -132,16 +132,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   }
   memset(tmp_out_, 0, tmp_out_size);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-  memset(fp16_input_, 0, fp16_input_size);
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = iC4 * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -152,14 +142,6 @@ int Convolution3x3FP16CPUKernel::InitTmpBuffer() {
   }
   memset(nhwc4_input_, 0, nhwc4_input_size);
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -207,12 +189,6 @@ int Convolution3x3FP16CPUKernel::ReSize() {
   if (tmp_out_ != nullptr) {
     free(tmp_out_);
   }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
@@ -304,6 +280,7 @@ int Convolution3x3FP16CPUKernel::Run() {
   }
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
index e0e7e516ad..d61ec538be 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.h
@@ -31,15 +31,9 @@ class Convolution3x3FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                               const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~Convolution3x3FP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (transformed_filter_addr_ != nullptr) {
       free(transformed_filter_addr_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
index a56c4d06f4..5d44fbb003 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
@@ -16,6 +16,7 @@
 
 #include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/cast_fp16.h"
+#include "src/runtime/kernel/arm/fp16/common_fp16.h"
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
@@ -25,28 +26,17 @@ namespace mindspore::kernel {
 int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
   // ===================input====================//
   auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_data_type = input_tensor->data_type();
-  MS_ASSERT(input_data_type == kNumberTypeFloat32 || input_data_type == kNumberTypeFloat16);
-  if (input_data_type == kNumberTypeFloat32) {
-    auto input_ele_num = input_tensor->ElementsNum();
-    auto ori_input_data = reinterpret_cast<float *>(input_tensor->Data());
-    Float32ToFloat16(ori_input_data, fp16_input_, input_ele_num);
-    execute_input_ = fp16_input_;
-  } else {
-    auto ori_input_data = reinterpret_cast<float16_t *>(input_tensor->Data());
-    execute_input_ = ori_input_data;
-  }
+  in_data_type_ = input_tensor->data_type();
+  MS_ASSERT(in_data_type_ == kNumberTypeFloat32 || in_data_type_ == kNumberTypeFloat16);
+
+  execute_input_ = ConvertInputFp32toFp16(input_tensor, context_);
+
   // ==================output====================//
   auto out_tensor = out_tensors_.at(kOutputIndex);
-  auto out_data_type = out_tensor->data_type();
-  MS_ASSERT(out_data_type == kNumberTypeFloat32 || out_data_type == kNumberTypeFloat16);
-  out_data_type_ = out_data_type;
-  if (out_data_type == kNumberTypeFloat32) {
-    execute_output_ = fp16_out_;
-  } else {
-    auto out_ptr = reinterpret_cast<float16_t *>(out_tensor->Data());
-    execute_output_ = out_ptr;
-  }
+  out_data_type_ = out_tensor->data_type();
+  MS_ASSERT(out_data_type_ == kNumberTypeFloat32 || out_data_type_ == kNumberTypeFloat16);
+
+  execute_output_ = MallocOutputFp16(out_tensor, context_);
   return RET_OK;
 }
 
@@ -79,7 +69,16 @@ void ConvolutionBaseFP16CPUKernel::IfCastOutput() {
     auto out_tensor = out_tensors_.at(kOutputIndex);
     auto out_ele_num = out_tensor->ElementsNum();
     auto output_addr = reinterpret_cast<float *>(out_tensor->Data());
-    Float16ToFloat32(fp16_out_, output_addr, out_ele_num);
+    Float16ToFloat32(execute_output_, output_addr, out_ele_num);
+  }
+}
+
+void ConvolutionBaseFP16CPUKernel::FreeTmpBuffer() {
+  if (in_data_type_ == kNumberTypeFloat32) {
+    context_->allocator->Free(execute_input_);
+  }
+  if (out_data_type_ == kNumberTypeFloat32) {
+    context_->allocator->Free(execute_output_);
   }
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
index c4845a762c..e8d0f1eb81 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
@@ -39,14 +39,14 @@ class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
   virtual int GetExecuteTensor();
   virtual int GetExecuteFilter();
   virtual void IfCastOutput();
+  void FreeTmpBuffer();
 
  protected:
-  float16_t *fp16_input_ = nullptr;
   float16_t *fp16_weight_ = nullptr;
-  float16_t *fp16_out_ = nullptr;
-  float16_t *execute_input_;
+  float16_t *execute_input_;   // allocated and freed via the ctx allocator when the input tensor is fp32
   float16_t *execute_weight_;
-  float16_t *execute_output_;
+  float16_t *execute_output_;  // allocated and freed via the ctx allocator when the output tensor is fp32
+  TypeId in_data_type_;
   TypeId out_data_type_;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index ed38b6b82e..b247e32e38 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -173,22 +173,13 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  float16_t *input_addr;
-  if (input_tensor->data_type() == kNumberTypeFloat32) {
-    input_addr =
-      reinterpret_cast<float16_t *>(context_->allocator->Malloc(input_tensor->ElementsNum() * sizeof(float16_t)));
-    if (input_addr == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
-    }
-    Float32ToFloat16(reinterpret_cast<float *>(input_tensor->Data()), input_addr, input_tensor->ElementsNum());
-  } else {
-    input_addr = reinterpret_cast<float16_t *>(input_tensor->Data());
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get execute tensor failed.";
+    return ret;
   }
 
-  // pack input: to nhwc8
-  PackNHWCToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
+  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
                       conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_);
@@ -197,13 +188,11 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto output_addr = reinterpret_cast<float16_t *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNHWC8ToNHWCFp16(packed_output_, output_addr, conv_param_->output_batch_,
+  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
                       conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
 
-  if (input_tensor->data_type() == kNumberTypeFloat32) {
-    context_->allocator->Free(input_addr);
-  }
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index 6003efaf5f..00827510f3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
 #endif
 
 namespace mindspore::kernel {
-class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
  public:
   ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                     const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                     const lite::Primitive *primitive)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
index 7cb95f8165..a3ea280767 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -103,15 +103,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
   }
   memset(packed_input_, 0, in_batch * packed_input_size * sizeof(float16_t));
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size =
-    in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -129,14 +120,6 @@ int ConvolutionFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size =
-    out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -181,12 +164,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
 
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -242,6 +219,7 @@ int ConvolutionFP16CPUKernel::Run() {
     return RET_ERROR;
   }
 
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
index ad53277c88..0a4364235f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
@@ -30,15 +30,9 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                            const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (packed_input_ != nullptr) {
       free(packed_input_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
index 465ab99e20..516f001f92 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc
@@ -106,15 +106,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
   int channel_block = UP_DIV(in_channel, C4NUM);
   int oc4 = UP_DIV(out_channel, C4NUM);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size =
-    in_channel * conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================nhwc4_input_============================*/
   size_t nhwc4_input_size = channel_block * C4NUM * conv_param_->input_batch_ * conv_param_->input_h_ *
                             conv_param_->input_w_ * sizeof(float16_t);
@@ -133,14 +124,6 @@ int ConvolutionSWFP16CPUKernel::InitTmpBuffer() {
     return RET_ERROR;
   }
 
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size =
-    out_channel * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
   return RET_OK;
 }
 
@@ -186,12 +169,6 @@ int ConvolutionSWFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
   delete slidingWindow_param_;
 
   auto ret = ConvolutionBaseCPUKernel::Init();
@@ -258,6 +235,7 @@ int ConvolutionSWFP16CPUKernel::Run() {
                         conv_param_->output_channel_);
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
index ce9aa0b674..aa81475fd0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.h
@@ -29,15 +29,9 @@ class ConvolutionSWFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                              const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionSWFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (packed_weight_ != nullptr) {
       free(packed_weight_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index a3866c14e3..495cc10319 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -187,15 +187,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
   int ic4 = UP_DIV(channel_in, C4NUM);
   int oc8 = UP_DIV(channel_out, C8NUM);
 
-  /*=============================fp16_input_============================*/
-  size_t fp16_input_size = conv_param_->input_channel_ * conv_param_->input_batch_ * conv_param_->input_h_ *
-                           conv_param_->input_w_ * sizeof(float16_t);
-  fp16_input_ = reinterpret_cast<float16_t *>(malloc(fp16_input_size));
-  if (fp16_input_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_input_ failed.";
-    return RET_ERROR;
-  }
-
   /*=============================trans_input_============================*/
   size_t tile_buffer_size = thread_count_ * cal_num * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float16_t);
   trans_input_ = reinterpret_cast<float16_t *>(malloc(tile_buffer_size));
@@ -222,14 +213,6 @@ int ConvolutionWinogradFP16CPUKernel::InitTmpBuffer() {
     MS_LOG(ERROR) << "malloc tmp_out_data_ failed.";
     return RET_ERROR;
   }
-  /*=============================fp16_out_============================*/
-  size_t fp16_output_size = conv_param_->output_channel_ * conv_param_->output_batch_ * conv_param_->output_h_ *
-                            conv_param_->output_w_ * sizeof(float16_t);
-  fp16_out_ = reinterpret_cast<float16_t *>(malloc(fp16_output_size));
-  if (fp16_out_ == nullptr) {
-    MS_LOG(ERROR) << "malloc fp16_out_ failed.";
-    return RET_ERROR;
-  }
 
   /*=============================tmp_data_============================*/
   tmp_data_ =
@@ -327,12 +310,6 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
   if (nhwc4_input_ != nullptr) {
     free(nhwc4_input_);
   }
-  if (fp16_input_ != nullptr) {
-    free(fp16_input_);
-  }
-  if (fp16_out_ != nullptr) {
-    free(fp16_out_);
-  }
 
   auto ret = ConvolutionBaseCPUKernel::Init();
   if (ret != RET_OK) {
@@ -412,6 +389,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
     // do nothing
   }
   ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index c5d2f0e3a0..8baacb5b53 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -34,15 +34,9 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
                                    const lite::Primitive *primitive)
       : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~ConvolutionWinogradFP16CPUKernel() override {
-    if (fp16_input_ != nullptr) {
-      free(fp16_input_);
-    }
     if (fp16_weight_ != nullptr) {
       free(fp16_weight_);
     }
-    if (fp16_out_ != nullptr) {
-      free(fp16_out_);
-    }
     if (tmp_data_ != nullptr) {
       free(tmp_data_);
     }
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
index 366232034d..a00a82aefb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -185,11 +185,14 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto input_tensor = in_tensors_.at(kInputIndex);
-  auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
+  ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Get execute tensor failed.";
+    return ret;
+  }
   // pack input: to nhwc8
-  PackNHWCFp32ToNHWC8Fp16(input_addr, packed_input_, conv_param_->input_batch_,
-                          conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
+  PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+                      conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
 
   ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
@@ -197,9 +200,10 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
-  PackNHWC8Fp16ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
-                          conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+                      conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
+  ConvolutionBaseFP16CPUKernel::IfCastOutput();
+  ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
index b7d473490b..79fd014543 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
+#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
 #include "src/runtime/kernel/arm/nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,12 @@ void ComputeStrides(int *shape, int *strides, int ndim);
 #endif
 
 namespace mindspore::kernel {
-class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
+class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
 public:
   DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                       const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                       const lite::Primitive *primitive)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
+      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
   ~DeconvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c
index dd84dcef22..b3872f6cc0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp16/pack_fp16.c
@@ -392,7 +392,7 @@ void PackNHWCToNHWC8Fp16(float16_t *src, float16_t *dst, int batch, int plane, i
     for (int i = 0; i < plane; i++) {
       float16_t *dst_plane = dst_batch + i * c8_channel;
       float16_t *src_plane = src_batch + i * channel;
-      memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
+      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
     }
   }
 }
@@ -405,7 +405,7 @@ void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, i
     for (int i = 0; i < plane; i++) {
       float16_t *src_plane = src_batch + i * c8_channel;
       float16_t *dst_plane = dst_batch + i * channel;
-      memcpy(dst_plane, src_batch, channel * sizeof(float16_t));
+      memcpy(dst_plane, src_plane, channel * sizeof(float16_t));
    }
  }
}
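
After this change every fp16 conv kernel shares one buffer lifecycle: GetExecuteTensor() prepares fp16 views of the I/O tensors through the new helpers, the kernel computes on execute_input_/execute_output_, and IfCastOutput() plus FreeTmpBuffer() finish the output. A minimal sketch of that call pattern follows (SomeFp16Kernel is a hypothetical placeholder, not a kernel added by this patch):

    // Sketch only, assuming the ConvolutionBaseFP16CPUKernel members introduced above.
    int SomeFp16Kernel::Run() {
      // ConvertInputFp32toFp16 / MallocOutputFp16 take transient buffers from the
      // ctx allocator only when a tensor holds fp32 data; otherwise the returned
      // pointer aliases the tensor's own fp16 data.
      auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
      if (ret != RET_OK) {
        return ret;
      }
      // ... kernel-specific compute: read execute_input_, write execute_output_ ...
      ConvolutionBaseFP16CPUKernel::IfCastOutput();   // fp16 -> fp32 copy-back if the output tensor is fp32
      ConvolutionBaseFP16CPUKernel::FreeTmpBuffer();  // frees only the allocator-owned buffers
      return RET_OK;
    }

Note the ordering: FreeTmpBuffer() must run after IfCastOutput(), since the copy-back reads execute_output_.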