From 3e9ebc745f2719991967be0883a36b73f19b1176 Mon Sep 17 00:00:00 2001
From: yangruoqi713 <yangruoqi@huawei.com>
Date: Mon, 12 Apr 2021 17:38:04 +0800
Subject: [PATCH] [MSLITE][DEVELOP] rectify arm cpu fp16 conv register

---
 .../arm/base/group_convolution_creator.cc     | 109 +++-----
 .../arm/base/group_convolution_creator.h      |  28 +--
 .../kernel/arm/fp16/convolution_1x1_fp16.cc   |  22 +-
 .../kernel/arm/fp16/convolution_1x1_fp16.h    |   8 +-
 .../kernel/arm/fp16/convolution_base_fp16.cc  |  52 ----
 .../kernel/arm/fp16/convolution_base_fp16.h   |  57 -----
 .../arm/fp16/convolution_delegate_fp16.cc     | 235 +++---------------
 .../arm/fp16/convolution_delegate_fp16.h      |  12 +-
 .../arm/fp16/convolution_depthwise_fp16.cc    |  31 +--
 .../arm/fp16/convolution_depthwise_fp16.h     |  14 +-
 .../convolution_depthwise_slidewindow_fp16.cc |  28 ++-
 .../convolution_depthwise_slidewindow_fp16.h  |  14 +-
 .../kernel/arm/fp16/convolution_fp16.cc       |  24 +-
 .../kernel/arm/fp16/convolution_fp16.h        |   8 +-
 .../arm/fp16/convolution_winograd_fp16.cc     |  38 +--
 .../arm/fp16/convolution_winograd_fp16.h      |   9 +-
 .../arm/fp16/deconvolution_depthwise_fp16.cc  |  25 +-
 .../arm/fp16/deconvolution_depthwise_fp16.h   |   9 +-
 .../kernel/arm/fp16/deconvolution_fp16.cc     |  43 ++--
 .../kernel/arm/fp16/deconvolution_fp16.h      |  10 +-
 .../arm/fp16/deconvolution_winograd_fp16.cc   |  30 +--
 .../arm/fp16/deconvolution_winograd_fp16.h    |   9 +-
 .../arm/fp32/convolution_delegate_fp32.cc     |  32 ++-
 .../arm/int8/convolution_int8_creator.cc      |  24 ++
 24 files changed, 273 insertions(+), 598 deletions(-)
 delete mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
 delete mode 100644 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h

diff --git a/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc b/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc
index 5e447781a5..23895fddd1 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.cc
@@ -15,10 +15,6 @@
  */
 
 #include "src/runtime/kernel/arm/base/group_convolution_creator.h"
-#include "src/runtime/kernel/arm/base/group_convolution.h"
-#include "src/runtime/kernel/arm/int8/convolution_int8_creator.h"
-#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
-#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"
 
 namespace mindspore::kernel {
 void CopyTensorQuantParam(lite::Tensor *dst, lite::Tensor *src) {
@@ -37,15 +33,11 @@ ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
   return conv_parameter;
 }
 
-void FreeMemory(ConvParameter *conv_param, const std::vector<lite::Tensor *> &new_inputs,
-                const std::vector<lite::Tensor *> &new_outputs) {
-  if (conv_param != nullptr) {
-    free(conv_param);
-  }
-  for (auto &in_tensor : new_inputs) {
+void FreeMemory(const std::vector<lite::Tensor *> *new_inputs, const std::vector<lite::Tensor *> *new_outputs) {
+  for (auto &in_tensor : *new_inputs) {
     delete in_tensor;
   }
-  for (auto &out_tensor : new_outputs) {
+  for (auto &out_tensor : *new_outputs) {
     delete out_tensor;
   }
 }
@@ -106,6 +98,7 @@ void GroupConvCreator::CopyQuantParam(std::vector<lite::Tensor *> *tensors) {
     CopyTensorQuantParam(tensors->at(j), origin_inputs_.at(j));
   }
 }
+
 bool GroupConvCreator::CheckIfValidPoint(void *ptr) {
   if (ptr == nullptr) {
     for (auto &sub_conv : group_convs_) {
@@ -117,18 +110,17 @@ bool GroupConvCreator::CheckIfValidPoint(void *ptr) {
 }
 
 int GroupConvCreator::NewInputTensor(std::vector<lite::Tensor *> *tensors) {
-  auto in_tensor = CreateVarTensor(
-    {input_shape_, schema::Format_NHWC, origin_inputs_.at(0)->data_type(), lite::Tensor::Category::VAR, true},
-    infered_);
+  auto in_tensor =
+    CreateVarTensor({input_shape_, schema::Format_NHWC, data_type_, lite::Tensor::Category::VAR, true}, infered_);
   if (!CheckIfValidPoint(in_tensor)) {
     return lite::RET_ERROR;
   }
   tensors->emplace_back(in_tensor);
   return lite::RET_OK;
 }
+
 int GroupConvCreator::NewOutputTensor(std::vector<lite::Tensor *> *tensors, lite::Tensor *output) {
-  auto out_tensor =
-    CreateVarTensor({output_shape_, output->format(), output->data_type(), output->category(), false}, infered_);
+  auto out_tensor = CreateVarTensor({output_shape_, output->format(), data_type_, output->category(), false}, infered_);
   if (!CheckIfValidPoint(out_tensor)) {
     return lite::RET_ERROR;
   }
@@ -153,6 +145,7 @@ int GroupConvCreator::NewConstTensor(std::vector<lite::Tensor *> *tensors, int g
   }
   return lite::RET_OK;
 }
+
 void GroupConvCreator::SetShapeOfTensors() {
   int new_in_channel = origin_inputs_.at(kWeightIndex)->Channel();
   int new_out_channel;
@@ -176,71 +169,31 @@ void GroupConvCreator::SetShapeOfTensors() {
   }
 }
 
-int GroupConvCreator::CreatGroupConv() {
-  for (int i = 0; i < conv_param_->group_; ++i) {
-    auto new_conv_parameter = CreateNewConvParameter(conv_param_);
-    if (!CheckIfValidPoint(new_conv_parameter)) {
-      return lite::RET_ERROR;
-    }
-    // create new input for each group
-    std::vector<lite::Tensor *> new_inputs;
-    if (NewInputTensor(&new_inputs) != lite::RET_OK) {
-      MS_LOG(ERROR) << "new input tensor failed.";
-      FreeMemory(new_conv_parameter, new_inputs, {});
-      return lite::RET_ERROR;
-    }
-    // const tensor
-    if (NewConstTensor(&new_inputs, i) != lite::RET_OK) {
-      MS_LOG(ERROR) << "new const tensor failed.";
-      FreeMemory(new_conv_parameter, new_inputs, {});
+int GroupConvCreator::GetSingleConvParam(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs,
+                                         std::vector<lite::Tensor *> *new_outputs, int group_id) {
+  if (!CheckIfValidPoint(conv_param)) {
+    return lite::RET_ERROR;
+  }
+  // create new input for each group
+  if (NewInputTensor(new_inputs) != lite::RET_OK) {
+    MS_LOG(ERROR) << "new input tensor failed.";
+    FreeMemory(new_inputs, {});
+    return lite::RET_ERROR;
+  }
+  // const tensor
+  if (NewConstTensor(new_inputs, group_id) != lite::RET_OK) {
+    MS_LOG(ERROR) << "new const tensor failed.";
+    FreeMemory(new_inputs, {});
+    return lite::RET_ERROR;
+  }
+  // create new output tensor
+  for (auto &output : origin_outputs_) {
+    if (NewOutputTensor(new_outputs, output) != lite::RET_OK) {
+      MS_LOG(ERROR) << "new output tensor failed.";
+      FreeMemory(new_inputs, new_outputs);
       return lite::RET_ERROR;
     }
-    // create new output tensor
-    std::vector<lite::Tensor *> new_outputs;
-    for (auto &output : origin_outputs_) {
-      if (NewOutputTensor(&new_outputs, output) != lite::RET_OK) {
-        MS_LOG(ERROR) << "new output tensor failed.";
-        FreeMemory(new_conv_parameter, new_inputs, new_outputs);
-        return lite::RET_ERROR;
-      }
-    }
-
-    if (is_quant_) {
-      CopyQuantParam(&new_inputs);
-      group_convs_.emplace_back(CpuConvInt8KernelSelect(new_inputs, new_outputs,
-                                                        reinterpret_cast<OpParameter *>(new_conv_parameter), context_));
-    } else {
-      group_convs_.emplace_back(new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(
-        reinterpret_cast<OpParameter *>(new_conv_parameter), new_inputs, new_outputs, context_));
-    }
   }
   return lite::RET_OK;
 }
-
-kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                                  const lite::InnerContext *ctx) {
-  GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false);
-  group_conv_creator.SetShapeOfTensors();
-  if (group_conv_creator.CreatGroupConv() != lite::RET_OK) {
-    MS_LOG(ERROR) << "Create fp32 group conv failed.";
-    return nullptr;
-  }
-  return new (std::nothrow)
-    GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator.get_group_conv(),
-                              reinterpret_cast<ConvParameter *>(op_parameter)->group_);
-}
-
-kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                                  const lite::InnerContext *ctx, int group) {
-  GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, true);
-  group_conv_creator.SetShapeOfTensors();
-  if (group_conv_creator.CreatGroupConv() != lite::RET_OK) {
-    MS_LOG(ERROR) << "Create int8 group conv failed.";
-    return nullptr;
-  }
-  return new (std::nothrow)
-    GroupConvolutionInt8CPUKernel(op_parameter, inputs, outputs, ctx, group_conv_creator.get_group_conv(), group);
-}
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.h b/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.h
index 5c0e616ede..c0f844626f 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/group_convolution_creator.h
@@ -34,12 +34,12 @@ struct TensorInfo {
 class GroupConvCreator {
  public:
   GroupConvCreator(std::vector<lite::Tensor *> inputs, std::vector<lite::Tensor *> outputs, OpParameter *op_parameter,
-                   const lite::InnerContext *ctx, bool is_quant)
+                   const lite::InnerContext *ctx, bool is_quant, TypeId data_type)
       : origin_inputs_(std::move(inputs)),
         origin_outputs_(std::move(outputs)),
-        context_(ctx),
         infered_(op_parameter->infer_flag_),
-        is_quant_(is_quant) {
+        is_quant_(is_quant),
+        data_type_(data_type) {
     conv_param_ = reinterpret_cast<ConvParameter *>(op_parameter);
   }
 
@@ -47,15 +47,16 @@ class GroupConvCreator {
 
  public:
   void SetShapeOfTensors();
-  int CreatGroupConv();
-  std::vector<kernel::LiteKernel *> get_group_conv() { return group_convs_; }
+  std::vector<kernel::LiteKernel *> *get_group_conv() { return &group_convs_; }
+  void CopyQuantParam(std::vector<lite::Tensor *> *tensors);
+  int GetSingleConvParam(ConvParameter *conv_param, std::vector<lite::Tensor *> *new_inputs,
+                         std::vector<lite::Tensor *> *new_outputs, int group_id);
 
  protected:
   void set_input_shape(const std::vector<int> &shape) { input_shape_ = shape; }
   void set_output_shape(const std::vector<int> &shape) { output_shape_ = shape; }
   void set_filter_shape(const std::vector<int> &shape) { filter_shape_ = shape; }
   void set_bias_shape(const std::vector<int> &shape) { bias_shape_ = shape; }
-  void CopyQuantParam(std::vector<lite::Tensor *> *tensors);
   bool CheckIfValidPoint(void *ptr);
   int NewInputTensor(std::vector<lite::Tensor *> *tensors);
   int NewConstTensor(std::vector<lite::Tensor *> *tensors, int group_id);
@@ -69,20 +70,13 @@ class GroupConvCreator {
   std::vector<int> output_shape_;
   std::vector<int> filter_shape_;
   std::vector<int> bias_shape_;
-  const lite::InnerContext *context_;
   ConvParameter *conv_param_;
-  bool infered_;
-  bool is_quant_;
+  bool infered_ = false;
+  bool is_quant_ = false;
+  TypeId data_type_;
 };
 
-LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                          const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                          const lite::InnerContext *ctx);
-
-LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                          const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                          const lite::InnerContext *ctx, int group);
-
+ConvParameter *CreateNewConvParameter(ConvParameter *parameter);
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_GROUP_CONVOLUTION_CREATOR_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
index 09b6e20cd4..c47450f8e6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -88,12 +88,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
       MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
       return RET_ERROR;
     }
-    if (origin_bias_data_type_ == kNumberTypeFloat16) {
-      memcpy(bias_data_, origin_bias_, output_channel * sizeof(float16_t));
-    } else {
-      MS_LOG(ERROR) << "Conv1x1 only support fp16 weight";
-      return RET_ERROR;
-    }
+    memcpy(bias_data_, origin_bias_, output_channel * sizeof(float16_t));
     memset(reinterpret_cast<char *>(bias_data_) + bias_size, 0, size - bias_size);
   }
 
@@ -105,8 +100,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
-  ColMajor2Row8MajorFp16(origin_weight_, weight_ptr_, input_channel, output_channel,
-                         origin_weight_data_type_ == kNumberTypeFloat16);
+  ColMajor2Row8MajorFp16(origin_weight_, weight_ptr_, input_channel, output_channel, true);
   return RET_OK;
 }
 
@@ -217,8 +211,12 @@ static int Convolution1x1Fp16RunHw(void *cdata, int task_id) {
 }
 
 int Convolution1x1FP16CPUKernel::Run() {
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
+  auto input_data = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_data = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_data == nullptr || output_data == nullptr) {
+    MS_LOG(ERROR) << "Convolution1x1 Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
   pack_input_ = reinterpret_cast<float16_t *>(
     ctx_->allocator->Malloc(matmul_param_->row_align_ * matmul_param_->deep_ * sizeof(float16_t)));
   if (pack_input_ == nullptr) {
@@ -227,9 +225,9 @@ int Convolution1x1FP16CPUKernel::Run() {
   }
 
   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-    output_ptr_ = execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_;
+    output_ptr_ = output_data + batch_index * matmul_param_->row_ * matmul_param_->col_;
     float16_t *batch_in =
-      execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
+      input_data + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_;
     if (pre_trans_input_) {
       Conv1x1InputPack(batch_in, input_ptr_, conv_param_, sizeof(float16_t));
     } else {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
index 680ba6ef40..a799abfc84 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
@@ -20,18 +20,18 @@
 #include <arm_neon.h>
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "src/common/utils.h"
 #include "nnacl/matmul_parameter.h"
 #include "nnacl/fp16/matmul_fp16.h"
 
 namespace mindspore::kernel {
-class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class Convolution1x1FP16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                               const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, void *origin_weight,
-                              void *origin_bias, TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
+                              void *origin_bias)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
         origin_weight_(origin_weight),
         origin_bias_(origin_bias) {}
   ~Convolution1x1FP16CPUKernel() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
deleted file mode 100644
index bdaf6cd6f0..0000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
-#include "nnacl/fp16/cast_fp16.h"
-#include "src/runtime/kernel/arm/fp16/common_fp16.h"
-#include "include/errorcode.h"
-#include "src/runtime/runtime_api.h"
-
-namespace mindspore::kernel {
-using mindspore::lite::RET_ERROR;
-using mindspore::lite::RET_OK;
-ConvolutionBaseFP16CPUKernel::~ConvolutionBaseFP16CPUKernel() {
-  if (fp16_weight_ != nullptr) {
-    free(fp16_weight_);
-    fp16_weight_ = nullptr;
-  }
-}
-
-int ConvolutionBaseFP16CPUKernel::GetExecuteTensor() {
-  auto input_tensor = in_tensors_.at(0);
-  auto output_tensor = out_tensors_.at(0);
-  execute_input_ = reinterpret_cast<float16_t *>(input_tensor->data_c());
-  execute_output_ = reinterpret_cast<float16_t *>(output_tensor->data_c());
-  return RET_OK;
-}
-
-int ConvolutionBaseFP16CPUKernel::GetExecuteFilter(lite::Tensor *weight_tensor, void *origin_data) {
-  MS_ASSERT(origin_weight_data_type_ == kNumberTypeFloat32 || origin_weight_data_type_ == kNumberTypeFloat16);
-  if (origin_weight_data_type_ == kNumberTypeFloat32) {
-    MS_LOG(ERROR) << "Conv fp16 only support fp16 weight";
-    return RET_ERROR;
-  } else {
-    execute_weight_ = reinterpret_cast<float16_t *>(origin_data);
-    fp16_weight_ = nullptr;
-  }
-  return RET_OK;
-}
-}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
deleted file mode 100644
index 4dc7ddd755..0000000000
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_base_fp16.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
-#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
-
-#include <arm_neon.h>
-#include <vector>
-#include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/base/convolution_base.h"
-#include "src/common/utils.h"
-
-namespace mindspore::kernel {
-class ConvolutionBaseFP16CPUKernel : public ConvolutionBaseCPUKernel {
- public:
-  ConvolutionBaseFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                               const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-                               TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
-        origin_weight_data_type_(origin_weight_data_type),
-        origin_bias_data_type_(origin_bias_data_type) {}
-  ~ConvolutionBaseFP16CPUKernel() override;
-
-  int Init() override { return mindspore::lite::RET_OK; }
-  int ReSize() override { return mindspore::lite::RET_OK; }
-  int Run() override { return mindspore::lite::RET_OK; }
-  int RunImpl(int task_id) { return mindspore::lite::RET_OK; }
-  virtual int GetExecuteTensor();
-  // origin_data may not be the same as the data in the weight tensor,
-  // because weight tensor has released data already. In this situation,
-  // origin_data is the pointer of another memory block.
-  virtual int GetExecuteFilter(lite::Tensor *weight_tensor, void *origin_data);
-
- protected:
-  float16_t *fp16_weight_ = nullptr;
-  float16_t *execute_input_ = nullptr;
-  float16_t *execute_weight_ = nullptr;
-  float16_t *execute_output_ = nullptr;
-  TypeId origin_weight_data_type_;
-  TypeId origin_bias_data_type_;
-};
-}  // namespace mindspore::kernel
-
-#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_BASE_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
index 13c4862a76..4526900ca3 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
@@ -22,6 +22,7 @@
 #include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h"
 #include "src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h"
 #include "src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h"
+#include "src/runtime/kernel/arm/base/group_convolution_creator.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
@@ -96,8 +97,8 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() {
   kernel::SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(),
                                   out_tensors_.front(), context_);
   if (fp16_conv_kernel_ == nullptr) {
-    fp16_conv_kernel_ = CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_,
-                                                origin_bias_, origin_weight_data_type_, origin_bias_data_type_);
+    fp16_conv_kernel_ =
+      CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, origin_weight_, origin_bias_);
     if (fp16_conv_kernel_ == nullptr) {
       MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
       return RET_ERROR;
@@ -108,29 +109,16 @@ int ConvolutionDelegateFP16CPUKernel::ReSize() {
   return fp16_conv_kernel_->ReSize();
 }
 
-ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) {
-  auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
-  if (conv_parameter == nullptr) {
-    MS_LOG(ERROR) << "Malloc new conv parameter failed.";
-    return nullptr;
-  }
-  memcpy(conv_parameter, parameter, sizeof(ConvParameter));
-  return conv_parameter;
-}
-
 kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                               const InnerContext *ctx, void *origin_weight, void *origin_bias,
-                                               TypeId origin_weight_data_type, TypeId origin_bias_data_type) {
+                                               const InnerContext *ctx) {
   MS_ASSERT(opParameter != nullptr);
   auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
   kernel::LiteKernel *kernel = nullptr;
   if (conv_param->input_channel_ < 32) {
-    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(
-      opParameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
+    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseSWFp16CPUKernel(opParameter, inputs, outputs, ctx);
   } else {
-    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(
-      opParameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
+    kernel = new (std::nothrow) kernel::ConvolutionDepthwiseFp16CPUKernel(opParameter, inputs, outputs, ctx);
   }
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "kernel is nullptr.";
@@ -142,27 +130,22 @@ kernel::LiteKernel *CpuConvDwFp16KernelCreator(const std::vector<lite::Tensor *>
 
 kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                            const lite::InnerContext *ctx, void *origin_weight, void *origin_bias,
-                                            TypeId origin_weight_data_type, TypeId origin_bias_data_type) {
+                                            const lite::InnerContext *ctx, void *origin_weight, void *origin_bias) {
   auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
   bool use_winograd = false;
   int out_unit;
   CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
   kernel::LiteKernel *kernel = nullptr;
 
-  if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
-    kernel = CpuConvDwFp16KernelCreator(inputs, outputs, op_parameter, ctx, origin_weight, origin_bias,
-                                        origin_weight_data_type, origin_bias_data_type);
-  } else if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
-    kernel = new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(
-      op_parameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
-  } else if (use_winograd) {
+  if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
     kernel = new (std::nothrow)
-      kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, out_unit, origin_weight, origin_bias,
-                                               origin_weight_data_type, origin_bias_data_type);
+      kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias);
+  } else if (use_winograd) {
+    kernel = new (std::nothrow) kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, out_unit,
+                                                                         origin_weight, origin_bias);
   } else {
-    kernel = new (std::nothrow) kernel::ConvolutionFP16CPUKernel(
-      op_parameter, inputs, outputs, ctx, origin_weight, origin_bias, origin_weight_data_type, origin_bias_data_type);
+    kernel = new (std::nothrow)
+      kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, origin_weight, origin_bias);
   }
   // Once kernel is selected, init func will invoke InitWeightAndBias
   auto ret = kernel->Init();
@@ -174,194 +157,54 @@ kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &i
   return kernel;
 }
 
-void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
-                    const std::vector<lite::Tensor *> &new_outputs) {
-  for (auto sub_conv : group_convs) {
-    delete sub_conv;
-  }
-  for (auto in_tensor : new_inputs) {
-    delete in_tensor;
-  }
-  for (auto out_tensor : new_outputs) {
-    delete out_tensor;
-  }
-}
-
-static lite::Tensor *CreateInputTensorFp16(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) {
-  auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
-  if (in_tensor == nullptr) {
-    MS_LOG(ERROR) << "new in_tensor failed.";
-    return nullptr;
-  }
-  if (infered_flag) {
-    auto ret = in_tensor->MallocData();
-    if (ret != RET_OK) {
-      delete in_tensor;
-      MS_LOG(ERROR) << "in tensor malloc failed.";
-      return nullptr;
-    }
-  }
-  return in_tensor;
-}
-
-static lite::Tensor *CreateConstTensorFp16(lite::Tensor *tensor, const std::vector<int> &shape, const int index) {
-  auto new_tensor =
-    new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
-  if (new_tensor == nullptr) {
-    MS_LOG(ERROR) << "Create new_tensor failed.";
-    return nullptr;
-  }
-  auto ret = new_tensor->MallocData();
-  if (ret != RET_OK) {
-    delete new_tensor;
-    MS_LOG(ERROR) << "Malloc new_tensor failed.";
-    return nullptr;
-  }
-  memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(),
-         new_tensor->Size());
-  return new_tensor;
-}
-
-static lite::Tensor *CreateOutputTensorFp16(const std::vector<int> &out_shape,
-                                            const std::vector<lite::Tensor *> &outputs, bool infered_flag, int index) {
-  auto out_tensor = new (std::nothrow) lite::Tensor();
-  if (out_tensor == nullptr) {
-    MS_LOG(ERROR) << "new tmp_out_tensor failed.";
-    return nullptr;
-  }
-  out_tensor->set_data_type(mindspore::kNumberTypeFloat16);
-  out_tensor->set_format(outputs.at(index)->format());
-  if (infered_flag) {
-    out_tensor->set_shape(out_shape);
-    auto ret = out_tensor->MallocData();
-    if (ret != RET_OK) {
-      delete out_tensor;
-      MS_LOG(ERROR) << "out_tensor malloc data failed.";
-      return nullptr;
-    }
-  }
-  return out_tensor;
-}
-
-kernel::LiteKernel *CreateDelegateConvFp16(const std::vector<lite::Tensor *> &inputs,
-                                           const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                           const InnerContext *ctx) {
-  auto weight_data_type = inputs.at(1)->data_type();
-  if (weight_data_type != kNumberTypeFloat16) {
-    MS_LOG(ERROR) << "Convfp16 only support fp16 weight";
-    return nullptr;
-  }
-  TypeId bias_data_type = kTypeUnknown;
-  if (inputs.size() == 3) {
-    bias_data_type = inputs.at(2)->data_type();
-  }
-  return new (std::nothrow)
-    kernel::ConvolutionDelegateFP16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
-}
-
 kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                   const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                                   const InnerContext *ctx) {
-  bool infer_flag = op_parameter->infer_flag_;
   auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
-  // update new shape info for each sub kernel
-  int new_in_channel = inputs.at(kWeightIndex)->Channel();
-  int new_out_channel = 0;
-  if (conv_param->group_ == 0) {
-    MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
-    return nullptr;
-  } else {
-    new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_;
-  }
+  GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat16);
+  group_conv_creator.SetShapeOfTensors();
 
-  std::vector<int> in_shape;
-  std::vector<int> out_shape;
-  if (infer_flag) {
-    conv_param->input_channel_ = new_in_channel;
-    conv_param->output_channel_ = new_out_channel;
-    in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
-    out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel};
-  }
-  std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
-  std::vector<int> bias_shape = {new_out_channel};
-
-  // new group conv op
-  std::vector<kernel::LiteKernel *> group_convs;
-  // create tensors for every sub conv kernel
   for (int i = 0; i < conv_param->group_; ++i) {
+    ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
     std::vector<lite::Tensor *> new_inputs;
     std::vector<lite::Tensor *> new_outputs;
-    auto new_conv_parameter = CreateNewConvParameterFp16(conv_param);
-    if (new_conv_parameter == nullptr) {
-      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-      MS_LOG(ERROR) << "Get new conv parameter failed.";
-      return nullptr;
-    }
-    // create new input for each group
-    auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infer_flag);
-    if (in_tensor == nullptr) {
-      delete new_conv_parameter;
-      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-      MS_LOG(ERROR) << "create input tensor failed.";
-      return nullptr;
-    }
-    new_inputs.emplace_back(in_tensor);
-
-    // create new weight
-    auto filter_tensor = CreateConstTensorFp16(inputs.at(kWeightIndex), filter_shape, i);
-    if (filter_tensor == nullptr) {
-      delete new_conv_parameter;
-      FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-      MS_LOG(ERROR) << "create filter tensor failed.";
+    auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetSingleConv for fp16 group conv failed.";
       return nullptr;
     }
-    new_inputs.emplace_back(filter_tensor);
-
-    // if has bias, create new bias
-    if (inputs.size() == 3) {
-      auto bias_tensor = CreateConstTensorFp16(inputs.at(kBiasIndex), bias_shape, i);
-      if (bias_tensor == nullptr) {
-        delete new_conv_parameter;
-        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-        MS_LOG(ERROR) << "create bias_tensor failed.";
-        return nullptr;
-      }
-      new_inputs.emplace_back(bias_tensor);
-    }
-
-    // create new output tensors
-    for (size_t j = 0; j < outputs.size(); ++j) {
-      auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infer_flag, j);
-      if (out_tensor == nullptr) {
-        delete new_conv_parameter;
-        FreeMemoryFp16(group_convs, new_inputs, new_outputs);
-        MS_LOG(ERROR) << "new out_tensor failed.";
-        return nullptr;
-      }
-      new_outputs.emplace_back(out_tensor);
-    }
-    group_convs.emplace_back(
-      CreateDelegateConvFp16(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx));
+    group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateFP16CPUKernel(
+      reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx));
   }
   return new (std::nothrow)
-    GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, group_convs, conv_param->group_);
+    GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()),
+                                  reinterpret_cast<ConvParameter *>(op_parameter)->group_);
 }
 
+/* creator func */
 kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                              const InnerContext *ctx, const kernel::KernelKey &desc) {
   MS_ASSERT(opParameter != nullptr);
   MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DFusion);
 
+  auto weight_data_type = inputs.at(1)->data_type();
+  TypeId bias_data_type = weight_data_type;
+  if (inputs.size() == 3) {
+    bias_data_type = inputs.at(2)->data_type();
+  }
+  if (weight_data_type != kNumberTypeFloat16 || bias_data_type != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "Convfp16 only support fp16 weight and fp16 bias.";
+    return nullptr;
+  }
   auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
   kernel::LiteKernel *kernel = nullptr;
-  bool is_depthwise =
-    (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_);
-
-  if (conv_param->group_ > 1 && !is_depthwise) {
-    kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx);
+  if (conv_param->group_ == 1) {
+    kernel = new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(opParameter, inputs, outputs, ctx);
+  } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
+    kernel = CpuConvDwFp16KernelCreator(inputs, outputs, opParameter, ctx);
   } else {
-    kernel = CreateDelegateConvFp16(inputs, outputs, opParameter, ctx);
+    kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx);
   }
 
   if (kernel == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
index bc686cc76e..eb1e722e84 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
@@ -29,11 +29,8 @@ namespace mindspore::kernel {
 class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
  public:
   ConvolutionDelegateFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
-                                   TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : LiteKernel(parameter, inputs, outputs, ctx),
-        origin_weight_data_type_(origin_weight_data_type),
-        origin_bias_data_type_(origin_bias_data_type) {}
+                                   const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : LiteKernel(parameter, inputs, outputs, ctx) {}
   ~ConvolutionDelegateFP16CPUKernel() override {
     FreeCopiedData();
     if (fp16_conv_kernel_ != nullptr) {
@@ -56,14 +53,11 @@ class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
   void *origin_weight_ = nullptr;
   void *origin_bias_ = nullptr;
   kernel::LiteKernel *fp16_conv_kernel_ = nullptr;
-  TypeId origin_weight_data_type_;
-  TypeId origin_bias_data_type_;
 };
 
 kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
                                             const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
-                                            const lite::InnerContext *ctx, void *origin_weight, void *origin_bias,
-                                            TypeId origin_weight_data_type, TypeId origin_bias_data_type);
+                                            const lite::InnerContext *ctx, void *origin_weight, void *origin_bias);
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index 521f5b1501..9446cfd10e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -36,23 +36,15 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int channel = weight_tensor->Batch();
   int pack_weight_size = channel * weight_tensor->Height() * weight_tensor->Width();
+  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());
 
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(weight_tensor, origin_weight_);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "get execute filter data failed.";
-    return ret;
-  }
-  PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+  PackNCHWToNHWCFp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
                      weight_tensor->Batch());
-  if (fp16_weight_ != nullptr) {
-    free(fp16_weight_);
-    fp16_weight_ = nullptr;
-  }
 
   bias_data_ = reinterpret_cast<float16_t *>(malloc(channel * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
@@ -60,14 +52,10 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(bias_data_, 0, channel * sizeof(float16_t));
-  auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
     auto bias_tensor = in_tensors_.at(kBiasIndex);
-    MS_ASSERT(origin_bias_);
-    auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_);
-    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
-      bias_fp16[i] = (float16_t)ori_bias[i];
-    }
+    auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+    memcpy(bias_data_, ori_bias, bias_tensor->Size());
   }
   return RET_OK;
 }
@@ -95,8 +83,13 @@ int ConvolutionDepthwiseFp16CPUKernel::ReSize() {
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) {
-  ConvDwFp16(execute_output_, execute_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_,
-             task_id);
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "Convolution depthwise Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
+  ConvDwFp16(output_ptr, input_ptr, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), conv_param_, task_id);
   return RET_OK;
 }
 
@@ -111,8 +104,6 @@ static int ConvDwFp16Run(void *cdata, int task_id) {
 }
 
 int ConvolutionDepthwiseFp16CPUKernel::Run() {
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
   auto ret = ParallelLaunch(this->context_->thread_pool_, ConvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
index 0f08247b83..674466cab0 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -32,15 +32,11 @@ void ConvDwFp16(float16_t *output_data, const float16_t *input_data, const float
 #endif
 
 namespace mindspore::kernel {
-class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   ConvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                                    const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-                                    void *origin_weight, void *origin_bias, TypeId origin_weight_data_type,
-                                    TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
-        origin_weight_(origin_weight),
-        origin_bias_(origin_bias) {}
+                                    const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
   ~ConvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
@@ -51,8 +47,6 @@ class ConvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int Execute(int task_id);
 
  private:
-  void *origin_weight_;  // do not free
-  void *origin_bias_;    // do not free
   float16_t *packed_weight_ = nullptr;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
index 02cdf2721c..a479709231 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc
@@ -62,14 +62,15 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
   int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
+  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());
 
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
   if (packed_weight_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1,
-                           weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());
+  PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
+                           weight_tensor->Batch());
 
   bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));
   if (bias_data_ == nullptr) {
@@ -77,14 +78,10 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
-  auto bias_fp16 = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == kInputSize2) {
     auto bias_tensor = in_tensors_.at(kBiasIndex);
-    MS_ASSERT(origin_bias_);
-    auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_);
-    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
-      bias_fp16[i] = (float16_t)ori_bias[i];
-    }
+    auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+    memcpy(bias_data_, ori_bias, bias_tensor->Size());
   }
 
   conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
@@ -143,14 +140,19 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
     return ret;
   }
 
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "Convolution depthwise Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
 
   if (need_align_) {
-    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+    PackNHWCToNHWC8Fp16(input_ptr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   } else {
-    packed_input_ = execute_input_;
-    packed_output_ = execute_output_;
+    packed_input_ = input_ptr;
+    packed_output_ = output_ptr;
   }
 
   ret = ParallelLaunch(this->context_->thread_pool_, ConvDwSWFp16Run, this, conv_param_->thread_num_);
@@ -158,7 +160,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() {
     MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]";
   }
   if (need_align_) {
-    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+    PackNHWC8ToNHWCFp16(packed_output_, output_ptr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
index 4dadf8ff28..51d4ab94f6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -33,15 +33,11 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
 #endif
 
 namespace mindspore::kernel {
-class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   ConvolutionDepthwiseSWFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                                      const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-                                      void *origin_weight, void *origin_bias, TypeId origin_weight_data_type,
-                                      TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
-        origin_weight_(origin_weight),
-        origin_bias_(origin_bias) {}
+                                      const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
   ~ConvolutionDepthwiseSWFp16CPUKernel() override;
 
   int Init() override;
@@ -54,8 +50,6 @@ class ConvolutionDepthwiseSWFp16CPUKernel : public ConvolutionBaseFP16CPUKernel
 
  private:
   void FreePackedInputOutput();
-  void *origin_weight_;  // do not free
-  void *origin_bias_;    // do not free
   SlidingWindowParam *sliding_ = nullptr;
   float16_t *packed_weight_ = nullptr;
   float16_t *packed_input_ = nullptr;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
index faca0d38aa..3e15eec4ff 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
@@ -45,8 +45,7 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
-  RowMajor2Col8MajorFp16(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane,
-                         origin_weight_data_type_ == kNumberTypeFloat32);
+  RowMajor2Col8MajorFp16(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false);
 
   // init bias
   bias_data_ = malloc(oc8 * sizeof(float16_t));
@@ -56,14 +55,7 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
   }
   memset(bias_data_, 0, oc8 * sizeof(float16_t));
   if (in_tensors_.size() == kInputSize2) {
-    if (origin_bias_data_type_ == kNumberTypeFloat16) {
-      memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
-    } else {
-      MS_LOG(ERROR) << "Conv fp16 only support fp16 bias";
-      return RET_ERROR;
-    }
-  } else {
-    MS_ASSERT(in_tensors_.size() == kInputSize1);
+    memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
   }
   return RET_OK;
 }
@@ -123,8 +115,14 @@ int ConvolutionFP16CPUKernel::ReSize() {
 }
 
 int ConvolutionFP16CPUKernel::RunImpl(int task_id) {
-  ConvFp16(execute_input_, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), col_major_input_,
-           execute_output_, task_id, conv_param_);
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "Convolution Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
+  ConvFp16(input_ptr, packed_input_, packed_weight_, reinterpret_cast<float16_t *>(bias_data_), col_major_input_,
+           output_ptr, task_id, conv_param_);
   return RET_OK;
 }
 
@@ -139,8 +137,6 @@ static int ConvolutionFp16Impl(void *cdata, int task_id) {
 }
 
 int ConvolutionFP16CPUKernel::Run() {
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
index f9c0859139..2750c8da60 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
@@ -20,15 +20,15 @@
 #include <arm_neon.h>
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 
 namespace mindspore::kernel {
-class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionFP16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                            const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, void *origin_weight,
-                           void *origin_bias, TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
+                           void *origin_bias)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
         origin_weight_(origin_weight),
         origin_bias_(origin_bias) {}
   ~ConvolutionFP16CPUKernel() override {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
index 0bded8410c..c7d0d89267 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -33,9 +33,9 @@ int ConvolutionWinogradFP16CPUKernel::WinogradFilterTransformFp16(const float16_
 }
 
 int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
-  auto filter_tensor = in_tensors_.at(kWeightIndex);
-  int in_channel = filter_tensor->Channel();
-  int out_channel = filter_tensor->Batch();
+  auto weight_tensor = in_tensors_.at(kWeightIndex);
+  int in_channel = weight_tensor->Channel();
+  int out_channel = weight_tensor->Batch();
   conv_param_->input_channel_ = in_channel;
   conv_param_->output_channel_ = out_channel;
   int oc_block_num = UP_DIV(out_channel, col_tile_);
@@ -65,21 +65,11 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
     MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
     return ret;
   }
-  ret = GetExecuteFilter(filter_tensor, origin_weight_);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "get execute filter failed.";
-    return ret;
-  }
-  ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, col_tile_);
+  ret = WinogradFilterTransformFp16(reinterpret_cast<float16_t *>(origin_weight_), matrix_g, matrix_gt, col_tile_);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "winograd filter transform failed.";
     return ret;
   }
-  // if fp16_weight is malloced, free it. It will not be used in runtime anymore.
-  if (fp16_weight_ != nullptr) {
-    free(fp16_weight_);
-    fp16_weight_ = nullptr;
-  }
 
   // init bias
   bias_data_ = malloc(oc_block_num * col_tile_ * sizeof(float16_t));
@@ -88,16 +78,8 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
     return RET_ERROR;
   }
   memset(bias_data_, 0, oc_block_num * col_tile_ * sizeof(float16_t));
-
   if (in_tensors_.size() == kInputSize2) {
-    if (origin_bias_data_type_ == kNumberTypeFloat16) {
-      memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
-    } else {
-      MS_LOG(ERROR) << "Conv winograd fp16 only support fp16 bias";
-      return RET_ERROR;
-    }
-  } else {
-    MS_ASSERT(in_tensors_.size() == kInputSize1);
+    memcpy(bias_data_, origin_bias_, out_channel * sizeof(float16_t));
   }
   return RET_OK;
 }
@@ -202,7 +184,13 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
 }
 
 int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) {
-  ConvWinogardFp16(execute_input_, trans_weight_, reinterpret_cast<const float16_t *>(bias_data_), execute_output_,
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "Convolution Winograd Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
+  ConvWinogardFp16(input_ptr, trans_weight_, reinterpret_cast<const float16_t *>(bias_data_), output_ptr,
                    tmp_buffer_address_list_, task_id, conv_param_, in_func_, out_func_);
   return RET_OK;
 }
@@ -218,8 +206,6 @@ static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) {
 }
 
 int ConvolutionWinogradFP16CPUKernel::Run() {
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
-
   auto ret = InitTmpBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Init tmp buffer failed.";
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
index e9793875df..6cf454e085 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
@@ -20,20 +20,19 @@
 #include <arm_neon.h>
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "nnacl/fp16/conv_fp16.h"
 #include "nnacl/fp16/winograd_utils_fp16.h"
 #include "src/common/utils.h"
 #include "nnacl/base/minimal_filtering_generator.h"
 
 namespace mindspore::kernel {
-class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                                    const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx, int out_unit,
-                                   void *origin_weight, void *origin_bias, TypeId origin_weight_data_type,
-                                   TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type),
+                                   void *origin_weight, void *origin_bias)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx),
         output_unit_(out_unit),
         origin_weight_(origin_weight),
         origin_bias_(origin_bias) {}
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
index 3c1200fb97..99f4f21892 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc
@@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = in_tensors_.at(kWeightIndex);
   int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
-  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData());
+  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());
   int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();
 
   packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
@@ -92,10 +92,8 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
   memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
   if (in_tensors_.size() == kInputSize2) {
     auto bias_tensor = in_tensors_.at(kBiasIndex);
-    auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
-    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
-      reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i];
-    }
+    auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+    memcpy(bias_data_, ori_bias, bias_tensor->Size());
   }
 
   conv_param_->thread_num_ = MSMIN(thread_count_, OC8);
@@ -157,18 +155,23 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
     return RET_ERROR;
   }
 
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "Deconvolution depthwise Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
 
   if (need_align_) {
-    PackNHWCToNHWC8Fp16(execute_input_, packed_input_, conv_param_->input_batch_,
+    PackNHWCToNHWC8Fp16(input_ptr, packed_input_, conv_param_->input_batch_,
                         conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   } else {
-    packed_input_ = execute_input_;
+    packed_input_ = input_ptr;
   }
 
   if (!need_align_) {
-    memset(execute_output_, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t));
-    packed_output_ = execute_output_;
+    memset(output_ptr, 0, out_tensors_.at(kOutputIndex)->ElementsNum() * sizeof(float16_t));
+    packed_output_ = output_ptr;
   }
   ret = ParallelLaunch(this->context_->thread_pool_, DeconvDwFp16Run, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
@@ -176,7 +179,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() {
   }
 
   if (need_align_) {
-    PackNHWC8ToNHWCFp16(packed_output_, execute_output_, conv_param_->output_batch_,
+    PackNHWC8ToNHWCFp16(packed_output_, output_ptr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
index 0d6dfdd87b..f9b1c20fc8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.h
@@ -19,7 +19,7 @@
 
 #include <vector>
 #include "src/lite_kernel.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 #include "nnacl/fp16/conv_depthwise_fp16.h"
 
 #ifdef __cplusplus
@@ -34,12 +34,11 @@ void ComputeStrides(int *shape, int *strides, int ndim);
 #endif
 
 namespace mindspore::kernel {
-class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class DeconvolutionDepthwiseFp16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   DeconvolutionDepthwiseFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                                      const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
-                                      TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {}
+                                      const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
   ~DeconvolutionDepthwiseFp16CPUKernel() override;
 
   int Init() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
index 353f172bed..b57bf7a37e 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc
@@ -32,9 +32,9 @@ DeConvolutionFp16CPUKernel::~DeConvolutionFp16CPUKernel() {
     delete matmul_param_;
     matmul_param_ = nullptr;
   }
-  if (execute_weight_ != nullptr) {
-    free(execute_weight_);
-    execute_weight_ = nullptr;
+  if (pack_weight_ != nullptr) {
+    free(pack_weight_);
+    pack_weight_ = nullptr;
   }
   return;
 }
@@ -78,17 +78,17 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() {
   }
 
   size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);
-  execute_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
-  if (execute_weight_ == nullptr) {
-    MS_LOG(ERROR) << "deconv malloc execute_weight_ error!";
+  pack_weight_ = reinterpret_cast<float16_t *>(malloc(weight_pack_size));
+  if (pack_weight_ == nullptr) {
+    MS_LOG(ERROR) << "deconv malloc pack_weight_ error!";
     return RET_ERROR;
   }
-  memset(execute_weight_, 0, weight_pack_size);
+  memset(pack_weight_, 0, weight_pack_size);
   if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) {
     MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight";
     return RET_ERROR;
   }
-  PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel,
+  PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), pack_weight_, input_channel,
                            kernel_w * kernel_h, output_channel);
   return RET_OK;
 }
@@ -169,7 +169,7 @@ int DeConvolutionFp16CPUKernel::DoDeconv(int task_id) {
   }
 
   auto tmp_buf = tmp_buffer_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->row_16_;
-  MatMulFp16(pack_input_, execute_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
+  MatMulFp16(pack_input_, pack_weight_ + task_id * thread_stride_ * C8NUM * kernel_plane_ * matmul_param_->deep_,
              tmp_buf, nullptr, ActType_No, matmul_param_->deep_, matmul_param_->row_, oc * C8NUM * kernel_plane_, 0,
              OutType_C8);
 
@@ -197,7 +197,12 @@ int DeConvolutionFp16CPUKernel::Init() {
 }
 
 int DeConvolutionFp16CPUKernel::Run() {
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "DeConvolution Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
 
   int error_code = InitRunBuf();
   if (error_code != RET_OK) {
@@ -207,8 +212,8 @@ int DeConvolutionFp16CPUKernel::Run() {
   }
 
   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-    batch_input_ = execute_input_ + batch_index * conv_param_->input_channel_ * input_plane_;
-    batch_output_ = execute_output_ + batch_index * conv_param_->output_channel_ * output_plane_;
+    batch_input_ = input_ptr + batch_index * conv_param_->input_channel_ * input_plane_;
+    batch_output_ = output_ptr + batch_index * conv_param_->output_channel_ * output_plane_;
 
     RowMajor2Col16MajorFp16Opt(batch_input_, pack_input_, input_plane_, conv_param_->input_channel_);
 
@@ -228,25 +233,17 @@ kernel::LiteKernel *CpuDeConvFp16KernelCreator(const std::vector<lite::Tensor *>
   MS_ASSERT(op_parameter != nullptr);
   MS_ASSERT(desc.type == schema::PrimitiveType_Conv2dTransposeFusion);
 
-  auto weight_data_type = inputs.at(1)->data_type();
-  TypeId bias_data_type = kTypeUnknown;
-  if (inputs.size() == 3) {
-    bias_data_type = inputs.at(2)->data_type();
-  }
   kernel::LiteKernel *kernel = nullptr;
   auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
   if (conv_param->group_ == 1) {
     if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
         (conv_param->dilation_h_ == 1 && conv_param->dilation_w_ == 1)) {
-      kernel = new (std::nothrow)
-        kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
+      kernel = new (std::nothrow) kernel::DeConvWinogradFp16CPUKernel(op_parameter, inputs, outputs, ctx);
     } else {
-      kernel = new (std::nothrow)
-        kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
+      kernel = new (std::nothrow) kernel::DeConvolutionFp16CPUKernel(op_parameter, inputs, outputs, ctx);
     }
   } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
-    kernel = new (std::nothrow)
-      DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx, weight_data_type, bias_data_type);
+    kernel = new (std::nothrow) DeconvolutionDepthwiseFp16CPUKernel(op_parameter, inputs, outputs, ctx);
   }
 
   if (kernel == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
index d1ecc46057..3256bb75ad 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.h
@@ -21,15 +21,14 @@
 #include "nnacl/fp16/deconv_fp16.h"
 #include "nnacl/fp16/matmul_fp16.h"
 #include "src/kernel_registry.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 
 namespace mindspore::kernel {
-class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class DeConvolutionFp16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   DeConvolutionFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
-                             TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {}
+                             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
   ~DeConvolutionFp16CPUKernel() override;
   int Init() override;
   int Run() override;
@@ -52,6 +51,7 @@ class DeConvolutionFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
   int thread_count_;
   int thread_stride_;
   float16_t *pack_input_;
+  float16_t *pack_weight_;
   float16_t *pack_output_;
   float16_t *tmp_buffer_;
   float16_t *batch_input_;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
index 9cbf54369b..b198df7218 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.cc
@@ -317,15 +317,11 @@ int DeConvWinogradFp16CPUKernel::InitComputeParam() {
 int DeConvWinogradFp16CPUKernel::InitDataParam() {
   /* unit data : weight & winograd data*/
   auto weight_tensor = in_tensors_.at(kWeightIndex);
-  auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter(weight_tensor, weight_tensor->data_c());
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Get Execute filter failed.";
-    return ret;
-  }
+  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->data_c());
 
   for (int i = 0; i < deconv_param_->compute_size_; i++) {
     DeConvComputeUnit *unit = &deconv_param_->compute_units_[i];
-    ret = PackDeConvWgDataFp16(execute_weight_, unit, conv_param_, deconv_param_);
+    auto ret = PackDeConvWgDataFp16(origin_weight, unit, conv_param_, deconv_param_);
     if (ret != RET_OK) {
       return ret;
     }
@@ -338,18 +334,11 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() {
     return RET_ERROR;
   }
   memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float16_t));
-  auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
   if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 &&
       in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) {
-    auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData());
-    MS_ASSERT(src_bias);
-    for (int i = 0; i < conv_param_->output_channel_; ++i) {
-      fp16_bias_data[i] = (float16_t)src_bias[i];
-    }
-  } else {
-    MS_ASSERT(in_tensors_.size() == kInputSize1);
+    auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->data_c());
+    memcpy(bias_data_, src_bias, in_tensors_.at(kBiasIndex)->Size());
   }
-
   return RET_OK;
 }
 
@@ -391,11 +380,16 @@ int DeConvWinogradFp16CPUKernel::Init() {
 }
 
 int DeConvWinogradFp16CPUKernel::Run() {
-  ConvolutionBaseFP16CPUKernel::GetExecuteTensor();
+  auto input_ptr = reinterpret_cast<float16_t *>(in_tensors_.at(0)->data_c());
+  auto output_ptr = reinterpret_cast<float16_t *>(out_tensors_.at(0)->data_c());
+  if (input_ptr == nullptr || output_ptr == nullptr) {
+    MS_LOG(ERROR) << "Deconvolution Winograd Fp16 get null tensor data!";
+    return RET_ERROR;
+  }
 
   for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
-    nhwc_input_ = execute_input_ + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_;
-    nhwc_output_ = execute_output_ + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;
+    nhwc_input_ = input_ptr + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_;
+    nhwc_output_ = output_ptr + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;
 
     ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float16_t));
     ParallelLaunch(this->context_->thread_pool_, DeConvWgFp16Run, this, deconv_param_->thread_num_);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h
index e099e91fc0..19f0d9c7df 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_winograd_fp16.h
@@ -22,15 +22,14 @@
 #include "nnacl/fp16/common_func_fp16.h"
 #include "nnacl/fp16/deconv_winograd_fp16.h"
 #include "nnacl/fp16/pack_fp16.h"
-#include "src/runtime/kernel/arm/fp16/convolution_base_fp16.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
 
 namespace mindspore::kernel {
-class DeConvWinogradFp16CPUKernel : public ConvolutionBaseFP16CPUKernel {
+class DeConvWinogradFp16CPUKernel : public ConvolutionBaseCPUKernel {
  public:
   DeConvWinogradFp16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
-                              const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
-                              TypeId origin_weight_data_type, TypeId origin_bias_data_type)
-      : ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, origin_weight_data_type, origin_bias_data_type) {}
+                              const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx) {}
   ~DeConvWinogradFp16CPUKernel() override;
   int Init() override;
   int Run() override;
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
index 4b39ff7c47..0564bd29ea 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
@@ -24,6 +24,7 @@
 #include "src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_fp32.h"
 #include "src/runtime/kernel/arm/fp32/convolution_depthwise_indirect_fp32.h"
 #include "src/runtime/kernel/arm/base/group_convolution_creator.h"
+#include "src/runtime/kernel/arm/base/group_convolution.h"
 #include "schema/model_generated.h"
 #include "include/errorcode.h"
 
@@ -161,9 +162,9 @@ kernel::LiteKernel *ConvolutionDelegateCPUKernel::CpuConvFp32KernelSelect() {
   return kernel;
 }
 
-kernel::LiteKernel *DispatchConvDw(const std::vector<lite::Tensor *> &inputs,
-                                   const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                   const InnerContext *ctx) {
+kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
+                                               const InnerContext *ctx) {
   auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
   kernel::LiteKernel *kernel = nullptr;
   if (opParameter != nullptr && opParameter->infer_flag_) {
@@ -187,6 +188,29 @@ kernel::LiteKernel *DispatchConvDw(const std::vector<lite::Tensor *> &inputs,
   return kernel;
 }
 
+kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
+                                                  const lite::InnerContext *ctx) {
+  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
+  GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, false, kNumberTypeFloat32);
+  group_conv_creator.SetShapeOfTensors();
+  for (int i = 0; i < conv_param->group_; ++i) {
+    ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
+    std::vector<lite::Tensor *> new_inputs;
+    std::vector<lite::Tensor *> new_outputs;
+    auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetSingleConv for fp32 group conv failed.";
+      return nullptr;
+    }
+    group_conv_creator.get_group_conv()->emplace_back(new (std::nothrow) ConvolutionDelegateCPUKernel(
+      reinterpret_cast<OpParameter *>(new_conv_param), new_inputs, new_outputs, ctx));
+  }
+  return new (std::nothrow)
+    GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()),
+                              reinterpret_cast<ConvParameter *>(op_parameter)->group_);
+}
+
 /* creator func */
 kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
@@ -200,7 +224,7 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &
   if (conv_param->group_ == 1) {
     kernel = new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, ctx);
   } else if (conv_param->group_ == conv_param->input_channel_ && conv_param->group_ == conv_param->output_channel_) {
-    kernel = DispatchConvDw(inputs, outputs, op_parameter, ctx);
+    kernel = CpuConvDwFp32KernelCreator(inputs, outputs, op_parameter, ctx);
   } else {
     kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx);
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8_creator.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8_creator.cc
index de91a2910d..e1b9e22584 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8_creator.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8_creator.cc
@@ -21,6 +21,7 @@
 #include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h"
 #include "src/runtime/kernel/arm/int8/convolution_depthwise_3x3_int8.h"
 #include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
+#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"
 #include "src/runtime/kernel/arm/base/group_convolution_creator.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
@@ -83,6 +84,29 @@ kernel::LiteKernel *CpuConvInt8KernelSelect(const std::vector<lite::Tensor *> &i
   return kernel;
 }
 
+kernel::LiteKernel *CpuGroupConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
+                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
+                                                  const lite::InnerContext *ctx, int group) {
+  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
+  GroupConvCreator group_conv_creator(inputs, outputs, op_parameter, ctx, true, kNumberTypeInt8);
+  group_conv_creator.SetShapeOfTensors();
+  for (int i = 0; i < conv_param->group_; ++i) {
+    ConvParameter *new_conv_param = CreateNewConvParameter(conv_param);
+    std::vector<lite::Tensor *> new_inputs;
+    std::vector<lite::Tensor *> new_outputs;
+    auto ret = group_conv_creator.GetSingleConvParam(new_conv_param, &new_inputs, &new_outputs, i);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "GetSingleConv for int8 group conv failed.";
+      return nullptr;
+    }
+    group_conv_creator.CopyQuantParam(&new_inputs);
+    group_conv_creator.get_group_conv()->emplace_back(
+      CpuConvInt8KernelSelect(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_param), ctx));
+  }
+  return new (std::nothrow)
+    GroupConvolutionInt8CPUKernel(op_parameter, inputs, outputs, ctx, *(group_conv_creator.get_group_conv()), group);
+}
+
 kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::Tensor *> &inputs,
                                              const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
                                              const InnerContext *ctx, const kernel::KernelKey &desc) {