
!11105 [MS][LITE][CPU] dispatch convolution op through delegate

From: @fuzhiye
Reviewed-by: @hangangqiang,@zhang_xue_tong
Signed-off-by: @zhang_xue_tong
tags/v1.2.0-rc1
mindspore-ci-bot committed 5 years ago (parent commit 3f03b83257)
23 changed files with 1108 additions and 929 deletions
  1. +0 -3 mindspore/lite/nnacl/int8/quantize.h
  2. +3 -14 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
  3. +7 -2 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h
  4. +418 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc
  5. +65 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h
  6. +10 -335 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc
  7. +6 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h
  8. +6 -27 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
  9. +8 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h
  10. +8 -12 mindspore/lite/src/runtime/kernel/arm/fp16/group_convolution_fp16.cc
  11. +27 -0 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc
  12. +3 -1 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.h
  13. +12 -15 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc
  14. +6 -2 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h
  15. +416 -0 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc
  16. +77 -0 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h
  17. +10 -319 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc
  18. +6 -12 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h
  19. +5 -17 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc
  20. +6 -2 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h
  21. +8 -12 mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.cc
  22. +1 -2 mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
  23. +0 -142 mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc

+0 -3 mindspore/lite/nnacl/int8/quantize.h

@@ -21,9 +21,6 @@
#include <limits.h>
#include "nnacl/op_base.h"

#define INPUT_ASYMMETRIC 0b001
#define FILTER_ASYMMETRIC 0b010
#define OUTPUT_ASYMMETRIC 0b100
#define INPUT_PER_CHANNEL 0b001
#define FILTER_PER_CHANNEL 0b010
#define OUTPUT_PER_CHANNEL 0b100
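// These are combinable bit flags: OR them together to describe which of the
// input / filter / output tensors are asymmetric or per-channel quantized.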


+3 -14 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc

@@ -93,13 +93,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
auto bias_tensor = in_tensors_.at(kBiasIndex);
if (bias_tensor->data_type() == kNumberTypeFloat16) {
memcpy(bias_data_, bias_tensor->MutableData(), output_channel * sizeof(float16_t));
} else {
Float32ToFloat16(reinterpret_cast<float *>(bias_tensor->MutableData()), reinterpret_cast<float16_t *>(bias_data_),
output_channel);
}
memcpy(bias_data_, fp16_bias_, output_channel * sizeof(float16_t));
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}

@@ -111,8 +105,7 @@ int Convolution1x1FP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
ColMajor2Row8MajorFp16(weight_tensor->MutableData(), weight_ptr_, input_channel, output_channel,
weight_tensor->data_type() == kNumberTypeFloat16);
ColMajor2Row8MajorFp16(fp16_weight_, weight_ptr_, input_channel, output_channel, true);
return RET_OK;
}

@@ -127,10 +120,7 @@ int Convolution1x1FP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {
@@ -143,7 +133,6 @@ void Convolution1x1FP16CPUKernel::FreeTmpBuffer() {

int Convolution1x1FP16CPUKernel::ReSize() {
FreeTmpBuffer();

auto ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";


+7 -2 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h

@@ -30,8 +30,11 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
Convolution1x1FP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float16_t *fp16_weight,
float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
fp16_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~Convolution1x1FP16CPUKernel() override;

int Init() override;
@@ -53,6 +56,8 @@ class Convolution1x1FP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
bool multi_thread_by_hw_ = false;
int thread_count_ = 1;
int thread_stride_ = 0;
float16_t *fp16_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *weight_ptr_ = nullptr;
float16_t *input_ptr_ = nullptr;
float16_t *pack_input_ = nullptr;


+418 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.cc

@@ -0,0 +1,418 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h"
#include <vector>
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
#include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/dequant.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2D;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {
void ConvolutionDelegateFP16CPUKernel::FreeCopiedData() {
if ((fp16_weight_ != nullptr) && (need_free_ & WEIGHT_NEED_FREE)) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if ((fp16_bias_ != nullptr) && (need_free_ & BIAS_NEED_FREE)) {
free(fp16_bias_);
fp16_bias_ = nullptr;
}
}

int ConvolutionDelegateFP16CPUKernel::GetFp16WeightAndBias() {
auto ret = GetFp16Weight();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Fp16 Weight failed.";
return RET_ERROR;
}

ret = GetFp16Bias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Fp16 Bias failed.";
return RET_ERROR;
}
return RET_OK;
}

int ConvolutionDelegateFP16CPUKernel::GetFp16Weight() {
auto weight_tensor = in_tensors_.at(kWeightIndex);
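// Reuse the tensor's own buffer only when it already holds fp16 data and
// shapes are inferred; otherwise make an fp16 copy (converting from fp32
// when necessary) and mark it to be freed later via need_free_.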
if (weight_tensor->data_type() == kNumberTypeFloat16 && InferShapeDone()) {
// no need to malloc new memory to store the origin data
fp16_weight_ = reinterpret_cast<float16_t *>(weight_tensor->data_c());
return RET_OK;
} else {
fp16_weight_ = CopyData(weight_tensor);
if (fp16_weight_ == nullptr) {
MS_LOG(ERROR) << "Generate fp16_weight failed.";
return RET_ERROR;
}
need_free_ = need_free_ | WEIGHT_NEED_FREE;
return RET_OK;
}
return RET_OK;
}

int ConvolutionDelegateFP16CPUKernel::GetFp16Bias() {
if (in_tensors_.size() == 3) {
// bias tensor is present
auto bias_tensor = in_tensors_.at(kBiasIndex);
if (bias_tensor->data_type() == kNumberTypeFloat16 && InferShapeDone()) {
// no need to malloc new memory to store the origin data
fp16_bias_ = reinterpret_cast<float16_t *>(bias_tensor->data_c());
return RET_OK;
} else {
fp16_bias_ = CopyData(bias_tensor);
if (fp16_bias_ == nullptr) {
MS_LOG(ERROR) << "Generate fp16_bias failed.";
return RET_ERROR;
}
need_free_ = need_free_ | BIAS_NEED_FREE;
return RET_OK;
}
}
return RET_OK;
}

float16_t *ConvolutionDelegateFP16CPUKernel::CopyData(lite::Tensor *tensor) {
auto data_type = tensor->data_type();
MS_ASSERT(data_type == kNumberTypeFloat32 || data_type == kNumberTypeFloat16);
auto fp16_data = reinterpret_cast<float16_t *>(malloc(tensor->ElementsNum() * sizeof(float16_t)));
if (fp16_data == nullptr) {
MS_LOG(ERROR) << "Malloc fp16_data failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat32) {
float *origin_data = reinterpret_cast<float *>(tensor->data_c());
for (size_t i = 0; i < tensor->ElementsNum(); ++i) {
fp16_data[i] = (float16_t)origin_data[i];
}
} else {
auto *origin_data = reinterpret_cast<float16_t *>(tensor->data_c());
memcpy(fp16_data, origin_data, tensor->Size());
}
return fp16_data;
}

int ConvolutionDelegateFP16CPUKernel::Init() {
auto ret = GetFp16WeightAndBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get fp16 weight and bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int ConvolutionDelegateFP16CPUKernel::ReSize() {
// Update shape info of input and output
SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(), out_tensors_.front(),
context_);
if (fp16_conv_kernel_ == nullptr) {
fp16_conv_kernel_ =
CpuConvFp16KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, primitive_, fp16_weight_, fp16_bias_);
if (fp16_conv_kernel_ == nullptr) {
MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
return RET_ERROR;
}
}
// the copied weight and bias are no longer used; free them.
FreeCopiedData();
return fp16_conv_kernel_->ReSize();
}

ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) {
auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float16_t *fp16_weight, float16_t *fp16_bias) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
bool use_winograd = false;
int out_unit;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
kernel::LiteKernel *kernel = nullptr;
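// Dispatch: 1x1 kernels take the dedicated 1x1 path, winograd-eligible
// shapes the winograd kernel, everything else the generic kernel.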
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
kernel = new (std::nothrow)
kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, fp16_weight, fp16_bias);
} else if (use_winograd) {
kernel = new (std::nothrow) kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive,
out_unit, fp16_weight, fp16_bias);
} else {
kernel = new (std::nothrow)
kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, fp16_weight, fp16_bias);
}
// Once a kernel is selected, its Init() will invoke InitWeightBias().
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "kernel init failed.";
delete kernel;
return nullptr;
}
return kernel;
}

void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

static lite::Tensor *CreateInputTensorFp16(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

static lite::Tensor *CreateConstTensorFp16(lite::Tensor *tensor, const std::vector<int> &shape, const int index) {
auto new_tensor =
new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (new_tensor == nullptr) {
MS_LOG(ERROR) << "Create new_tensor failed.";
return nullptr;
}
auto ret = new_tensor->MallocData();
if (ret != RET_OK) {
delete new_tensor;
MS_LOG(ERROR) << "Malloc new_tensor failed.";
return nullptr;
}
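// copy the index-th per-group slice of the original constant data
// (weight or bias) into the new tensor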
memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(),
new_tensor->Size());
return new_tensor;
}

static lite::Tensor *CreateOutputTensorFp16(const std::vector<int> &out_shape,
const std::vector<lite::Tensor *> &outputs, bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(mindspore::kNumberTypeFloat16);
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CreateDelegateConvFp16(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) {
return new (std::nothrow) kernel::ConvolutionDelegateFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) {
bool infer_flag = (primitive != nullptr && primitive->infer_flag());
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
// update new shape info for each sub kernel
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (conv_param->group_ == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_;
}

std::vector<int> in_shape;
std::vector<int> out_shape;
if (infer_flag) {
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// new group conv op
std::vector<kernel::LiteKernel *> group_convs;
// create tensors for every sub conv kernel
for (int i = 0; i < conv_param->group_; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameterFp16(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}
// create new input for each group
auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infer_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
auto filter_tensor = CreateConstTensorFp16(inputs.at(kWeightIndex), filter_shape, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

// if has bias, create new bias
if (inputs.size() == 3) {
auto bias_tensor = CreateConstTensorFp16(inputs.at(kBiasIndex), bias_shape, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensors
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infer_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CreateDelegateConvFp16(
new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx, primitive));
}
return new (std::nothrow)
GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, conv_param->group_);
}
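The shape arithmetic above splits one grouped convolution into group_ independent sub-convolutions, each seeing 1/group of the input and output channels. A self-contained illustration of the split (hypothetical helper, not code from this PR):

#include <array>

// NHWC activations, OHWI filters. in_c is the full input channel count,
// weight_batch the filter tensor's batch (= total output channels).
struct SubConvShapes {
  std::array<int, 4> in;      // input slice for one group
  std::array<int, 4> filter;  // filter slice for one group
  std::array<int, 4> out;     // output slice for one group
};

SubConvShapes SplitGroup(int batch, int in_h, int in_w, int in_c, int out_h,
                         int out_w, int weight_batch, int kernel_h,
                         int kernel_w, int group) {
  const int sub_in_c = in_c / group;           // e.g. 16 channels, group 4 -> 4
  const int sub_out_c = weight_batch / group;  // output channels split the same way
  return {{batch, in_h, in_w, sub_in_c},
          {sub_out_c, kernel_h, kernel_w, sub_in_c},
          {batch, out_h, out_w, sub_out_c}};
}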

kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);

auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(opParameter);
return nullptr;
}
weight_tensor->set_data_type(kNumberTypeFloat32);
weight_tensor->set_data(dequant_weight);
}
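// weight_tensor now temporarily holds the fp32 dequantized copy; the
// original quantized buffer and data type are restored after the kernel
// is created below.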

auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
kernel::LiteKernel *kernel = nullptr;
if (conv_param->group_ == 1) {
kernel = CreateDelegateConvFp16(inputs, outputs, opParameter, ctx, primitive);
} else {
kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx, primitive);
}

if (kernel == nullptr) {
MS_LOG(DEBUG) << "Create conv fp16 kernel failed.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(opParameter);
return nullptr;
}

auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(INFO) << "Init fp16 kernel failed, name: " << opParameter->name_
<< ", type: " << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}

if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2D, CpuConvFp16KernelCreator)
} // namespace mindspore::kernel

+65 -0 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_delegate_fp16.h

@@ -0,0 +1,65 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_

#include <arm_neon.h>
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"

#define WEIGHT_NEED_FREE 0b01
#define BIAS_NEED_FREE 0b10
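// need_free_ bit flags: record which of the copied fp16 buffers the
// delegate owns and must release in FreeCopiedData().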

namespace mindspore::kernel {
class ConvolutionDelegateFP16CPUKernel : public LiteKernel {
public:
ConvolutionDelegateFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDelegateFP16CPUKernel() override {
FreeCopiedData();
if (fp16_conv_kernel_ != nullptr) {
op_parameter_ = nullptr; // set op_parameter of delegate to nullptr, avoiding double free
delete fp16_conv_kernel_;
fp16_conv_kernel_ = nullptr;
}
}
int GetFp16WeightAndBias();
int GetFp16Weight();
int GetFp16Bias();
float16_t *CopyData(lite::Tensor *tensor);
void FreeCopiedData();
int Init() override;
int ReSize() override;
int Run() override { return fp16_conv_kernel_->Run(); }

private:
uint8_t need_free_ = 0b00;
kernel::LiteKernel *fp16_conv_kernel_ = nullptr;
float16_t *fp16_weight_ = nullptr;
float16_t *fp16_bias_ = nullptr;
};

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float16_t *fp16_weight, float16_t *fp16_bias);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP16_CONVOLUTION_DELEGATE_FP16_H_
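Together, the .cc and .h above implement the dispatch this PR is named for: the registered Conv2D kernel is now a thin delegate that caches the origin weight/bias and postpones choosing the concrete convolution kernel (1x1, winograd, or generic) until ReSize, when shapes are known. A minimal sketch of the pattern, with illustrative names (Kernel, ConvDelegate, SelectKernel) rather than the real MindSpore classes:

#include <memory>

struct Kernel {
  virtual ~Kernel() = default;
  virtual int ReSize() = 0;
  virtual int Run() = 0;
};

// Stand-in for the three concrete kernels (1x1 / winograd / generic).
struct GenericConv : Kernel {
  int ReSize() override { return 0; }
  int Run() override { return 0; }
};

struct ConvShape {
  int kernel_h = 3;
  int kernel_w = 3;
  bool winograd_ok = false;
};

std::unique_ptr<Kernel> SelectKernel(const ConvShape &s) {
  if (s.kernel_h == 1 && s.kernel_w == 1) {
    return std::make_unique<GenericConv>();  // Convolution1x1 in the real code
  }
  if (s.winograd_ok) {
    return std::make_unique<GenericConv>();  // ConvolutionWinograd in the real code
  }
  return std::make_unique<GenericConv>();    // generic im2col kernel
}

struct ConvDelegate : Kernel {
  ConvShape shape;                // reliable only after shape inference
  std::unique_ptr<Kernel> inner;  // chosen lazily, exactly once
  int ReSize() override {
    if (inner == nullptr) inner = SelectKernel(shape);
    return inner->ReSize();       // later resizes reuse the same kernel
  }
  int Run() override { return inner->Run(); }
};

The payoff is that Init() no longer has to guess: models whose shapes are only known at execution time still get the right specialized kernel, and the temporary fp16 copies of weight and bias can be freed as soon as the chosen kernel has repacked them.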

+10 -335 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc

@@ -16,19 +16,16 @@

#include "src/runtime/kernel/arm/fp16/convolution_fp16.h"
#include <vector>
#include "src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h"
#include "src/runtime/kernel/arm/fp16/convolution_1x1_fp16.h"
#include "src/runtime/kernel/arm/fp16/group_convolution_fp16.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "src/runtime/kernel/arm/fp16/layout_transform_fp16.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "nnacl/fp16/winograd_utils_fp16.h"
#include "src/runtime/kernel/arm/base/dequant.h"
#include "nnacl/fp16/conv_fp16.h"
#include "nnacl/fp16/matmul_fp16.h"
#include "nnacl/fp16/cast_fp16.h"
#include "nnacl/fp16/pack_fp16.h"
#include "nnacl/fp16/winograd_utils_fp16.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
@@ -49,23 +46,13 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
int pack_weight_size = oc8 * in_channel * kernel_plane;

// init weight
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}
packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "malloc packed_weight_ failed.";
return RET_ERROR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float16_t));
RowMajor2Col8MajorFp16(execute_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false);
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
execute_weight_ = nullptr;
}
RowMajor2Col8MajorFp16(fp16_weight_, packed_weight_, out_channel, in_channel * kernel_plane, false);

// init bias
bias_data_ = malloc(oc8 * sizeof(float16_t));
@@ -74,12 +61,9 @@ int ConvolutionFP16CPUKernel::InitWeightBias() {
return RET_ERROR;
}
memset(bias_data_, 0, oc8 * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
for (int i = 0; i < out_channel; ++i) {
fp16_bias_data[i] = (float16_t)ori_bias[i];
}
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
memcpy(fp16_bias_data, fp16_bias_, out_channel * sizeof(float16_t));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -111,10 +95,7 @@ int ConvolutionFP16CPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionFP16CPUKernel::ReSize() {
@@ -123,7 +104,6 @@ int ConvolutionFP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init fail!ret: " << ret;
@@ -173,309 +153,4 @@ int ConvolutionFP16CPUKernel::Run() {
FreeTmpBuffer();
return ret;
}

ConvParameter *CreateNewConvParameterFp16(ConvParameter *parameter) {
auto conv_parameter = reinterpret_cast<ConvParameter *>(malloc(sizeof(ConvParameter)));
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

kernel::LiteKernel *CpuConvFp16KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
bool use_winograd, int out_unit) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
return new (std::nothrow) kernel::Convolution1x1FP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
} else if (use_winograd) {
return new (std::nothrow)
kernel::ConvolutionWinogradFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
} else {
return new (std::nothrow) kernel::ConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}
return nullptr;
}

void FreeMemoryFp16(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

lite::Tensor *CreateInputTensorFp16(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

lite::Tensor *CreateFilterTensorFp16(TypeId data_type, std::vector<int> filter_shape,
const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
auto filter_tensor =
new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (filter_tensor == nullptr) {
MS_LOG(ERROR) << "new filter_tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete filter_tensor;
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat16) {
auto *origin_weight = reinterpret_cast<float16_t *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float16_t));
} else {
MS_ASSERT(data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
}
return filter_tensor;
}

lite::Tensor *CreateBiasTensorFp16(TypeId data_type, std::vector<int> bias_shape,
const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_tensor =
new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "new bias_tensor failed.";
return nullptr;
}
auto ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete bias_tensor;
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
if (data_type == kNumberTypeFloat16) {
auto bias_data = reinterpret_cast<float16_t *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float16_t));
} else {
MS_ASSERT(data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));
}
return bias_tensor;
}

lite::Tensor *CreateOutputTensorFp16(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(mindspore::kNumberTypeFloat16);
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CpuGroupConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
int group) {
int out_unit;
bool has_bias = inputs.size() == 3;
bool use_winograd = false;
bool infered_flag = (primitive != nullptr && primitive->infer_flag());
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);

// update new shape info for each sub kernel
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (group == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
}

std::vector<int> in_shape;
std::vector<int> out_shape;
int batch = inputs.front()->Batch();
conv_param->input_batch_ = batch;
conv_param->output_batch_ = batch;
if (infered_flag) {
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
in_shape = {batch, inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// new group conv op
std::vector<kernel::LiteKernel *> group_convs;
// create tensors for every sub conv kernel
for (int i = 0; i < group; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameterFp16(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}
// create new input for each group
auto in_tensor = CreateInputTensorFp16(mindspore::kNumberTypeFloat16, in_shape, infered_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
auto filter_tensor =
CreateFilterTensorFp16(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

// if has bias, create new bias
if (has_bias) {
auto bias_tensor =
CreateBiasTensorFp16(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensors
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensorFp16(out_shape, outputs, infered_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemoryFp16(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CpuConvFp16KernelSelect(new_inputs, new_outputs,
reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
primitive, use_winograd, out_unit));
}

return new (std::nothrow)
GroupConvolutionFP16CPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

kernel::LiteKernel *CpuConvFp16KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);

auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(opParameter);
return nullptr;
}
weight_tensor->set_data_type(kNumberTypeFloat32);
weight_tensor->set_data(dequant_weight);
}

auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
bool use_winograd = false;
int out_unit;
if (primitive != nullptr && primitive->infer_flag()) {
conv_param->input_h_ = inputs.front()->Height();
conv_param->input_w_ = inputs.front()->Width();
conv_param->input_channel_ = inputs.front()->Channel();
conv_param->output_h_ = outputs.front()->Height();
conv_param->output_w_ = outputs.front()->Width();
conv_param->output_channel_ = outputs.front()->Channel();
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
CheckIfUseWinogradFp16(&use_winograd, &out_unit, conv_param);
}
int group = conv_param->group_;
kernel::LiteKernel *kernel = nullptr;
if (group == 1) {
kernel = CpuConvFp16KernelSelect(inputs, outputs, opParameter, ctx, primitive, use_winograd, out_unit);
} else {
kernel = CpuGroupConvFp16KernelCreator(inputs, outputs, opParameter, ctx, primitive, group);
}

if (kernel == nullptr) {
MS_LOG(DEBUG) << "Create conv fp16 kernel failed.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(INFO) << "Init fp16 kernel failed, name: " << opParameter->name_
<< ", type: " << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
return kernel;
}
REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_Conv2D, CpuConvFp16KernelCreator)
} // namespace mindspore::kernel

+6 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.h

@@ -27,13 +27,11 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float16_t *fp16_weight, float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
fp16_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~ConvolutionFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
@@ -58,6 +56,8 @@ class ConvolutionFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
col_major_input_ = nullptr;
}
}
float16_t *fp16_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *packed_input_ = nullptr;
float16_t *packed_weight_ = nullptr;
float16_t *col_major_input_ = nullptr;


+6 -27 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc

@@ -43,12 +43,6 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
int oc_block_num = UP_DIV(out_channel, C8NUM);

// init weight
auto ret = ConvolutionBaseFP16CPUKernel::GetExecuteFilter();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get Execute filter failed.";
return ret;
}

// set data
auto trans_matrix_data_size = input_unit_ * input_unit_ * in_channel * oc_block_num * oc_block * sizeof(float16_t);
trans_weight_ = reinterpret_cast<float16_t *>(malloc(trans_matrix_data_size));
@@ -68,21 +62,17 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
if (input_unit_ == 8) {
coef = 0.5f;
}
ret = CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, coef, output_unit_, kernel_unit_);
auto ret =
CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, coef, output_unit_, kernel_unit_);
if (ret != RET_OK) {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
ret = WinogradFilterTransformFp16(execute_weight_, matrix_g, matrix_gt, oc_block);
ret = WinogradFilterTransformFp16(fp16_origin_weight_, matrix_g, matrix_gt, oc_block);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
}
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
execute_weight_ = nullptr;
}

// init bias
bias_data_ = malloc(oc_block_num * oc_block * sizeof(float16_t));
@@ -93,10 +83,7 @@ int ConvolutionWinogradFP16CPUKernel::InitWeightBias() {
memset(bias_data_, 0, oc_block_num * oc_block * sizeof(float16_t));
auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
for (int i = 0; i < out_channel; ++i) {
fp16_bias_data[i] = (float16_t)ori_bias[i];
}
memcpy(fp16_bias_data, fp16_bias_, out_channel * sizeof(float16_t));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -163,15 +150,13 @@ int ConvolutionWinogradFP16CPUKernel::Init() {
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;

auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionWinogradFP16CPUKernel::ReSize() {
@@ -180,17 +165,11 @@ int ConvolutionWinogradFP16CPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
}
kernel_unit_ = conv_param_->kernel_h_;
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;

ret = ConfigInputOutput();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConfigInputOutput failed.";


+8 -6 mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.h

@@ -31,13 +31,13 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
public:
ConvolutionWinogradFP16CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive, int out_unit)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive), output_unit_(out_unit) {}
const mindspore::lite::PrimitiveC *primitive, int out_unit, float16_t *fp16_weight,
float16_t *fp16_bias)
: ConvolutionBaseFP16CPUKernel(parameter, inputs, outputs, ctx, primitive),
output_unit_(out_unit),
fp16_origin_weight_(fp16_weight),
fp16_bias_(fp16_bias) {}
~ConvolutionWinogradFP16CPUKernel() override {
if (fp16_weight_ != nullptr) {
free(fp16_weight_);
fp16_weight_ = nullptr;
}
if (trans_weight_ != nullptr) {
free(trans_weight_);
trans_weight_ = nullptr;
@@ -75,6 +75,8 @@ class ConvolutionWinogradFP16CPUKernel : public ConvolutionBaseFP16CPUKernel {
int kernel_unit_;
int input_unit_;
int output_unit_;
float16_t *fp16_origin_weight_; // do not free
float16_t *fp16_bias_; // do not free
float16_t *tmp_data_ = nullptr;
float16_t *trans_input_ = nullptr;
float16_t *gemm_out_ = nullptr;


+8 -12 mindspore/lite/src/runtime/kernel/arm/fp16/group_convolution_fp16.cc

@@ -87,11 +87,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
std::vector<int> out_shape;
for (int i = 0; i < group_num_; ++i) {
// in
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_c = conv_param_->input_channel_;
in_shape = {in_batch, in_h, in_w, in_c};
auto in_tensor = in_tensors_.front();
in_shape = {in_tensor->Batch(), in_tensor->Height(), in_tensor->Width(), conv_param_->input_channel_};
auto sub_kernel_in_tensor = group_convs_.at(i)->in_tensors().front();
sub_kernel_in_tensor->set_shape(in_shape);
ret = sub_kernel_in_tensor->MallocData();
@@ -101,11 +98,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {
return ret;
}
// out
int out_batch = conv_param_->output_batch_;
int out_h = conv_param_->output_h_;
int out_w = conv_param_->output_w_;
int out_c = conv_param_->output_channel_;
out_shape = {out_batch, out_h, out_w, out_c};
auto out_tensor = out_tensors_.front();
out_shape = {out_tensor->Batch(), out_tensor->Height(), out_tensor->Width(), conv_param_->output_channel_};
auto sub_kernel_out_tensors = group_convs_[i]->out_tensors();
for (auto tensor : sub_kernel_out_tensors) {
tensor->set_shape(out_shape);
@@ -139,7 +133,8 @@ int GroupConvolutionFP16CPUKernel::PreProcess() {

int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {
// input may either be float32 or float16
int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
auto in_tensor = in_tensors_.front();
int in_plane = in_tensor->Height() * in_tensor->Width() * in_tensor->Batch();
int sub_in_channel = conv_param_->input_channel_;
int ori_in_channel = sub_in_channel * group_num_;
auto sub_in_data = group_convs_.at(group_id)->in_tensors().front()->data_c();
@@ -179,7 +174,8 @@ int GroupConvolutionFP16CPUKernel::SeparateInput(int group_id) {

void GroupConvolutionFP16CPUKernel::PostConcat(int group_id) {
// output is must float16 data type
int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
auto out_tensor = out_tensors_.front();
int out_plane = out_tensor->Height() * out_tensor->Width() * out_tensor->Batch();
int sub_out_channel = conv_param_->output_channel_;
int ori_out_channel = sub_out_channel * group_num_;
auto sub_out_data = reinterpret_cast<float16_t *>(group_convs_.at(group_id)->out_tensors().front()->data_c());


+27 -0 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.cc

@@ -31,6 +31,33 @@ using mindspore::schema::PrimitiveType_Adder;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {
int AdderCPUKernel::Init() {
auto ret = InitWeightBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int AdderCPUKernel::ReSize() {
auto ret = ConvolutionBaseCPUKernel::CheckResizeValid();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return ret;
}
return RET_OK;
}

int AdderCPUKernel::InitWeightBias() {
auto filter_tensor = in_tensors_.at(kWeightIndex);
int kernel_h = filter_tensor->Height();


+3 -1 mindspore/lite/src/runtime/kernel/arm/fp32/adder_fp32.h

@@ -29,10 +29,12 @@ class AdderCPUKernel : public ConvolutionCPUKernel {
AdderCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
: ConvolutionCPUKernel(parameter, inputs, outputs, ctx, primitive, nullptr, nullptr) {}
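// Adder overrides InitWeightBias() and reads the weight/bias tensors
// directly, so it passes nullptr for the new origin-data parameters.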
~AdderCPUKernel() override = default;

int InitWeightBias() override;
int Init() override;
int ReSize() override;
int Run() override;
int RunImpl(int task_id) override;
};


+12 -15 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.cc

@@ -44,10 +44,13 @@ void Convolution1x1CPUKernel::FreeTmpBuffer() {

int Convolution1x1CPUKernel::ReSize() {
FreeTmpBuffer();
ConvolutionBaseCPUKernel::Init();
auto error_code = ConvolutionBaseCPUKernel::Init();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "conv base init failed.";
return error_code;
}
InitConv1x1MatmulParam();

int error_code = InitConv1x1Param();
error_code = InitConv1x1Param();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
return error_code;
@@ -95,7 +98,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
MS_LOG(ERROR) << "Conv1x1 Malloc bias_ptr_ error!";
return RET_ERROR;
}
memcpy(bias_data_, in_tensors_[kBiasIndex]->MutableData(), weight_size);
memcpy(bias_data_, origin_bias_, weight_size);
memset(reinterpret_cast<char *>(bias_data_) + weight_size, 0, size - weight_size);
}

@@ -108,14 +111,11 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
}
memset(reinterpret_cast<char *>(weight_ptr_) + down_size, 0, size - down_size);
#ifdef ENABLE_AVX
RowMajor2Col16Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col16Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#elif defined(ENABLE_ARM32)
RowMajor2Col4Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col4Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#else
RowMajor2Col8Major(reinterpret_cast<float *>(filter_tensor->MutableData()), weight_ptr_, output_channel,
input_channel);
RowMajor2Col8Major(origin_weight_, weight_ptr_, output_channel, input_channel);
#endif
return RET_OK;
}
@@ -153,13 +153,10 @@ int Convolution1x1CPUKernel::Init() {
}
int error_code = InitConv1x1BiasWeight();
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Convolution base init failed.";
MS_LOG(ERROR) << "Convolution1x1 init weight and bias failed.";
return error_code;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

void Convolution1x1CPUKernel::PackMatmulInput(const float *src_ptr, float *dst_ptr, int row, int col) {


+6 -2 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h

@@ -34,8 +34,10 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
public:
Convolution1x1CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float *origin_weight, float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~Convolution1x1CPUKernel();
int Init() override;
int Run() override;
@@ -58,6 +60,8 @@ class Convolution1x1CPUKernel : public ConvolutionBaseCPUKernel {
bool multi_thread_by_hw_ = false;
int thread_count_ = 0;
int thread_stride_ = 0;
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *weight_ptr_ = nullptr;
float *pack_input_ = nullptr;
float *input_ptr_ = nullptr;


+416 -0 mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.cc

@@ -0,0 +1,416 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h"
#include "src/runtime/kernel/arm/fp32/group_convolution_fp32.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/dequant.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INFER_INVALID;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2D;
using mindspore::schema::Format::Format_NHWC;

namespace mindspore::kernel {
float *ConvolutionDelegateCPUKernel::CopyData(lite::Tensor *tensor) {
auto data = reinterpret_cast<float *>(malloc(tensor->Size()));
if (data == nullptr) {
MS_LOG(ERROR) << "Malloc data failed.";
return nullptr;
}
memcpy(data, tensor->data_c(), tensor->Size());
return data;
}

void ConvolutionDelegateCPUKernel::FreeCopiedData() {
if (origin_weight_ != nullptr && need_free_weight_) {
free(origin_weight_);
origin_weight_ = nullptr;
}
if (origin_bias_ != nullptr && need_free_bias_) {
free(origin_bias_);
origin_bias_ = nullptr;
}
}

int ConvolutionDelegateCPUKernel::GetWeightAndBias() {
auto ret = GetWeightData();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get weight data failed.";
return ret;
}
ret = GetBiasData();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get bias data failed.";
return ret;
}
return RET_OK;
}

int ConvolutionDelegateCPUKernel::GetWeightData() {
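// Same rule as the fp16 delegate: use the tensor's buffer directly once
// shapes are inferred, otherwise keep a private copy until the concrete
// kernel has repacked it.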
if (InferShapeDone()) {
origin_weight_ = reinterpret_cast<float *>(in_tensors_.at(kWeightIndex)->data_c());
return RET_OK;
} else {
origin_weight_ = CopyData(in_tensors_.at(kWeightIndex));
if (origin_weight_ == nullptr) {
MS_LOG(ERROR) << "Copy weight data failed.";
return RET_ERROR;
}
need_free_weight_ = true;
return RET_OK;
}
return RET_OK;
}

int ConvolutionDelegateCPUKernel::GetBiasData() {
if (in_tensors_.size() == 3) {
if (InferShapeDone()) {
origin_bias_ = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
return RET_OK;
} else {
origin_bias_ = CopyData(in_tensors_.at(kBiasIndex));
if (origin_bias_ == nullptr) {
MS_LOG(ERROR) << "Copy bias data failed.";
return RET_ERROR;
}
need_free_bias_ = true;
return RET_OK;
}
}
return RET_OK;
}

int ConvolutionDelegateCPUKernel::Init() {
auto ret = GetWeightAndBias();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Get weight and bias failed.";
return ret;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int ConvolutionDelegateCPUKernel::ReSize() {
// Update shape info of input and output
SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_), in_tensors_.front(), out_tensors_.front(),
context_);
if (conv_kernel_ == nullptr) {
// need to select actual execute kernel here
conv_kernel_ = CpuConvFp32KernelSelect(in_tensors_, out_tensors_, op_parameter_, context_, primitive_,
origin_weight_, origin_bias_);
if (conv_kernel_ == nullptr) {
MS_LOG(ERROR) << "Selecting execute kernel failed for conv_kernel, got a nullptr.";
return RET_ERROR;
}
}
FreeCopiedData();
return conv_kernel_->ReSize();
}

void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output,
const InnerContext *ctx) {
conv_param->input_batch_ = input->Batch();
conv_param->input_h_ = input->Height();
conv_param->input_w_ = input->Width();
conv_param->input_channel_ = input->Channel();
conv_param->output_batch_ = output->Batch();
conv_param->output_h_ = output->Height();
conv_param->output_w_ = output->Width();
conv_param->output_channel_ = output->Channel();
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
}

ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
auto conv_parameter = new (std::nothrow) ConvParameter;
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

lite::Tensor *CreateInputTensor(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

// weight and bias are const
static lite::Tensor *CreateConstTensorFp32(lite::Tensor *tensor, const std::vector<int> &shape, const int index) {
auto new_tensor =
new (std::nothrow) lite::Tensor(tensor->data_type(), shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (new_tensor == nullptr) {
MS_LOG(ERROR) << "Create new_tensor failed.";
return nullptr;
}
auto ret = new_tensor->MallocData();
if (ret != RET_OK) {
delete new_tensor;
MS_LOG(ERROR) << "Malloc new_tensor failed.";
return nullptr;
}
MS_ASSERT(tensor->data_type() == kNumberTypeFloat32);
memcpy(new_tensor->data_c(), reinterpret_cast<char *>(tensor->data_c()) + index * new_tensor->Size(),
new_tensor->Size());
return new_tensor;
}

lite::Tensor *CreateOutputTensor(const std::vector<int> &out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(outputs.at(index)->data_type());
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float *origin_weight, float *origin_bias) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
bool use_winograd = false;
int out_unit;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
kernel::LiteKernel *kernel = nullptr;
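// Same dispatch as the fp16 path: 1x1, winograd-eligible, or generic.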
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
kernel = new (std::nothrow)
kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive, origin_weight, origin_bias);
} else if (use_winograd) {
kernel = new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive,
out_unit, origin_weight, origin_bias);
} else {
kernel = new (std::nothrow)
kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, origin_weight, origin_bias);
}
if (kernel != nullptr) {
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "conv kernel init failed.";
delete kernel;
return nullptr;
}
}
return kernel;
}

static kernel::LiteKernel *CreateDelegateConv(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) {
return new (std::nothrow) kernel::ConvolutionDelegateCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}

kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive) {
bool infer_flag = primitive != nullptr && primitive->infer_flag();
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel;
if (conv_param->group_ == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / conv_param->group_;
}
std::vector<int> in_shape;
std::vector<int> out_shape;
if (infer_flag) {
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
in_shape = {inputs.front()->Batch(), inputs.front()->Height(), inputs.front()->Width(), new_in_channel};
out_shape = {inputs.front()->Batch(), outputs.front()->Height(), outputs.front()->Width(), new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// create sub kernels
std::vector<kernel::LiteKernel *> group_convs;
for (int i = 0; i < conv_param->group_; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameter(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}

// create new input for each group
auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infer_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
auto filter_tensor = CreateConstTensorFp32(inputs.at(kWeightIndex), filter_shape, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

    // if a bias exists, create a new bias tensor for this group
if (inputs.size() == 3) {
auto bias_tensor = CreateConstTensorFp32(inputs.at(kBiasIndex), bias_shape, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensor
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensor(out_shape, outputs, infer_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(
CreateDelegateConv(new_inputs, new_outputs, reinterpret_cast<OpParameter *>(new_conv_parameter), ctx, primitive));
}
return new (std::nothrow)
GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, conv_param->group_);
}

kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(op_parameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);
MS_ASSERT(desc.data_type == kNumberTypeFloat32);

  // if the weight is quantized, dequantize it to float32 data.
auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
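  // Temporarily swap in dequantized fp32 weights so Init can pack them; the
  // original quantized buffer is restored on every exit path below.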
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(op_parameter);
return nullptr;
}
weight_tensor->set_data(dequant_weight);
}

auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
kernel::LiteKernel *kernel = nullptr;
if (conv_param->group_ == 1) {
kernel = CreateDelegateConv(inputs, outputs, op_parameter, ctx, primitive);
} else {
kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx, primitive);
}

if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(op_parameter);
return nullptr;
}

auto ret = kernel->Init();
if (ret != RET_OK && ret != RET_INFER_INVALID) {
MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}

if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2D, CpuConvFp32KernelCreator)
} // namespace mindspore::kernel
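
Note: taken together, the helpers above let the delegate defer choosing a concrete kernel until tensor shapes are known. The committed ConvolutionDelegateCPUKernel::Init/ReSize bodies are not shown in this excerpt; the sketch below illustrates the wiring only, and base-class member names such as context_ and primitive_ are assumptions.

// Hedged sketch, not the committed code: build the concrete kernel on the
// first ReSize that sees valid shapes, then forward subsequent resizes.
int ConvolutionDelegateCPUKernel::ReSize() {
  // Refresh shape-dependent fields now that inference has run.
  SetInputOutputShapeInfo(reinterpret_cast<ConvParameter *>(op_parameter_),
                          in_tensors_.front(), out_tensors_.front(), context_);
  if (conv_kernel_ == nullptr) {
    // First valid resize: pick 1x1 / winograd / generic; the selector also
    // runs the new kernel's Init().
    conv_kernel_ = CpuConvFp32KernelSelect(in_tensors_, out_tensors_, op_parameter_,
                                           context_, primitive_, origin_weight_,
                                           origin_bias_);
    if (conv_kernel_ == nullptr) {
      MS_LOG(ERROR) << "Select conv kernel failed.";
      return RET_ERROR;
    }
  }
  return conv_kernel_->ReSize();
}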

+ 77
- 0
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h View File

@@ -0,0 +1,77 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_

#include <vector>
#include "src/ops/conv2d.h"
#include "src/lite_kernel.h"
#include "nnacl/conv_parameter.h"
#include "nnacl/op_base.h"

using mindspore::lite::InnerContext;
namespace mindspore::kernel {
class ConvolutionDelegateCPUKernel : public LiteKernel {
public:
ConvolutionDelegateCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive) {}
~ConvolutionDelegateCPUKernel() override {
FreeCopiedData();
if (conv_kernel_ != nullptr) {
op_parameter_ = nullptr; // op_parameter will be freed in conv_kernel
delete conv_kernel_;
conv_kernel_ = nullptr;
}
};
int Init() override;
int ReSize() override;
int Run() override { return conv_kernel_->Run(); }
int GetWeightAndBias();
int GetWeightData();
int GetBiasData();
static float *CopyData(lite::Tensor *tensor);
void FreeCopiedData();

protected:
bool need_free_weight_ = false;
bool need_free_bias_ = false;
kernel::LiteKernel *conv_kernel_ = nullptr;
float *origin_weight_ = nullptr;
float *origin_bias_ = nullptr;
};

void SetInputOutputShapeInfo(ConvParameter *conv_param, const lite::Tensor *input, const lite::Tensor *output,
const InnerContext *ctx);

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs);

ConvParameter *CreateNewConvParameter(ConvParameter *parameter);

lite::Tensor *CreateInputTensor(TypeId data_type, const std::vector<int> &in_shape, bool infered_flag);

lite::Tensor *CreateOutputTensor(const std::vector<int> &out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index);

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
float *origin_weight, float *origin_bias);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_DELEGATE_FP32_H_
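
A likely rationale for the copied-data members above: if shape inference has not completed at Init time, the delegate cannot build the concrete kernel yet, and the const weight/bias buffers may be released or repacked before ReSize runs, so GetWeightData/GetBiasData presumably fall back to private fp32 copies (with need_free_weight_/need_free_bias_ recording what FreeCopiedData must release). A minimal sketch of the copy step, consistent with the static declaration above but not necessarily the committed body:

float *ConvolutionDelegateCPUKernel::CopyData(lite::Tensor *tensor) {
  auto data = reinterpret_cast<float *>(malloc(tensor->Size()));
  if (data == nullptr) {
    MS_LOG(ERROR) << "malloc copied tensor data failed.";
    return nullptr;
  }
  memcpy(data, tensor->data_c(), tensor->Size());
  return data;
}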

+ 10
- 319
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.cc View File

@@ -15,16 +15,13 @@
*/

#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_1x1_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h"
#include "src/runtime/kernel/arm/fp32/group_convolution_fp32.h"
#include "nnacl/fp32/conv_fp32.h"
#include "nnacl/common_func.h"
#include "include/errorcode.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/dequant.h"
#include "nnacl/fp32/conv_fp32.h"
#include "nnacl/fp32/matmul_fp32.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
@@ -52,7 +49,6 @@ int ConvolutionCPUKernel::InitWeightBias() {
int oc_block_num = UP_ROUND(out_channel, oc_block);
int pack_weight_size = oc_block_num * in_channel * kernel_plane;

auto origin_weight = reinterpret_cast<float *>(filter_tensor->data_c());
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "malloc packed weight failed.";
@@ -60,11 +56,11 @@ int ConvolutionCPUKernel::InitWeightBias() {
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float));
#ifdef ENABLE_AVX
RowMajor2Col16Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
RowMajor2Col16Major(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane);
#elif ENABLE_ARM32
RowMajor2Col4Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
RowMajor2Col4Major(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane);
#else
RowMajor2Col8Major(origin_weight, packed_weight_, out_channel, in_channel * kernel_plane);
RowMajor2Col8Major(origin_weight_, packed_weight_, out_channel, in_channel * kernel_plane);
#endif

bias_data_ = reinterpret_cast<float *>(malloc(oc_block_num * sizeof(float)));
@@ -75,8 +71,7 @@ int ConvolutionCPUKernel::InitWeightBias() {
memset(bias_data_, 0, oc_block_num * sizeof(float));

if (in_tensors_.size() == kInputSize2) {
auto ori_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->data_c());
memcpy(bias_data_, ori_bias, out_channel * sizeof(float));
memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -114,10 +109,7 @@ int ConvolutionCPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionCPUKernel::ReSize() {
@@ -126,11 +118,10 @@ int ConvolutionCPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
MS_LOG(ERROR) << "conv base init failed.";
return ret;
}
return RET_OK;
}
@@ -168,304 +159,4 @@ int ConvolutionCPUKernel::Run() {
FreeTmpBuffer();
return ret;
}

ConvParameter *CreateNewConvParameter(ConvParameter *parameter) {
auto conv_parameter = new (std::nothrow) ConvParameter;
if (conv_parameter == nullptr) {
MS_LOG(ERROR) << "Malloc new conv parameter failed.";
return nullptr;
}
memcpy(conv_parameter, parameter, sizeof(ConvParameter));
return conv_parameter;
}

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs) {
for (auto sub_conv : group_convs) {
delete sub_conv;
}
for (auto in_tensor : new_inputs) {
delete in_tensor;
}
for (auto out_tensor : new_outputs) {
delete out_tensor;
}
}

lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag) {
auto in_tensor = new (std::nothrow) lite::Tensor(data_type, in_shape, Format_NHWC, lite::Tensor::Category::VAR);
if (in_tensor == nullptr) {
MS_LOG(ERROR) << "new in_tensor failed.";
return nullptr;
}
if (infered_flag) {
auto ret = in_tensor->MallocData();
if (ret != RET_OK) {
delete in_tensor;
MS_LOG(ERROR) << "in tensor malloc failed.";
return nullptr;
}
}
return in_tensor;
}

lite::Tensor *CreateFilterTensorFp32(TypeId data_type, std::vector<int> filter_shape,
const std::vector<lite::Tensor *> &inputs, int copy_length, int index) {
auto filter_tensor =
new (std::nothrow) lite::Tensor(data_type, filter_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (filter_tensor == nullptr) {
MS_LOG(ERROR) << "new filter_tensor failed.";
return nullptr;
}
auto ret = filter_tensor->MallocData();
if (ret != RET_OK) {
delete filter_tensor;
MS_LOG(ERROR) << "filter_tensor malloc failed.";
return nullptr;
}

MS_ASSERT(data_type == kNumberTypeFloat32);
auto *origin_weight = reinterpret_cast<float *>(inputs.at(kWeightIndex)->data_c());
memcpy(filter_tensor->data_c(), origin_weight + index * copy_length, copy_length * sizeof(float));
return filter_tensor;
}

lite::Tensor *CreateBiasTensorFp32(TypeId data_type, std::vector<int> bias_shape,
const std::vector<lite::Tensor *> &inputs, int new_out_channel, int index) {
auto *origin_bias = inputs.at(kBiasIndex)->data_c();
auto bias_tensor =
new (std::nothrow) lite::Tensor(data_type, bias_shape, Format_NHWC, lite::Tensor::Category::CONST_TENSOR);
if (bias_tensor == nullptr) {
MS_LOG(ERROR) << "new bias_tensor failed.";
return nullptr;
}
auto ret = bias_tensor->MallocData();
if (ret != RET_OK) {
delete bias_tensor;
MS_LOG(ERROR) << "bias_tensor malloc failed.";
return nullptr;
}
MS_ASSERT(data_type == kNumberTypeFloat32);
auto bias_data = reinterpret_cast<float *>(origin_bias);
memcpy(bias_tensor->data_c(), bias_data + index * new_out_channel, new_out_channel * sizeof(float));

return bias_tensor;
}

lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index) {
auto out_tensor = new (std::nothrow) lite::Tensor();
if (out_tensor == nullptr) {
MS_LOG(ERROR) << "new tmp_out_tensor failed.";
return nullptr;
}
out_tensor->set_data_type(outputs.at(index)->data_type());
out_tensor->set_format(outputs.at(index)->format());
if (infered_flag) {
out_tensor->set_shape(out_shape);
auto ret = out_tensor->MallocData();
if (ret != RET_OK) {
delete out_tensor;
MS_LOG(ERROR) << "out_tensor malloc data failed.";
return nullptr;
}
}
return out_tensor;
}

kernel::LiteKernel *CpuConvFp32KernelSelect(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
bool use_winograd, int out_unit) {
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
if (conv_param->kernel_h_ == 1 && conv_param->kernel_w_ == 1) {
return new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
} else if (use_winograd) {
return new (std::nothrow)
kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
} else {
return new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
}
return nullptr;
}

kernel::LiteKernel *CpuGroupConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive,
int group) {
int out_unit;
bool has_bias = inputs.size() == 3;
bool use_winograd = false;
bool infered_flag = primitive != nullptr && primitive->infer_flag();
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);

std::vector<int> in_shape;
std::vector<int> out_shape;
int new_in_channel = inputs.at(kWeightIndex)->Channel();
int new_out_channel = 0;
if (group == 0) {
MS_LOG(ERROR) << "Divisor 'group' cannot be 0.";
return nullptr;
} else {
new_out_channel = inputs.at(kWeightIndex)->Batch() / group;
}
int batch = inputs.front()->Batch();
conv_param->input_batch_ = batch;
conv_param->output_batch_ = batch;
if (infered_flag) {
int in_h = inputs.front()->Height();
int in_w = inputs.front()->Width();
conv_param->input_channel_ = new_in_channel;
conv_param->output_channel_ = new_out_channel;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
in_shape = {batch, in_h, in_w, new_in_channel};
out_shape = {batch, conv_param->output_h_, conv_param->output_w_, new_out_channel};
}
std::vector<int> filter_shape = {new_out_channel, conv_param->kernel_h_, conv_param->kernel_w_, new_in_channel};
std::vector<int> bias_shape = {new_out_channel};

// create sub kernels
std::vector<kernel::LiteKernel *> group_convs;
for (int i = 0; i < group; ++i) {
std::vector<lite::Tensor *> new_inputs;
std::vector<lite::Tensor *> new_outputs;
auto new_conv_parameter = CreateNewConvParameter(conv_param);
if (new_conv_parameter == nullptr) {
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "Get new conv parameter failed.";
return nullptr;
}

// create new input for each group
auto in_tensor = CreateInputTensor(inputs.front()->data_type(), in_shape, infered_flag);
if (in_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create input tensor failed.";
return nullptr;
}
new_inputs.emplace_back(in_tensor);

// create new weight
int copy_length = conv_param->kernel_h_ * conv_param->kernel_w_ * new_in_channel * new_out_channel;
auto filter_tensor =
CreateFilterTensorFp32(inputs.at(kWeightIndex)->data_type(), filter_shape, inputs, copy_length, i);
if (filter_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create filter tensor failed.";
return nullptr;
}
new_inputs.emplace_back(filter_tensor);

// if has bias, create new bias
if (has_bias) {
auto bias_tensor =
CreateBiasTensorFp32(inputs.at(kBiasIndex)->data_type(), bias_shape, inputs, new_out_channel, i);
if (bias_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "create bias_tensor failed.";
return nullptr;
}
new_inputs.emplace_back(bias_tensor);
}

// create new output tensor
for (size_t j = 0; j < outputs.size(); ++j) {
auto out_tensor = CreateOutputTensor(out_shape, outputs, infered_flag, j);
if (out_tensor == nullptr) {
delete new_conv_parameter;
FreeMemory(group_convs, new_inputs, new_outputs);
MS_LOG(ERROR) << "new out_tensor failed.";
return nullptr;
}
new_outputs.emplace_back(out_tensor);
}
group_convs.emplace_back(CpuConvFp32KernelSelect(new_inputs, new_outputs,
reinterpret_cast<OpParameter *>(new_conv_parameter), ctx,
primitive, use_winograd, out_unit));
}

return new (std::nothrow)
GroupConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive, group_convs, group);
}

kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *op_parameter,
const InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
MS_ASSERT(op_parameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D);
MS_ASSERT(desc.data_type == kNumberTypeFloat32);
auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
int group = conv_param->group_;
bool use_winograd = false;
int out_unit;
if (primitive != nullptr && primitive->infer_flag()) {
conv_param->input_h_ = inputs.front()->Height();
conv_param->input_w_ = inputs.front()->Width();
conv_param->input_channel_ = inputs.front()->Channel();
conv_param->output_h_ = outputs.front()->Height();
conv_param->output_w_ = outputs.front()->Width();
conv_param->output_channel_ = outputs.front()->Channel();
conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
CheckIfUseWinograd(&use_winograd, &out_unit, conv_param);
}

auto *weight_tensor = inputs.at(kWeightIndex);
auto *restore_data = weight_tensor->data_c();
auto restore_type = weight_tensor->data_type();
bool dequant_flag =
!weight_tensor->quant_params().empty() && weight_tensor->quant_params().front().inited && restore_data != nullptr;
if (dequant_flag) {
auto *dequant_weight = kernel::DequantUtil::DequantWeight(weight_tensor);
if (dequant_weight == nullptr) {
MS_LOG(ERROR) << "dequant data is nullptr.";
free(op_parameter);
return nullptr;
}
weight_tensor->set_data(dequant_weight);
}

kernel::LiteKernel *kernel;
if (group == 1) {
kernel = CpuConvFp32KernelSelect(inputs, outputs, op_parameter, ctx, primitive, use_winograd, out_unit);
} else {
kernel = CpuGroupConvFp32KernelCreator(inputs, outputs, op_parameter, ctx, primitive, group);
}

if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel is nullptr.";
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
free(op_parameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK && ret != RET_INFER_INVALID) {
MS_LOG(ERROR) << "Init kernel failed, name: " << op_parameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(op_parameter->type_));
if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}
delete kernel;
return nullptr;
}

if (dequant_flag) {
weight_tensor->FreeData();
weight_tensor->set_data(restore_data);
weight_tensor->set_data_type(restore_type);
}

return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2D, CpuConvFp32KernelCreator)
} // namespace mindspore::kernel

+ 6
- 12
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_fp32.h View File

@@ -28,8 +28,10 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
const mindspore::lite::PrimitiveC *primitive, float *origin_weight, float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionCPUKernel() override {
if (packed_weight_ != nullptr) {
free(packed_weight_);
@@ -57,20 +59,12 @@ class ConvolutionCPUKernel : public ConvolutionBaseCPUKernel {
}

protected:
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *packed_weight_ = nullptr;
float *packed_input_ = nullptr;
float *col_major_input_ = nullptr;
};

void FreeMemory(const std::vector<kernel::LiteKernel *> &group_convs, const std::vector<lite::Tensor *> &new_inputs,
const std::vector<lite::Tensor *> &new_outputs);

ConvParameter *CreateNewConvParameter(ConvParameter *parameter);

lite::Tensor *CreateInputTensor(TypeId data_type, std::vector<int> in_shape, bool infered_flag);

lite::Tensor *CreateOutputTensor(std::vector<int> out_shape, const std::vector<lite::Tensor *> &outputs,
bool infered_flag, int index);
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_H_

+ 5
- 17
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.cc View File

@@ -81,8 +81,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
MS_LOG(ERROR) << "get matrix g from CookToomFilter failed.";
return ret;
}
auto weight_data = reinterpret_cast<float *>(filter_tensor->MutableData());
ret = WinogradFilterTransform(weight_data, matrix_g, matrix_gt, oc_block);
ret = WinogradFilterTransform(origin_weight_, matrix_g, matrix_gt, oc_block);
if (ret != RET_OK) {
MS_LOG(ERROR) << "winograd filter transfrom failed.";
return ret;
@@ -97,8 +96,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
}
memset(bias_data_, 0, new_bias_size);
if (in_tensors_.size() == kInputSize2) {
auto ori_bias_addr = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
memcpy(bias_data_, ori_bias_addr, out_channel * sizeof(float));
memcpy(bias_data_, origin_bias_, out_channel * sizeof(float));
} else {
MS_ASSERT(in_tensors_.size() == kInputSize1);
}
@@ -171,10 +169,7 @@ int ConvolutionWinogradCPUKernel::Init() {
MS_LOG(ERROR) << "Init weight bias failed.";
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
return RET_OK;
}

int ConvolutionWinogradCPUKernel::ReSize() {
@@ -183,18 +178,11 @@ int ConvolutionWinogradCPUKernel::ReSize() {
MS_LOG(ERROR) << "Resize is invalid.";
return ret;
}

ret = ConvolutionBaseCPUKernel::Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConvolutionBase init failed.";
return RET_ERROR;
MS_LOG(ERROR) << "conv base init failed.";
return ret;
}

kernel_unit_ = conv_param_->kernel_h_;
input_unit_ = output_unit_ + kernel_unit_ - 1;
conv_param_->input_unit_ = input_unit_;
conv_param_->output_unit_ = output_unit_;

ret = ConfigInputOutput();
if (ret != RET_OK) {
MS_LOG(ERROR) << "ConfigInputOutput failed.";


+ 6
- 2
mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd_fp32.h View File

@@ -28,10 +28,12 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
public:
ConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
const mindspore::lite::PrimitiveC *primitive, int output_unit)
const mindspore::lite::PrimitiveC *primitive, int output_unit, float *origin_weight,
float *origin_bias)
: ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive),
output_unit_(output_unit),
trans_weight_(nullptr) {}
origin_weight_(origin_weight),
origin_bias_(origin_bias) {}
~ConvolutionWinogradCPUKernel() override {
if (trans_weight_ != nullptr) {
free(trans_weight_);
@@ -69,6 +71,8 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
int kernel_unit_;
int input_unit_;
int output_unit_;
float *origin_weight_; // do not free
float *origin_bias_; // do not free
float *tmp_data_ = nullptr;
float *trans_input_ = nullptr;
float *gemm_out_ = nullptr;


+ 8
- 12
mindspore/lite/src/runtime/kernel/arm/fp32/group_convolution_fp32.cc View File

@@ -92,11 +92,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
std::vector<int> out_shape;
for (int i = 0; i < group_num_; ++i) {
// in
int in_batch = conv_param_->input_batch_;
int in_h = conv_param_->input_h_;
int in_w = conv_param_->input_w_;
int in_c = conv_param_->input_channel_;
in_shape = {in_batch, in_h, in_w, in_c};
auto in_tensor = in_tensors_.front();
in_shape = {in_tensor->Batch(), in_tensor->Height(), in_tensor->Width(), conv_param_->input_channel_};
auto sub_kernel_in_tensor = group_convs_.at(i)->in_tensors().front();
sub_kernel_in_tensor->set_shape(in_shape);
ret = sub_kernel_in_tensor->MallocData();
@@ -106,11 +103,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
return ret;
}
// out
int out_batch = conv_param_->output_batch_;
int out_h = conv_param_->output_h_;
int out_w = conv_param_->output_w_;
int out_c = conv_param_->output_channel_;
out_shape = {out_batch, out_h, out_w, out_c};
auto out_tensor = out_tensors_.front();
out_shape = {out_tensor->Batch(), out_tensor->Height(), out_tensor->Width(), conv_param_->output_channel_};
auto sub_kernel_out_tensors = group_convs_.at(i)->out_tensors();
for (auto tensor : sub_kernel_out_tensors) {
tensor->set_shape(out_shape);
@@ -143,7 +137,8 @@ int GroupConvolutionCPUKernel::PreProcess() {
}

void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
int in_plane = conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_batch_;
auto in_tensor = in_tensors_.front();
int in_plane = in_tensor->Height() * in_tensor->Width() * in_tensor->Batch();
int sub_in_channel = conv_param_->input_channel_;
int ori_in_channel = sub_in_channel * group_num_;
auto sub_in_data = reinterpret_cast<float *>(group_convs_.at(group_id)->in_tensors().front()->data_c());
@@ -157,7 +152,8 @@ void GroupConvolutionCPUKernel::SeparateInput(int group_id) {
}

void GroupConvolutionCPUKernel::PostConcat(int group_id) {
int out_plane = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->output_batch_;
auto out_tensor = out_tensors_.front();
int out_plane = out_tensor->Height() * out_tensor->Width() * out_tensor->Batch();
int sub_out_channel = conv_param_->output_channel_;
int ori_out_channel = sub_out_channel * group_num_;
auto sub_out_data = reinterpret_cast<float *>(group_convs_.at(group_id)->out_tensors().front()->data_c());
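
SeparateInput and PostConcat are mirror-image strided copies over NHWC data: for each pixel, group group_id owns the contiguous sub-channel slice starting at offset group_id * sub_channel within the full channel stride. A standalone illustration with hypothetical sizes (split direction shown; the concat direction swaps src and dst):

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  // Hypothetical sizes: 4 pixels (batch * H * W), 2 groups, 3 channels each.
  const int in_plane = 4, group_num = 2, sub_channel = 3;
  const int ori_channel = sub_channel * group_num;
  std::vector<float> ori(in_plane * ori_channel);
  for (size_t i = 0; i < ori.size(); ++i) ori[i] = static_cast<float>(i);

  for (int g = 0; g < group_num; ++g) {
    std::vector<float> sub(in_plane * sub_channel);
    const float *src = ori.data() + g * sub_channel;  // per-pixel slice offset
    float *dst = sub.data();
    for (int p = 0; p < in_plane; ++p) {
      std::memcpy(dst, src, sub_channel * sizeof(float));
      src += ori_channel;  // advance by the full channel stride
      dst += sub_channel;
    }
    std::printf("group %d starts with %g\n", g, sub[0]);
  }
  return 0;
}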


+ 1
- 2
mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc View File

@@ -19,8 +19,7 @@
#include "nnacl/int8/conv_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/kernel/arm/base/layout_transform.h"
#include "src/runtime/kernel/arm/fp32/convolution_fp32.h"
#include "src/runtime/kernel/arm/fp32/convolution_delegate_fp32.h"
#include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h"
#include "src/runtime/kernel/arm/int8/group_convolution_int8.h"


+ 0
- 142
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc View File

@@ -139,146 +139,4 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
EXPECT_EQ(0, CompareOutputData(out, correct, 54));
delete conv_param;
}

int Conv1x1TestInit1(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
auto *in_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 4}, schema::Format_NHWC, lite::Tensor::VAR);
in_t->MallocData();
float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715,
13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352,
6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505};
memcpy(in_t->MutableData(), in, sizeof(float) * 24);
inputs_->push_back(in_t);

auto *weight_t = new lite::Tensor(kNumberTypeFloat, {3, 1, 1, 4}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
weight_t->MallocData();
float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695,
1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; /* nhwc */
memcpy(weight_t->MutableData(), weight, sizeof(float) * 12);
inputs_->push_back(weight_t);

auto *bias_t = new lite::Tensor(kNumberTypeFloat, {3}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
bias_t->MallocData();
float bias[] = {2, 2, 2};
memcpy(bias_t->MutableData(), bias, sizeof(float) * 3);
inputs_->push_back(bias_t);

auto *out_t = new lite::Tensor(kNumberTypeFloat, {1, 2, 3, 3}, schema::Format_NHWC, lite::Tensor::VAR);
out_t->MallocData();
outputs_->push_back(out_t);

*correct = reinterpret_cast<float *>(malloc(out_t->ElementsNum() * sizeof(float)));
float co[] = {2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1.3731456, 1.6877825, 12.427691, 2., 2., 2.};
memcpy(*correct, co, out_t->ElementsNum() * sizeof(float));

conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 2;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_u_ = conv_param->pad_l_ = 1;
conv_param->act_type_ = ActType_No;
return out_t->ElementsNum();
}

TEST_F(TestConv1x1Fp32, Conv1x1Test1) {
std::vector<lite::Tensor *> inputs_;
std::vector<lite::Tensor *> outputs_;
auto conv_param = new ConvParameter();
auto *ctx = new lite::InnerContext();
ctx->thread_num_ = 1;
ASSERT_EQ(lite::RET_OK, ctx->Init());
float *correct;
int total_size = Conv1x1TestInit1(&inputs_, &outputs_, conv_param, &correct);
auto *conv1x1 =
new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

conv1x1->Init();
conv1x1->Run();

ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));
delete conv_param;
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}

int Conv1x1TestInit2(std::vector<lite::Tensor *> *inputs_, std::vector<lite::Tensor *> *outputs_,
ConvParameter *conv_param, float **correct) {
size_t buffer_size;
auto *in_t = new lite::Tensor(kNumberTypeFloat, {1, 300, 300, 24}, schema::Format_NHWC, lite::Tensor::VAR);
in_t->MallocData();
std::string input_path = "./conv/conv1x1fp32_input1_nhwc.bin";
auto in = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &buffer_size));
memcpy(in_t->MutableData(), in, buffer_size);
inputs_->push_back(in_t);

auto *weight_t = new lite::Tensor(kNumberTypeFloat, {40, 1, 1, 24}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
weight_t->MallocData();
std::string weight_path = "./conv/conv1x1fp32_weight1_nhwc.bin";
auto weight = reinterpret_cast<float *>(mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size));
memcpy(weight_t->MutableData(), weight, buffer_size);
inputs_->push_back(weight_t);

auto *bias_t = new lite::Tensor(kNumberTypeFloat, {40}, schema::Format_NHWC, lite::Tensor::CONST_TENSOR);
bias_t->MallocData();
std::string bias_path = "./conv/conv1x1fp32_bias1_nhwc.bin";
auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size);
memcpy(bias_t->MutableData(), bias, buffer_size);
inputs_->push_back(bias_t);

auto *out_t = new lite::Tensor(kNumberTypeFloat, {1, 300, 300, 40}, schema::Format_NHWC, lite::Tensor::VAR);
out_t->MallocData();
outputs_->push_back(out_t);

std::string out_path = "./conv/conv1x1fp32_output1_nhwc.bin";
auto out_nhwc = mindspore::lite::ReadFile(out_path.c_str(), &buffer_size);
*correct = reinterpret_cast<float *>(malloc(buffer_size));
memcpy(*correct, out_nhwc, buffer_size);

conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
conv_param->stride_h_ = conv_param->stride_w_ = 1;
conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
conv_param->pad_u_ = conv_param->pad_l_ = 0;
conv_param->act_type_ = ActType_No;
return out_t->ElementsNum();
}

TEST_F(TestConv1x1Fp32, Conv1x1Test2) {
std::vector<lite::Tensor *> inputs_;
std::vector<lite::Tensor *> outputs_;
auto conv_param = new ConvParameter();
auto *ctx = new lite::InnerContext();
ctx->thread_num_ = 2;
ASSERT_EQ(lite::RET_OK, ctx->Init());
float *correct;
int total_size = Conv1x1TestInit2(&inputs_, &outputs_, conv_param, &correct);
auto *conv1x1 =
new kernel::Convolution1x1CPUKernel(reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr);

conv1x1->Init();
conv1x1->Run();
ASSERT_EQ(0, CompareOutputData(reinterpret_cast<float *>(outputs_[0]->MutableData()), correct, total_size, 0.0001));

/* running warm up */
for (int i = 0; i < 0; i++) {
conv1x1->Run();
}

/* running time cost */
int loop_count = 1;
auto time_start = mindspore::lite::GetTimeUs();
for (int i = 0; i < loop_count; i++) {
conv1x1->Run();
}
auto time_end = mindspore::lite::GetTimeUs();
auto cost = time_end - time_start;
uint64_t time_avg = cost / loop_count;
printf("1x1 average time : %f ms\n", time_avg / 1000.0f);

delete conv_param;
delete conv1x1;
for (auto t : inputs_) delete t;
for (auto t : outputs_) delete t;
free(correct);
}
} // namespace mindspore
