From 88a44a0ef0faacb1e57a7cf1d97c99e2abcc8df6 Mon Sep 17 00:00:00 2001
From: yangruoqi713 <yangruoqi@huawei.com>
Date: Mon, 3 Aug 2020 16:33:57 +0800
Subject: [PATCH] optimize arm cpu op: conv_depthwise, deconv_depthwise

---
 .../arm/fp16/convolution_depthwise_fp16.cc    |   1 -
 .../kernel/arm/fp32/convolution_depthwise.cc  | 106 ++++++++++-------
 .../kernel/arm/fp32/convolution_depthwise.h   |   7 +-
 .../arm/fp32/deconvolution_depthwise.cc       | 109 +++++++++++-------
 .../kernel/arm/fp32/deconvolution_depthwise.h |  14 ++-
 .../arm/int8/convolution_depthwise_int8.cc    |  72 ++++++++----
 .../arm/int8/convolution_depthwise_int8.h     |   1 +
 .../arm/int8/deconvolution_depthwise_int8.cc  |  92 ++++++++++-----
 .../arm/int8/deconvolution_depthwise_int8.h   |   1 +
 .../arm/opclib/fp16/conv_depthwise_fp16.cc    |   7 +-
 .../kernel/arm/opclib/fp32/conv_depthwise.cc  |  10 +-
 .../arm/opclib/int8/conv_depthwise_int8.cc    |   4 +-
 12 files changed, 276 insertions(+), 148 deletions(-)
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
index 6b00c60b59..dc8a70e3d8 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc
@@ -46,7 +46,6 @@ int ConvolutionDepthwiseFp16CPUKernel::InitBuffer() {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
   }
-  memset(packed_output_, 0, pack_output_size * sizeof(float16_t));
   return RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
index 75856bfba3..077cbd812d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc
@@ -27,27 +27,7 @@ using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 
 namespace mindspore::kernel {
-int ConvolutionDepthwiseCPUKernel::Init() {
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // init sliding window param
-  sliding_ = new SlidingWindowParam;
-  InitSlidingParam(sliding_, conv_param_, C4NUM);
-
-  // pack input function: convert_func_
-  auto input_tensor = inputs_[kInputIndex];
-  auto data_type = input_tensor->data_type();
-  auto input_format = input_tensor->GetFormat();
-  schema::Format execute_format = schema::Format_NHWC4;
-  if (input_format != execute_format) {
-    convert_func_ = LayoutTransform(data_type, input_format, execute_format);
-    if (convert_func_ == nullptr) {
-      MS_LOG(ERROR) << "layout convert func is nullptr.";
-      return RET_ERROR;
-    }
-  }
-
+int ConvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = inputs_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
@@ -55,42 +35,93 @@ int ConvolutionDepthwiseCPUKernel::Init() {
   int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
 
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
                        conv_param_->output_channel_);
 
   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (inputs_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
     memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
-  } else {
-    MS_ASSERT(inputs_.size() == kInputSize1);
   }
 
   // init threadNum;
   conv_param_->thread_num_ = MSMIN(thread_count_, OC4);
-  ReSize();
   return RET_OK;
 }
 
-int ConvolutionDepthwiseCPUKernel::ReSize() {
-  // malloc pack input buffer
-  if (convert_func_ != nullptr) {
+int ConvolutionDepthwiseCPUKernel::InitBuffer() {
+  // malloc pack input and output buffer
+  if (conv_param_->input_channel_ % C4NUM != 0) {
+    need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
     int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
     packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float)));
+    if (packed_input_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
     memset(packed_input_, 0, pack_input_size * sizeof(float));
-  }
 
-  // malloc tmp output buffer
-  if (conv_param_->output_channel_ % C4NUM != 0) {
-    need_align_ = true;
     int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
     packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float)));
-    memset(packed_output_, 0, pack_output_size * sizeof(float));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseCPUKernel::Init() {
+  // base-class init; NOTE(review): its return value is not checked here
+  ConvolutionBaseCPUKernel::Init();
+
+  // allocate the sliding window param (deleted in the destructor) and initialize it from conv_param_
+  sliding_ = new SlidingWindowParam;
+  InitSlidingParam(sliding_, conv_param_, C4NUM);
+
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp32 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+  // allocate packed input/output buffers (only when InitBuffer finds the channel count is not C4NUM-aligned)
+  ret = InitBuffer();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ConvolutionDepthwiseCPUKernel::ReSize() {
+  if (need_align_) {
+    free(packed_input_);
+    free(packed_output_);
+    need_align_ = false;
+  }
+  // conv base init
+  ConvolutionBaseCPUKernel::Init();
+
+  // re-fill the sliding window param that Init() allocated; a second `new` here would leak it
+  InitSlidingParam(sliding_, conv_param_, C4NUM);
+
+  auto ret = InitBuffer();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Convolution depthwise fp32 InitBuffer failed.";
+    return RET_ERROR;
   }
   return RET_OK;
 }
@@ -120,15 +151,14 @@ int ConvolutionDepthwiseCPUKernel::Run() {
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
 
   // pack input: to nhwc4
-  if (convert_func_ != nullptr) {
-    convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_,
-                  conv_param_->input_channel_);
+  if (need_align_) {
+    PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   } else {
     packed_input_ = input_addr;
   }
 
-  output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
-  memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float));
+  auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
   if (!need_align_) {
     packed_output_ = output_addr;
   }
@@ -146,7 +176,6 @@ int ConvolutionDepthwiseCPUKernel::Run() {
   return RET_OK;
 }
 
-
 kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                const std::vector<lite::tensor::Tensor *> &outputs,
                                                OpParameter *opParameter, const Context *ctx,
@@ -170,4 +199,3 @@ kernel::LiteKernel *CpuConvDwFp32KernelCreator(const std::vector<lite::tensor::T
 
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, CpuConvDwFp32KernelCreator)
 }  // namespace mindspore::kernel
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
index 0e326529f9..dedcdf5016 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.h
@@ -31,10 +31,8 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   ~ConvolutionDepthwiseCPUKernel() override {
     delete sliding_;
     free(packed_weight_);
-    if (convert_func_ != nullptr) {
-      free(packed_input_);
-    }
     if (need_align_) {
+      free(packed_input_);
       free(packed_output_);
     }
   };
@@ -43,6 +41,8 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   int ReSize() override;
   int Run() override;
 
+  int InitBuffer();
+  int InitWeightBias();
   int Execute(int task_id);
 
  private:
@@ -50,7 +50,6 @@ class ConvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   float *packed_weight_;
   float *packed_input_;
   float *packed_output_;
-  float *output_addr;
   bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
index 7f09307a43..07e5747847 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc
@@ -43,24 +43,7 @@ int DeconvolutionDepthwiseCPUKernel::InitSlideParam() {
   return RET_OK;
 }
 
-int DeconvolutionDepthwiseCPUKernel::Init() {
-  InitSlideParam();
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // pack input function: convert_func_
-  auto input_tensor = inputs_[kInputIndex];
-  auto data_type = input_tensor->data_type();
-  auto input_format = input_tensor->GetFormat();
-  schema::Format execute_format = schema::Format_NHWC4;
-  if (input_format != execute_format) {
-    convert_func_ = LayoutTransform(data_type, input_format, execute_format);
-    if (convert_func_ == nullptr) {
-      MS_LOG(ERROR) << "layout convert func is nullptr.";
-      return RET_ERROR;
-    }
-  }
-
+int DeconvolutionDepthwiseCPUKernel::InitWeightBias() {
   // init weight: o, h, w, i; o == group, i == 1
   auto weight_tensor = inputs_[kWeightIndex];
   auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());
@@ -68,55 +51,102 @@ int DeconvolutionDepthwiseCPUKernel::Init() {
   int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
 
   packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(packed_weight_, 0, pack_weight_size * sizeof(float));
   PackNCHWToNC4HW4Fp32(origin_weight, packed_weight_, 1, conv_param_->kernel_h_ * conv_param_->kernel_w_,
                        conv_param_->output_channel_);
 
   // init bias
   bias_data_ = reinterpret_cast<float *>(malloc(C4NUM * OC4 * sizeof(float)));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(float));
   if (inputs_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<float *>(inputs_.at(kBiasIndex)->Data());
     memcpy(bias_data_, ori_bias, conv_param_->output_channel_ * sizeof(float));
-  } else {
-    MS_ASSERT(inputs_.size() == kInputSize1);
   }
 
   // init threadNum;
   conv_param_->thread_num_ = MSMIN(conv_param_->thread_num_, OC4);
-  ReSize();
   return RET_OK;
 }
 
-int DeconvolutionDepthwiseCPUKernel::ReSize() {
-  // malloc pack input buffer
-  if (convert_func_ != nullptr) {
+int DeconvolutionDepthwiseCPUKernel::InitBuffer() {
+  // malloc pack input and output buffer
+  if (conv_param_->input_channel_ % C4NUM != 0) {
+    need_align_ = true;
     int IC4 = UP_DIV(conv_param_->input_channel_, C4NUM);
     int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM * IC4;
     packed_input_ = reinterpret_cast<float *>(malloc(pack_input_size * sizeof(float)));
+    if (packed_input_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
     memset(packed_input_, 0, pack_input_size * sizeof(float));
-  }
 
-  // malloc tmp output buffer
-  if (conv_param_->output_channel_ % C4NUM != 0) {
-    need_pack_ = true;
     int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * OC4;
     packed_output_ = reinterpret_cast<float *>(malloc(pack_output_size * sizeof(float)));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
     memset(packed_output_, 0, pack_output_size * sizeof(float));
   }
   return RET_OK;
 }
 
-int DeconvolutionDepthwiseCPUKernel::DoExcute(int task_id) {
+int DeconvolutionDepthwiseCPUKernel::Init() {
+  InitSlideParam();
+  // base-class init; NOTE(review): its return value is not checked here
+  ConvolutionBaseCPUKernel::Init();
+  // pack the weight tensor and copy the optional bias into C4NUM-padded buffers
+  auto ret = InitWeightBias();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitWeightBias failed.";
+    return RET_ERROR;
+  }
+  // allocate packed input/output buffers (only when InitBuffer finds the channel count is not C4NUM-aligned)
+  ret = InitBuffer();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseCPUKernel::ReSize() {
+  // drop the packed buffers of the previous shape; InitBuffer() sets need_align_ again only if still required
+  if (need_align_) {
+    free(packed_input_);
+    free(packed_output_);
+    need_align_ = false;
+  }
+  InitSlideParam();
+  ConvolutionBaseCPUKernel::Init();
+
+  auto ret = InitBuffer();
+  if (ret != 0) {
+    MS_LOG(ERROR) << "Deconvolution depthwise fp32 InitBuffer failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseCPUKernel::Execute(int task_id) {
   DeconvDwC4Fp32(packed_output_, packed_input_, packed_weight_, reinterpret_cast<float *>(bias_data_), conv_param_,
                  sliding_, task_id);
   return RET_OK;
 }
 
 int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
-  auto conv_dw = reinterpret_cast<DeconvolutionDepthwiseCPUKernel *>(cdata);
-  auto ret = conv_dw->DoExcute(task_id);
+  auto deconv_dw = reinterpret_cast<DeconvolutionDepthwiseCPUKernel *>(cdata);
+  auto ret = deconv_dw->Execute(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DeconvolutionDepthwiseRun error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
@@ -133,26 +163,26 @@ int DeconvolutionDepthwiseCPUKernel::Run() {
   auto input_addr = reinterpret_cast<float *>(input_tensor->Data());
 
   // pack input: to nhwc4
-  if (convert_func_ != nullptr) {
-    convert_func_(input_addr, packed_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_,
-                  conv_param_->input_channel_);
+  if (need_align_) {
+    PackNHWCToNHWC4Fp32(input_addr, packed_input_, conv_param_->input_batch_,
+                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_);
   } else {
     packed_input_ = input_addr;
   }
 
-  output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
-  memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float));
-  if (!need_pack_) {
+  auto output_addr = reinterpret_cast<float *>(outputs_.at(kOutputIndex)->Data());
+  if (!need_align_) {
+    memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(float));
     packed_output_ = output_addr;
   }
 
   auto ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_);
   if (ret != RET_OK) {
-    MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]";
+    MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]";
     return RET_ERROR;
   }
 
-  if (need_pack_) {
+  if (need_align_) {
     PackNHWC4ToNHWCFp32(packed_output_, output_addr, conv_param_->output_batch_,
                         conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_);
   }
@@ -182,4 +212,3 @@ kernel::LiteKernel *CpuDeconvDwFp32KernelCreator(const std::vector<lite::tensor:
 
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DeDepthwiseConv2D, CpuDeconvDwFp32KernelCreator)
 }  // namespace mindspore::kernel
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
index f993d818c4..5db24df3b6 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.h
@@ -31,8 +31,10 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   ~DeconvolutionDepthwiseCPUKernel() override {
     delete sliding_;
     free(packed_weight_);
-    free(packed_input_);
-    free(packed_output_);
+    if (need_align_) {
+      free(packed_input_);
+      free(packed_output_);
+    }
   };
 
   int Init() override;
@@ -40,17 +42,17 @@ class DeconvolutionDepthwiseCPUKernel : public ConvolutionBaseCPUKernel {
   int ReSize() override;
   int Run() override;
 
-  int DoExcute(int task_id);
+  int InitBuffer();
+  int InitWeightBias();
+  int Execute(int task_id);
 
  private:
   SlidingWindowParam *sliding_;
   float *packed_weight_;
   float *packed_input_;
   float *packed_output_;
-  float *output_addr;
-  bool need_pack_ = false;
+  bool need_align_ = false;
 };
 }  // namespace mindspore::kernel
 
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_DEPTHWISE_H_
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
index d68a7ea3ac..0a68b1301c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
@@ -35,11 +35,19 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
   packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
   PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
 
   // init bias, add output zp
   bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
   if (inputs_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data());
@@ -48,6 +56,30 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   return RET_OK;
 }
 
+int ConvolutionDepthwiseInt8CPUKernel::InitBuffer() {
+  // packed int16 input (channels padded to a multiple of 4) is always allocated; check it before zeroing it
+  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
+                        UP_DIV(conv_param_->input_channel_, 4);
+  packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
+  if (packed_input_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
+  memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
+
+  if (conv_param_->input_channel_ % C4NUM != 0) {
+    need_align_ = true;
+    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
+                           UP_DIV(conv_param_->output_channel_, C4NUM);
+    packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t)));
+    if (packed_output_ == nullptr) {
+      MS_LOG(ERROR) << "Malloc buffer failed.";
+      return RET_ERROR;
+    }
+  }
+  return RET_OK;
+}
+
 int ConvolutionDepthwiseInt8CPUKernel::Init() {
   // conv base init
   ConvolutionBaseCPUKernel::Init();
@@ -66,7 +98,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
     return ret;
   }
 
-  ret = ReSize();
+  ret = InitBuffer();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
     return ret;
@@ -75,26 +107,23 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
 }
 
 int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
-  // malloc packed input buffer
-  int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
-                        UP_DIV(conv_param_->input_channel_, 4);
-  packed_input_ = reinterpret_cast<int16_t *>(malloc(pack_input_size * sizeof(int16_t)));
-  memset(packed_input_, 0, pack_input_size * sizeof(int16_t));
-  if (packed_input_ == nullptr) {
-    MS_LOG(ERROR) << "Malloc buffer failed.";
-    return RET_ERROR;
+  free(packed_input_);
+  if (need_align_) {
+    free(packed_output_);
   }
+  // conv base init
+  ConvolutionBaseCPUKernel::Init();
 
-  if (conv_param_->input_channel_ % C4NUM != 0) {
-    need_align_ = true;
-    int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
-                           (conv_param_->output_channel_, C4NUM);
-    packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t)));
-    if (packed_input_ == nullptr) {
-      MS_LOG(ERROR) << "Malloc buffer failed.";
-      return RET_ERROR;
-    }
-    memset(packed_output_, 0, pack_output_size * sizeof(int8_t));
+  // init sliding window param
+  InitSlidingParam(sliding, conv_param_, C4NUM);
+
+  // init quant param
+  ConvolutionBaseCPUKernel::SetQuantParam();
+
+  auto ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Depthwise int8 ReSize error!";
+    return ret;
   }
   return RET_OK;
 }
@@ -106,8 +135,8 @@ int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
 }
 
 int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
-  auto conv_dw = reinterpret_cast<ConvolutionDepthwiseInt8CPUKernel *>(cdata);
-  auto ret = conv_dw->Execute(task_id);
+  auto conv_dw_int8 = reinterpret_cast<ConvolutionDepthwiseInt8CPUKernel *>(cdata);
+  auto ret = conv_dw_int8->Execute(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "ConvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
@@ -127,7 +156,6 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
   PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
 
   auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data());
-  memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
   if (!need_align_) {
     packed_output_ = output_addr;
   }
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
index 2e9ad6fd39..5e5f687006 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
@@ -42,6 +42,7 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
   int Run() override;
 
   int InitWeightBias();
+  int InitBuffer();
   int Execute(int task_id);
 
  private:
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
index 52a0b1ffde..b2e59c7255 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc
@@ -35,11 +35,19 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   int OC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
   int pack_weight_size = C4NUM * OC4 * conv_param_->kernel_h_ * conv_param_->kernel_w_;
   packed_weight_ = reinterpret_cast<int16_t *>(malloc(pack_weight_size * sizeof(int16_t)));
+  if (packed_weight_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(packed_weight_, 0, pack_weight_size * sizeof(int16_t));
   PackDepthwiseInt8Weight(origin_weight, packed_weight_, conv_param_);
 
   // init bias, add output zp
   bias_data_ = reinterpret_cast<int32_t *>(malloc(C4NUM * OC4 * sizeof(int32_t)));
+  if (bias_data_ == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   memset(bias_data_, 0, C4NUM * OC4 * sizeof(int32_t));
   if (inputs_.size() == kInputSize2) {
     auto ori_bias = reinterpret_cast<int32_t *>(inputs_.at(kBiasIndex)->Data());
@@ -59,7 +67,6 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
   conv_param_->output_channel_ = inputs_.front()->shape().at(kNHWC_C);
 
   // init sliding window param
-  sliding = new SlidingWindowParam;
   InitSlidingParam(sliding, conv_param_, C4NUM);
 
   sliding->in_h_step_ = conv_param_->input_w_ * C4NUM;
@@ -70,31 +77,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::InitSlideParam() {
   return RET_OK;
 }
 
-int DeconvolutionDepthwiseInt8CPUKernel::Init() {
-  InitSlideParam();
-
-  // conv base init
-  ConvolutionBaseCPUKernel::Init();
-
-  // init quant param
-  ConvolutionBaseCPUKernel::SetQuantParam();
-
-  // init weight and bias
-  auto ret = InitWeightBias();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
-    return ret;
-  }
-
-  ret = ReSize();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Deconv Depthwise int8 ReSize error!";
-    return ret;
-  }
-  return RET_OK;
-}
-
-int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
+int DeconvolutionDepthwiseInt8CPUKernel::InitBuffer() {
   // malloc packed input buffer
   int pack_input_size = conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * C4NUM *
                         UP_DIV(conv_param_->input_channel_, 4);
@@ -108,9 +91,9 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
   if (conv_param_->input_channel_ % C4NUM != 0) {
     need_align_ = true;
     int pack_output_size = conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * C4NUM *
-                           (conv_param_->output_channel_, C4NUM);
+                           UP_DIV(conv_param_->output_channel_, C4NUM);
     packed_output_ = reinterpret_cast<int8_t *>(malloc(pack_output_size * sizeof(int8_t)));
-    if (packed_input_ == nullptr) {
+    if (packed_output_ == nullptr) {
       MS_LOG(ERROR) << "Malloc buffer failed.";
       return RET_ERROR;
     }
@@ -120,6 +103,10 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
   // malloc tmp buffer for int32 output
   output_buffer =
     reinterpret_cast<int32_t *>(malloc(conv_param_->output_h_ * conv_param_->output_w_ * C4NUM * sizeof(int32_t)));
+  if (output_buffer == nullptr) {
+    MS_LOG(ERROR) << "Malloc buffer failed.";
+    return RET_ERROR;
+  }
   if (packed_input_ == nullptr) {
     MS_LOG(ERROR) << "Malloc buffer failed.";
     return RET_ERROR;
@@ -127,6 +114,49 @@ int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
   return RET_OK;
 }
 
+int DeconvolutionDepthwiseInt8CPUKernel::Init() {
+  sliding = new SlidingWindowParam;  // allocated once here; InitSlideParam(), re-run by ReSize(), only fills it
+  InitSlideParam();
+
+  // base-class init; NOTE(review): its return value is not checked here
+  ConvolutionBaseCPUKernel::Init();
+
+  // init quant param
+  ConvolutionBaseCPUKernel::SetQuantParam();
+
+  // pack the int8 weight into int16 and set up the int32 bias buffer
+  auto ret = InitWeightBias();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconv Depthwise int8 InitWeightBias error!";
+    return ret;
+  }
+  // allocate the packed input buffer, the optional packed output buffer and the int32 tmp output buffer
+  ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
+    return ret;
+  }
+  return RET_OK;
+}
+
+int DeconvolutionDepthwiseInt8CPUKernel::ReSize() {
+  // release every buffer that InitBuffer() re-allocates below, including the int32 tmp output buffer
+  free(packed_input_);
+  free(output_buffer);
+  if (need_align_) {
+    free(packed_output_);
+    need_align_ = false;
+  }
+  InitSlideParam();
+  ConvolutionBaseCPUKernel::Init();
+  auto ret = InitBuffer();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Deconv Depthwise int8 InitBuffer error!";
+    return ret;
+  }
+  return RET_OK;
+}
+
 int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
   DeconvDwInt8(packed_output_, output_buffer, packed_input_, packed_weight_, reinterpret_cast<int32_t *>(bias_data_),
                conv_param_, sliding, task_id);
@@ -134,8 +164,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) {
 }
 
 int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
-  auto deconv_dw = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata);
-  auto ret = deconv_dw->Execute(task_id);
+  auto deconv_dw_int8 = reinterpret_cast<DeconvolutionDepthwiseInt8CPUKernel *>(cdata);
+  auto ret = deconv_dw_int8->Execute(task_id);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "DeconvolutionDepthwiseInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
     return RET_ERROR;
@@ -155,8 +185,8 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() {
   PackDepthwiseInt8Input(input_addr, packed_input_, conv_param_);
 
   auto output_addr = reinterpret_cast<int8_t *>(outputs_.at(kOutputIndex)->Data());
-  memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
   if (!need_align_) {
+    memset(output_addr, 0, outputs_.at(kOutputIndex)->ElementsNum() * sizeof(int8_t));
     packed_output_ = output_addr;
   }
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h
index a394839bca..74f658b2d2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.h
@@ -43,6 +43,7 @@ class DeconvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
 
   int InitSlideParam();
   int InitWeightBias();
+  int InitBuffer();
   int Execute(int task_id);
 
  private:
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc
index 9117d4f821..122f1fe29d 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp16/conv_depthwise_fp16.cc
@@ -21,6 +21,9 @@
 void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias,
                               int height, int width, int in_kh_step, int in_kw_step, int kernel_w, bool is_relu,
                               bool is_relu6) {
+  for (int c = 0; c < C8NUM; c++) {
+    dst[c] = 0;
+  }
   const float16_t *src_kh = src;
   const float16_t *weight_kh = weight;
   for (int kh = 0; kh < height; kh++) {
@@ -87,6 +90,9 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
     for (int ow = 0; ow < width; ow++) {
       const float16_t *src_kh = src_w;
       const float16_t *weight_kh = weight;
+      for (int c = 0; c < C8NUM; c++) {
+        dst_w[c] = 0;
+      }
       for (int kh = 0; kh < kernel_h; kh++) {
         const float16_t *src_kw = src_kh;
         const float16_t *weight_kw = weight_kh;
@@ -297,4 +303,3 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
   // output nchwc8
 }
 /*deconv depthwise fp16 end*/
-
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc
index 90d7240537..8e706605ee 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/conv_depthwise.cc
@@ -63,6 +63,9 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
                           int in_kh_step, int in_kw_step, int kernel_w, bool is_relu, bool is_relu6) {
   const float *src_kh = src;
   const float *weight_kh = weight;
+  for (int c = 0; c < C4NUM; c++) {
+    dst[c] = 0;
+  }
   for (int kh = 0; kh < height; kh++) {
     const float *src_kw = src_kh;
     const float *weight_kw = weight_kh;
@@ -132,6 +135,9 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
     for (int ow = 0; ow < width; ow++) {
       const float *src_kh = src_w;
       const float *weight_kh = weight;
+      for (int c = 0; c < C4NUM; c++) {
+        dst_w[c] = 0;
+      }
       for (int kh = 0; kh < kernel_h; kh++) {
         const float *src_kw = src_kh;
         const float *weight_kw = weight_kh;
@@ -202,7 +208,7 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
     src += sliding->in_step_;
     dst += sliding->out_step_;
   }  // batch loop
-  // output nc4hwc4
+  // output nhwc4
 }
 /*conv depthwise fp32 end*/
 
@@ -350,6 +356,6 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we
     src += sliding->in_step_;
     dst += sliding->out_step_;
   }  // batch loop
-  // output nc4hwc4
+  // output nhwc4
 }
 /*deconv depthwise fp32 end*/
diff --git a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc
index 02bba0ae38..b44024d913 100644
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/conv_depthwise_int8.cc
@@ -171,7 +171,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
     src += sliding->in_step_;
     dst += sliding->out_step_;
   }  // batch loop
-  // output nc4hwc4
+  // output nhwc4
 }
 /*conv depthwise int8 end*/
 
@@ -317,6 +317,6 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
     src += sliding->in_step_;
     dst += sliding->out_step_;
   }  // batch loop
-  // output nc4hwc4
+  // output nhwc4
 }
 /*deconv depthwise int8 end*/