diff --git a/mindspore/lite/nnacl/conv_parameter.h b/mindspore/lite/nnacl/conv_parameter.h
index ef01f73ab2..ae1a60982f 100644
--- a/mindspore/lite/nnacl/conv_parameter.h
+++ b/mindspore/lite/nnacl/conv_parameter.h
@@ -71,4 +71,57 @@ typedef struct SlidingWindowParam {
   int kernel_step_;
 } SlidingWindowParam;
 
+#define DECONV_WINOGRAD_DEFAULT_UNIT 3
+#define DECONV_WINOGRAD_DEFAULT_TILE 8
+#define DECONV_WINOGRAD_BUFFER_COUNT 8
+typedef struct DeConvWg {
+  void *b_buffer_;
+  void *AT_;
+  void *BT_;
+
+  int kh_;
+  int kw_;
+
+  int k_;
+  int i_;
+  int o_;
+} DeConvWg;
+
+typedef struct DeConvWgABuffer {
+  bool buf_init_;
+  bool trans_formed_;
+  void *middle_buffer_;
+  void *dest_buffer_;
+} DeConvWgABuffer;
+
+typedef struct DeConvComputeUnit {
+  void *weight_;
+  void *tmp_buffer_;
+  int w_start_;
+  int h_start_;
+  int w_size_;
+  int h_size_;
+  bool use_winograd_;
+  DeConvWg winograd_;
+} DeConvComputeUnit;
+
+typedef struct DeConvParam {
+  DeConvComputeUnit *compute_units_;
+  int compute_size_;
+  DeConvWgABuffer a_buffer_[DECONV_WINOGRAD_BUFFER_COUNT];
+  int input_plane_;
+  int output_plane_;
+  int kernel_plane_;
+  int ic_div4_;
+  int oc_div4_;
+  int ic_up4_;
+  int oc_up4_;
+  int thread_num_;
+  int in_tile_count_;
+  int in_tile_h_count_;
+  int in_tile_w_count_;
+  int out_tile_h_;
+  int out_tile_w_;
+} DeConvParam;
+
 #endif  // MINDSPORE_LITE_NNACL_CONV_PARAMETER_H_
diff --git a/mindspore/lite/nnacl/errorcode.h b/mindspore/lite/nnacl/errorcode.h
index a26a095997..f80ce1bb67 100644
--- a/mindspore/lite/nnacl/errorcode.h
+++ b/mindspore/lite/nnacl/errorcode.h
@@ -22,7 +22,7 @@ typedef enum ErrorCodeCommonEnum {
   NNACL_ERR = 1,
   NNACL_NULL_PTR,
   NNACL_PARAM_INVALID,
-  OPLIB_COMMON_END = 9999
+  NNACL_COMMON_END = 9999
 } ErrorCodeCommonEnum;
 
 typedef enum ErrorCodeFp32OpEnum {
@@ -34,6 +34,7 @@ typedef enum ErrorCodeFp32OpEnum {
   NNACL_ERRCODE_LOG_NEGATIVE_OR_ZERO,
   NNACL_ERRCODE_DIVISOR_ZERO,
   NNACL_ERRCODE_INDEX_OUT_OF_RANGE,
+  NNACL_ERRCODE_WINOGRAD_GENERATOR_ERROR,
   NNACL_ERRCODE_OP_FP32_END = 19999
 } ErrorCodeFp32OpEnum;
 
diff --git a/mindspore/lite/nnacl/fp32/common_func.c b/mindspore/lite/nnacl/fp32/common_func.c
index 2ff8a4303f..fe6ed6e9b2 100644
--- a/mindspore/lite/nnacl/fp32/common_func.c
+++ b/mindspore/lite/nnacl/fp32/common_func.c
@@ -15,8 +15,9 @@
  */
 
 #include "nnacl/fp32/common_func.h"
+
 void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_ptr, size_t output_channel,
-                      size_t plane_size, size_t stride, bool is_relu, bool is_relu6, int size) {
+                      size_t plane_size, size_t plane_stride, size_t oc_stride, bool is_relu, bool is_relu6, int size) {
   int oc_div = 0, oc_mod = 0;
   for (int oc = 0; oc < output_channel; oc++) {
     if (size != 0) {
@@ -26,8 +27,8 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p
       return;
     }
     for (int hw = 0; hw < plane_size; hw++) {
-      int src_index = oc_div * size * plane_size + hw * size + oc_mod;
-      int dst_index = hw * stride + oc;
+      int src_index = oc_div * size * plane_stride + hw * size + oc_mod;
+      int dst_index = hw * oc_stride + oc;
       float value = src_ptr_[src_index];
       if (bias_ptr != NULL) {
         value = value + bias_ptr[oc];
@@ -43,7 +44,8 @@ void PostConvFuncComm(const float *src_ptr_, float *out_ptr, const float *bias_p
 void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
                         size_t plane_size, size_t stride, bool is_relu, bool is_relu6) {
 #ifndef ENABLE_ARM
-  PostConvFuncComm(c8_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, stride, is_relu, is_relu6, C8NUM);
+  PostConvFuncComm(c8_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, plane_size, stride, is_relu, is_relu6,
+                   C8NUM);
 #else
   size_t oc8mod = output_channel % C8NUM;
   size_t oc8div = output_channel - oc8mod;
@@ -55,6 +57,59 @@ void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bi
   return;
 }
 
+void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
+                        size_t plane_size, size_t plane_stride, bool is_relu, bool is_relu6) {
+  PostConvFuncComm(c4_out_ptr, out_ptr, bias_ptr, output_channel, plane_size, plane_stride, output_channel, is_relu,
+                   is_relu6, C4NUM);
+  return;
+}
+
+void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
+  int unitStep = 4 * length;
+  for (int y = 0; y < h; ++y) {
+    float *dstY = M + y * w * unitStep;
+    for (int x = 0; x < w; ++x) {
+      float *dstX = dstY + x * unitStep;
+      const float *srcX = S + x * unitStep;
+      memset(dstX, 0, unitStep * sizeof(float));
+      for (int i = 0; i < k; ++i) {
+        float b = B[i * h + y];
+        const float *srcY = srcX + i * w * unitStep;
+        if (0.0f == b) {
+          continue;
+        }
+        for (int j = 0; j < unitStep; ++j) {
+          dstX[j] += srcY[j] * b;
+        }
+      }
+    }
+  }
+}
+
+// M = S * B , M = w*h * l, S = k*h * l, B = w*k
+void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length) {
+  int unitStep = 4 * length;
+  for (int y = 0; y < h; ++y) {
+    float *dstY = M + y * w * unitStep;
+    const float *srcY = S + y * k * unitStep;
+
+    for (int x = 0; x < w; ++x) {
+      float *dstX = dstY + x * unitStep;
+      memset(dstX, 0, unitStep * sizeof(float));
+      for (int i = 0; i < k; ++i) {
+        const float *srcX = srcY + i * unitStep;
+        float b = B[i * h + x];
+        if (0.0f == b) {
+          continue;
+        }
+        for (int j = 0; j < unitStep; ++j) {
+          dstX[j] += srcX[j] * b;
+        }
+      }
+    }
+  }
+}
+
 union float32_bits {
   unsigned int u;
   float f;
diff --git a/mindspore/lite/nnacl/fp32/common_func.h b/mindspore/lite/nnacl/fp32/common_func.h
index 4969851225..ab40622391 100644
--- a/mindspore/lite/nnacl/fp32/common_func.h
+++ b/mindspore/lite/nnacl/fp32/common_func.h
@@ -29,6 +29,12 @@ extern "C" {
 
 void PostConvFuncFp32C8(const float *c8_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
                         size_t plane_size, size_t stride, bool is_relu, bool is_relu6);
+void PostConvFuncFp32C4(const float *c4_out_ptr, float *out_ptr, const float *bias_ptr, size_t output_channel,
+                        size_t plane_size, size_t plane_stride, bool is_relu, bool is_relu6);
+
+void WinogradMatrixProductLeft(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
+void WinogradMatrixProductRight(const float *S, const float *B, float *M, size_t w, size_t h, size_t k, size_t length);
+
 float ShortToFloat32(uint16_t src_value);
 
 uint16_t Float32ToShort(float src_value);
diff --git a/mindspore/lite/nnacl/fp32/deconv.c b/mindspore/lite/nnacl/fp32/deconv.c
index 61c27e12f0..a6f41bd424 100644
--- a/mindspore/lite/nnacl/fp32/deconv.c
+++ b/mindspore/lite/nnacl/fp32/deconv.c
@@ -33,9 +33,10 @@ void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, in
   return;
 }
 
-int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *dst, int output_channel,
-                        ConvParameter *conv_param) {
-  /* row12x8-major(ih*iw x oc*kh*kw)  ->  row8-major(oh*ow x oc) */
+void DeConvPostFp32C8(const float *src, float *tmp, const float *bias, float *dst, int output_channel,
+                      ConvParameter *conv_param) {
+  /* arm64 row12x8-major(ih*iw x oc*kh*kw)  ->  row8-major(oh*ow x oc) */
+  /* arm32 row4x8-major(ih*iw x oc*kh*kw)   ->  row8-major(oh*ow x oc) */
   size_t input_plane = conv_param->input_w_ * conv_param->input_h_;
   size_t kernel_plane = conv_param->kernel_w_ * conv_param->kernel_h_;
   size_t output_plane = conv_param->output_w_ * conv_param->output_h_;
@@ -45,11 +46,11 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
 #else
   const int tile_num = 12;
 #endif
-  int in_plane12 = UP_ROUND(input_plane, tile_num);
+  int in_plane_round = UP_ROUND(input_plane, tile_num);
   int src_iw_stride = C8NUM;
   int src_ih_stride = conv_param->input_w_ * C8NUM;
-  int src_kw_stride = in_plane12 * C8NUM;
-  int src_kh_stride = in_plane12 * conv_param->kernel_w_ * C8NUM;
+  int src_kw_stride = in_plane_round * C8NUM;
+  int src_kh_stride = in_plane_round * conv_param->kernel_w_ * C8NUM;
   int dst_oh_stride = conv_param->output_w_ * C8NUM;
   int dst_ow_stride = C8NUM;
   int dst_kh_stride = conv_param->dilation_h_ * conv_param->output_w_ * C8NUM;
@@ -57,7 +58,7 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
 
   for (int c = 0; c < oc8; c += 8) {
     float *dst_ptr = tmp + c * output_plane;
-    const float *src_ptr = src + c * in_plane12 * kernel_plane;
+    const float *src_ptr = src + c * in_plane_round * kernel_plane;
     memset(dst_ptr, 0, output_plane * C8NUM * sizeof(float));
 
     for (int ih = 0; ih < conv_param->input_h_; ih++) {
@@ -104,5 +105,5 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
 
   PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
                      conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
-  return NNACL_OK;
+  return;
 }
diff --git a/mindspore/lite/nnacl/fp32/deconv.h b/mindspore/lite/nnacl/fp32/deconv.h
index 65b9b41fbe..e3140d39ee 100644
--- a/mindspore/lite/nnacl/fp32/deconv.h
+++ b/mindspore/lite/nnacl/fp32/deconv.h
@@ -22,13 +22,15 @@
 #include "nnacl/conv_parameter.h"
 #include "nnacl/errorcode.h"
 #include "nnacl/fp32/common_func.h"
+#include "nnacl/fp32/conv.h"
+#include "nnacl/minimal_filtering_generator.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 void PackDeConvWeightFp32(const float *weight, float *dst, int input_channel, int output_channel, int plane);
-int DeConvPostFp32C12x8(const float *src, float *tmp_out, const float *bias, float *dst, int output_channel,
-                        ConvParameter *conv_param);
+void DeConvPostFp32C8(const float *src, float *tmp_out, const float *bias, float *dst, int output_channel,
+                      ConvParameter *conv_param);
 #ifdef __cplusplus
 }
 #endif
diff --git a/mindspore/lite/nnacl/fp32/deconv_winograd.c b/mindspore/lite/nnacl/fp32/deconv_winograd.c
new file mode 100644
index 0000000000..24da2dc977
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/deconv_winograd.c
@@ -0,0 +1,337 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nnacl/fp32/deconv_winograd.h"
+
+int PackDeConvWgDataFp32(float *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param,
+                         DeConvParam *deconv_param) {
+  int tmp_kernel_plane = unit->w_size_ * unit->h_size_;
+  int size = conv_param->input_channel_ * conv_param->output_channel_ * tmp_kernel_plane;
+  float *current_unit_weight = (float *)malloc(size * sizeof(float));
+  if (current_unit_weight == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int ic = 0; ic < conv_param->input_channel_; ic++) {
+    float *src_ic = nhwc_weight + deconv_param->kernel_plane_ * conv_param->output_channel_ * ic;
+    float *dst_ic = current_unit_weight + tmp_kernel_plane * conv_param->output_channel_ * ic;
+    for (int uhi = 0; uhi < unit->h_size_; uhi++) {
+      for (int uwi = 0; uwi < unit->w_size_; uwi++) {
+        int src_h_offset = unit->h_start_ + uhi * conv_param->stride_h_;
+        int src_w_offset = unit->w_start_ + uwi * conv_param->stride_w_;
+        float *src_hw = src_ic + (src_h_offset * conv_param->kernel_w_ + src_w_offset) * conv_param->output_channel_;
+        float *dst_hw = dst_ic + (uhi * unit->w_size_ + uwi) * conv_param->output_channel_;
+        memcpy(dst_hw, src_hw, conv_param->output_channel_ * sizeof(float));
+      }
+    }
+  }
+
+  if (unit->use_winograd_) {
+    /* Generate winograd  */
+    float matrix_g[64];
+    float matrix_gt[64];
+    float matrix_a[64];
+    float matrix_at[64];
+    float matrix_b[64];
+    float matrix_bt[64];
+    int ret = CookToomFilter(matrix_a, matrix_at, matrix_b, matrix_bt, matrix_g, matrix_gt, 0.5f,
+                             DECONV_WINOGRAD_DEFAULT_UNIT, unit->h_size_);
+    if (ret != NNACL_OK) {
+      return NNACL_ERRCODE_WINOGRAD_GENERATOR_ERROR;
+    }
+
+    /* winograd AT */
+    unit->winograd_.AT_ = malloc(unit->winograd_.i_ * unit->winograd_.o_ * sizeof(float));
+    if (unit->winograd_.AT_ == NULL) {
+      return NNACL_NULL_PTR;
+    }
+    memcpy(unit->winograd_.AT_, matrix_at, unit->winograd_.i_ * unit->winograd_.o_ * sizeof(float));
+
+    /* winograd BT */
+    unit->winograd_.BT_ = malloc(unit->winograd_.o_ * unit->winograd_.o_ * sizeof(float));
+    if (unit->winograd_.BT_ == NULL) {
+      return NNACL_NULL_PTR;
+    }
+    memcpy(unit->winograd_.BT_, matrix_bt, unit->winograd_.o_ * unit->winograd_.o_ * sizeof(float));
+
+    /* winograd Weight */
+    size = conv_param->input_channel_ * conv_param->output_channel_ * unit->winograd_.kh_ * unit->winograd_.kw_;
+    float *winograd_unit_weight = (float *)malloc(size * sizeof(float));
+    if (winograd_unit_weight == NULL) {
+      return NNACL_NULL_PTR;
+    }
+    WinogradWeightTransform(current_unit_weight, winograd_unit_weight, matrix_g, matrix_gt, C4NUM, unit->winograd_.kh_,
+                            unit->h_size_, conv_param->output_channel_, conv_param->input_channel_, false);
+
+    /* reset weight data & info */
+    tmp_kernel_plane = unit->winograd_.kh_ * unit->winograd_.kw_;
+    free(current_unit_weight);
+    current_unit_weight = winograd_unit_weight;
+    winograd_unit_weight = NULL;
+  }
+
+  /* trans mhwc -> hw1:k1-knc0-c4:k1-knc5-c8:hw2:k1-knc0-c4:k1 */
+  float *dst_weight = (float *)unit->weight_;
+  size = deconv_param->ic_up4_ * deconv_param->oc_up4_ * tmp_kernel_plane;
+  memset(dst_weight, 0, size * sizeof(float));
+  for (int ic = 0; ic < conv_param->input_channel_; ic++) {
+    for (int oc = 0; oc < conv_param->output_channel_; oc++) {
+      int oc4div = oc / C4NUM, oc4mod = oc % C4NUM;
+      for (int upi = 0; upi < tmp_kernel_plane; upi++) {
+        int src_index = ic * conv_param->output_channel_ * tmp_kernel_plane + upi * conv_param->output_channel_ + oc;
+        int dst_index = upi * deconv_param->oc_up4_ * deconv_param->ic_up4_ + oc4div * C4NUM * deconv_param->ic_up4_ +
+                        ic * C4NUM + oc4mod;
+        dst_weight[dst_index] = current_unit_weight[src_index];
+      }
+    }
+  }
+
+  free(current_unit_weight);
+  return NNACL_OK;
+}
+
+void DeConvWgInputPack(float *src_ptr, float *dst_ptr, int channel, int stride) {
+  int ic4div = channel / C4NUM;
+  int ic4mod = channel % C4NUM;
+  float *src = src_ptr;
+  float *dst = dst_ptr;
+
+  for (int ic = 0; ic < ic4div; ic++) {
+    memcpy(dst, src, C4NUM * sizeof(float));
+    dst += stride;
+    src += C4NUM;
+  }
+
+  if (ic4mod != 0) {
+    int ic_res = 0;
+    for (; ic_res < ic4mod; ic_res++) {
+      dst[ic_res] = src[ic_res];
+    }
+    for (; ic_res < C4NUM; ic_res++) {
+      dst[ic_res] = 0;
+    }
+  }
+  return;
+}
+
+void MSGemmFloatCommon_4(float *dst, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step,
+                         size_t dst_depth_quad, size_t width, size_t weight_depth_offset) {
+  int dx, sz, dz;
+  int src_depth_step = 4 * width;
+  for (dz = 0; dz < dst_depth_quad; ++dz) {
+    float *dst_z = dst + dz * dst_step;
+    const float *weight_dz = weight + dz * (src_depth_quad * 16 + weight_depth_offset);
+    for (dx = 0; dx < width; ++dx) {
+      float *dst_x = dst_z + dx * 4;
+      dst_x[0] = 0.0f;
+      dst_x[1] = 0.0f;
+      dst_x[2] = 0.0f;
+      dst_x[3] = 0.0f;
+      const float *src_dx = src + 4 * dx;
+      for (sz = 0; sz < src_depth_quad; ++sz) {
+        const float *src_z = src_dx + sz * src_depth_step;
+        const float *weight_z = weight_dz + sz * 16;
+        for (int i = 0; i < 4; ++i) {
+          for (int j = 0; j < 4; ++j) {
+            dst_x[j] += src_z[i] * weight_z[4 * i + j];
+          }
+        }
+      }
+    }
+  }
+}
+
+void MSGemmFloatUnit_4(float *dstOrigin, const float *src, const float *weight, size_t src_depth_quad, size_t dst_step,
+                       size_t dst_depth_quad, size_t weight_depth_offset) {
+  MSGemmFloatCommon_4(dstOrigin, src, weight, src_depth_quad, dst_step, dst_depth_quad, DECONV_WINOGRAD_DEFAULT_TILE,
+                      weight_depth_offset);
+}
+
+void DeConvWgMerge(const float *source, float *dest, size_t srcStride, size_t dstStride, size_t count) {
+  for (int i = 0; i < count; ++i) {
+    const float *s = source + i * srcStride;
+    float *d = dest + i * dstStride;
+    for (int j = 0; j < 4; ++j) {
+      d[j] += s[j];
+    }
+  }
+}
+
+void _deConvWinograd(float *tile_in, float *tile_out, float *weight_buf, float *tmp_buf, float *at_buf,
+                     float *a_mid_buf, float *trans_a_buf, bool a_trans, float *bt_buf, float *b_tmp_buf, int unit_size,
+                     int w_start, int h_start, ConvParameter *conv_param, DeConvParam *deconv_param) {
+  int winograd_plane = unit_size * unit_size;
+  if (!a_trans) {
+    WinogradMatrixProductLeft(tile_in, at_buf, a_mid_buf, DECONV_WINOGRAD_DEFAULT_UNIT, unit_size,
+                              DECONV_WINOGRAD_DEFAULT_UNIT, deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
+    WinogradMatrixProductRight(a_mid_buf, at_buf, trans_a_buf, unit_size, unit_size, DECONV_WINOGRAD_DEFAULT_UNIT,
+                               deconv_param->ic_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
+  }
+
+  for (int index = 0; index < winograd_plane; index++) {
+    float *src = trans_a_buf + index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
+    float *dst = tmp_buf + index * deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
+    float *weight = weight_buf + index * deconv_param->ic_up4_ * deconv_param->oc_up4_;
+    MSGemmFloatUnit_4(dst, src, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM,
+                      deconv_param->oc_div4_, 0);
+  }
+
+  WinogradMatrixProductLeft(tmp_buf, bt_buf, b_tmp_buf, unit_size, unit_size, unit_size,
+                            deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
+  WinogradMatrixProductRight(b_tmp_buf, bt_buf, tmp_buf, unit_size, unit_size, unit_size,
+                             deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE);
+
+  // Add to dest
+  for (int uhi = 0; uhi < unit_size; uhi++) {
+    int h_index = uhi * conv_param->stride_h_ + h_start;
+    for (int uwi = 0; uwi < unit_size; uwi++) {
+      int w_index = uwi * conv_param->stride_w_ + w_start;
+
+      float *dst = tile_out + w_index * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_ +
+                   h_index * deconv_param->out_tile_w_ * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;
+      float *src = tmp_buf + (uwi + uhi * unit_size) * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;
+      DeConvWgMerge(src, dst, 4, 4, DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_div4_);
+    }
+  }
+  return;
+}
+
+void _deConvCommon(float *tile_in, float *tile_out, float *weight, float *tmp_buf, int h_start, int w_start, int h_size,
+                   int w_size, ConvParameter *conv_param, DeConvParam *deconv_param) {
+  int count = deconv_param->oc_div4_ * w_size * h_size;
+  int in_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
+  int out_stride = DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_up4_;
+
+  for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
+    for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
+      float *src_in = tile_in + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * in_stride;
+      MSGemmFloatUnit_4(tmp_buf, src_in, weight, deconv_param->ic_div4_, DECONV_WINOGRAD_DEFAULT_TILE * 4, count, 0);
+
+      for (int uhi = 0; uhi < h_size; uhi++) {
+        for (int uwi = 0; uwi < w_size; uwi++) {
+          int w_index = (wi + uwi) * conv_param->stride_w_ + w_start;
+          int h_index = (hi + uhi) * conv_param->stride_h_ + h_start;
+          float *dst = tile_out + h_index * out_stride * deconv_param->out_tile_w_ + w_index * out_stride;
+          float *src = tmp_buf + (uwi + uhi * w_size) * out_stride;
+          DeConvWgMerge(src, dst, 4, 4, DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->oc_div4_);
+        }
+      }
+    }
+  }
+
+  return;
+}
+
+void DeconvWg(float *nhwc_input_, float *tile_in, float *tile_out, int start_index, int calculate_count,
+              ConvParameter *conv_param, DeConvParam *deconv_param, int task_id) {
+  /* pack tile input */
+  int tile_in_unit_stride = deconv_param->ic_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
+#ifdef ENABLE_ARM
+  float32x4_t zero = vdupq_n_f32(0.0f);
+#endif
+  for (int unit_index = 0; unit_index < calculate_count; unit_index++) {
+    int plane_index = start_index + unit_index;
+    int w_unit_index = plane_index % deconv_param->in_tile_w_count_;
+    int h_unit_index = plane_index / deconv_param->in_tile_w_count_;
+    int w_start = w_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT;
+    int h_start = h_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT;
+
+    float *dst_unit = tile_in + unit_index * C4NUM;
+    for (int hi = 0; hi < DECONV_WINOGRAD_DEFAULT_UNIT; hi++) {
+      for (int wi = 0; wi < DECONV_WINOGRAD_DEFAULT_UNIT; wi++) {
+        float *dst = dst_unit + (wi + hi * DECONV_WINOGRAD_DEFAULT_UNIT) * tile_in_unit_stride;
+        int w_index = w_start + wi;
+        int h_index = h_start + hi;
+        if (w_index >= conv_param->input_w_ || h_index >= conv_param->input_h_) {
+          for (int ic4_index = 0; ic4_index < deconv_param->ic_div4_; ic4_index++) {
+#ifdef ENABLE_ARM
+            vst1q_f32(dst + ic4_index * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM, zero);
+#else
+            for (int i = 0; i < 4; i++) {
+              dst[C4NUM * DECONV_WINOGRAD_DEFAULT_TILE * ic4_index + i] = 0;
+            }
+#endif
+          }
+          continue;
+        }
+
+        float *src = nhwc_input_ + (w_index + h_index * conv_param->input_w_) * conv_param->input_channel_;
+        DeConvWgInputPack(src, dst, conv_param->input_channel_, DECONV_WINOGRAD_DEFAULT_TILE * C4NUM);
+      }
+    }
+  }
+
+  /* compute */
+  for (int i = 0; i < deconv_param->compute_size_; i++) {
+    DeConvComputeUnit *unit = &deconv_param->compute_units_[i];
+    if (unit->use_winograd_) {
+      float *tmp_buf = (float *)unit->tmp_buffer_ + task_id * unit->winograd_.kh_ * unit->winograd_.kw_ *
+                                                      deconv_param->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM;
+
+      /* winograd a buffer */
+      DeConvWgABuffer *wg_buf = &deconv_param->a_buffer_[unit->winograd_.kh_];
+      float *wg_mid_a_buf = (float *)wg_buf->middle_buffer_ + task_id * unit->winograd_.kw_ * unit->winograd_.kh_ *
+                                                                DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
+      float *wg_dst_a_buf = (float *)wg_buf->dest_buffer_ + task_id * unit->winograd_.kw_ * unit->winograd_.kh_ *
+                                                              DECONV_WINOGRAD_DEFAULT_TILE * deconv_param->ic_up4_;
+      float *tmp_b_buf = (float *)unit->winograd_.b_buffer_ + task_id * unit->winograd_.kh_ * unit->winograd_.kw_ *
+                                                                deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
+      _deConvWinograd(tile_in, tile_out, (float *)unit->weight_, tmp_buf, unit->winograd_.AT_, wg_mid_a_buf,
+                      wg_dst_a_buf, wg_buf->trans_formed_, unit->winograd_.BT_, tmp_b_buf, unit->winograd_.kh_,
+                      unit->w_start_, unit->h_start_, conv_param, deconv_param);
+      wg_buf->trans_formed_ = true;
+    } else {
+      float *tmp_buf = (float *)unit->tmp_buffer_ + task_id * deconv_param->oc_div4_ * unit->w_size_ * unit->h_size_ *
+                                                      DECONV_WINOGRAD_DEFAULT_TILE * C4NUM;
+      _deConvCommon(tile_in, tile_out, (float *)unit->weight_, tmp_buf, unit->h_start_, unit->w_start_, unit->h_size_,
+                    unit->w_size_, conv_param, deconv_param);
+    }
+  }
+  return;
+}
+
+void DeconvWgPost(float *tile_out, float *nc4hw4_output, ConvParameter *conv_param, DeConvParam *deconv_param,
+                  int calculate_count, int tile_index) {
+  /* merge */
+  int src_unit_stride = deconv_param->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE;
+
+  int src_stride = DECONV_WINOGRAD_DEFAULT_TILE * C4NUM;
+  int dst_stride = conv_param->output_w_ * conv_param->output_h_ * C4NUM;
+
+  for (int index = 0; index < calculate_count; ++index) {
+    float *src_start = tile_out + index * C4NUM;
+
+    int plane_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE + index;
+    int w_unit_index = plane_index % deconv_param->in_tile_w_count_;
+    int h_unit_index = plane_index / deconv_param->in_tile_w_count_;
+    int w_start = w_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT * conv_param->stride_w_ - conv_param->pad_l_;
+    int h_start = h_unit_index * DECONV_WINOGRAD_DEFAULT_UNIT * conv_param->stride_h_ - conv_param->pad_u_;
+    float *dst_start = nc4hw4_output + h_start * conv_param->output_w_ * C4NUM + w_start * C4NUM;
+
+    int merge_w_start = MSMAX(-w_start, 0);
+    int merge_h_start = MSMAX(-h_start, 0);
+    int merge_h_end = MSMIN(deconv_param->out_tile_h_, conv_param->output_h_ - h_start);
+    int merge_w_end = MSMIN(deconv_param->out_tile_w_, conv_param->output_w_ - w_start);
+
+    for (int hi = merge_h_start; hi < merge_h_end; hi++) {
+      for (int wi = merge_w_start; wi < merge_w_end; wi++) {
+        float *src = src_start + (hi * deconv_param->out_tile_w_ + wi) * src_unit_stride;
+        float *dst = dst_start + (hi * conv_param->output_w_ + wi) * C4NUM;
+        DeConvWgMerge(src, dst, src_stride, dst_stride, deconv_param->oc_div4_);
+      }
+    }
+  }
+  return;
+}
diff --git a/mindspore/lite/nnacl/fp32/deconv_winograd.h b/mindspore/lite/nnacl/fp32/deconv_winograd.h
new file mode 100644
index 0000000000..576b772d7d
--- /dev/null
+++ b/mindspore/lite/nnacl/fp32/deconv_winograd.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_NNACL_FP32_DECONV_WINOGRAD_H_
+#define MINDSPORE_LITE_NNACL_FP32_DECONV_WINOGRAD_H_
+
+#include <string.h>
+#include "nnacl/pack.h"
+#include "nnacl/op_base.h"
+#include "nnacl/conv_parameter.h"
+#include "nnacl/errorcode.h"
+#include "nnacl/fp32/common_func.h"
+#include "nnacl/minimal_filtering_generator.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int PackDeConvWgDataFp32(float *nhwc_weight, DeConvComputeUnit *unit, ConvParameter *conv_param,
+                         DeConvParam *deconv_param);
+void DeconvWg(float *nhwc_input_, float *tile_in, float *tile_out, int start_index, int calculate_count,
+              ConvParameter *conv_param, DeConvParam *deconv_param, int task_id);
+void DeconvWgPost(float *tile_out, float *nc4hw4_output, ConvParameter *conv_param, DeConvParam *deconv_param,
+                  int calculate_count, int tile_index);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINDSPORE_LITE_NNACL_FP32_DECONV_WINOGRAD_H_
diff --git a/mindspore/lite/nnacl/minimal_filtering_generator.c b/mindspore/lite/nnacl/minimal_filtering_generator.c
index 0d41315051..7e4b0be340 100644
--- a/mindspore/lite/nnacl/minimal_filtering_generator.c
+++ b/mindspore/lite/nnacl/minimal_filtering_generator.c
@@ -254,3 +254,88 @@ void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b,
   }
 }
 #endif
+
+int WinogradWeightTransform(const float *weight_data, float *winograd_data, float *matrix_g, float *matrix_gt,
+                            int oc_block, int input_unit, int kernel_unit, int channel, int batch, bool pack) {
+  // original weight format : ohwi
+  int oc_block_num = UP_DIV(batch, oc_block);
+  int block_stride = channel * oc_block;
+  int block_num_stride = block_stride * oc_block_num;
+
+  // trans_filter = G*g*GT (g represents weight_data)
+  // separate into two steps ===> tmp = (g * GT)T ===> trans = (tmp * GT)T   use same function:MatrixMultiplyWinograd
+  float *tmp_data = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float)));
+  if (tmp_data == NULL) {
+    return NNACL_ERR;
+  }
+  float *trans_out_data = (float *)(malloc(channel * input_unit * input_unit * sizeof(float)));
+  if (trans_out_data == NULL) {
+    free(tmp_data);
+    return NNACL_ERR;
+  }
+
+#ifndef ENABLE_ARM
+  float *tmp_data1 = (float *)(malloc(channel * input_unit * kernel_unit * sizeof(float)));
+  if (tmp_data1 == NULL) {
+    free(tmp_data);
+    free(trans_out_data);
+    return NNACL_ERR;
+  }
+  float *trans_out_data1 = (float *)(malloc(channel * input_unit * input_unit * sizeof(float)));
+  if (trans_out_data1 == NULL) {
+    free(tmp_data);
+    free(tmp_data1);
+    free(trans_out_data);
+    return NNACL_ERR;
+  }
+#endif
+
+  int input_oz_offset = kernel_unit * kernel_unit * channel;
+  for (int i = 0; i < batch; i++) {
+    int out_c_block = i / oc_block;
+    int out_c_res = i % oc_block;
+    int output_oz_offset = out_c_block * block_stride + out_c_res;
+
+#ifndef ENABLE_ARM
+    // tmp_data = g * GT
+    MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit,
+                           channel, channel * 4);
+    // tmp_data1 = (tmp_data)T
+    PackHWCToWHC(tmp_data, tmp_data1, kernel_unit, input_unit, channel);
+    // trans_out_data1 = tmp * GT
+    MatrixMultiplyWinograd(tmp_data1, matrix_gt, trans_out_data1, input_unit, kernel_unit, input_unit, channel,
+                           channel * 4);
+    // trans_out_data = (trans_out_data1)T
+    PackHWCToWHC(trans_out_data1, trans_out_data, input_unit, input_unit, channel);
+#else
+    // tmp = (g * GT)T
+    MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit, kernel_unit, input_unit,
+                           channel, channel * 4);
+    // trans = (tmp * GT)T
+    MatrixMultiplyWinograd(tmp_data, matrix_gt, trans_out_data, input_unit, kernel_unit, input_unit, channel,
+                           channel * 4);
+#endif
+    if (pack) {
+      int in_offset = 0;
+      for (int j = 0; j < input_unit; ++j) {
+        for (int k = 0; k < input_unit; ++k) {
+          for (int c = 0; c < channel; ++c) {
+            *(winograd_data + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c];
+          }
+          in_offset += channel;
+          output_oz_offset += block_num_stride;
+        }
+      }
+    } else {
+      memcpy(winograd_data + i * channel * input_unit * input_unit, trans_out_data,
+             channel * input_unit * input_unit * sizeof(float));
+    }
+  }
+#ifndef ENABLE_ARM
+  free(tmp_data1);
+  free(trans_out_data1);
+#endif
+  free(tmp_data);
+  free(trans_out_data);
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/nnacl/minimal_filtering_generator.h b/mindspore/lite/nnacl/minimal_filtering_generator.h
index 2936bbedc0..53b146940a 100644
--- a/mindspore/lite/nnacl/minimal_filtering_generator.h
+++ b/mindspore/lite/nnacl/minimal_filtering_generator.h
@@ -20,6 +20,8 @@
 #ifdef ENABLE_ARM
 #include <arm_neon.h>
 #endif
+#include <stdbool.h>
+#include "nnacl/pack.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -47,6 +49,9 @@ int CookToomFilter(float *matrix_a, float *matrix_at, float *matrix_b, float *ma
 void MatrixMultiplyWinograd(const float *matix_a, const float *matrix_b, float *matrix_c, int m, int k, int n,
                             int in_channel, int c4_channel);
 
+int WinogradWeightTransform(const float *weight_data, float *winograd_data, float *matrix_g, float *matrix_gt,
+                            int oc_block, int input_unit_, int kernel_unit_, int channel, int batch, bool pack);
+
 #ifdef ENABLE_ARM
 void MatrixMultiplyVec(const float32x4_t *matrix_a, const float32x4_t *matrix_b, float32x4_t *matrix_c,
                        const float *bias, int m, int k, int n);
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
index 47107834e5..60c2cbca39 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -36,89 +36,9 @@ int ConvolutionWinogradCPUKernel::WinogradFilterTransform(const float *weight_da
     MS_LOG(ERROR) << "Divide by zero";
     return RET_ERROR;
   }
-  // original weight format : ohwi
-  auto channel_in = conv_param_->input_channel_;
-  auto channel_out = conv_param_->output_channel_;
-  int oc_block_num = UP_DIV(channel_out, oc_block);
-  int block_stride = channel_in * oc_block;
-  int block_num_stride = block_stride * oc_block_num;
 
-  // trans_filter = G*g*GT (g represents weight_data)
-  // separate into two steps ===> tmp = (g * GT)T ===> trans = (tmp * GT)T   use same function:MatrixMultiplyWinograd
-  auto tmp_data = reinterpret_cast<float *>(malloc(channel_in * input_unit_ * kernel_unit_ * sizeof(float)));
-  if (tmp_data == nullptr) {
-    MS_LOG(ERROR) << "malloc tmp_data failed.";
-    return RET_MEMORY_FAILED;
-  }
-  auto trans_out_data = reinterpret_cast<float *>(malloc(channel_in * input_unit_ * input_unit_ * sizeof(float)));
-  if (trans_out_data == nullptr) {
-    free(tmp_data);
-    MS_LOG(ERROR) << "malloc trans_out_data failed.";
-    return RET_MEMORY_FAILED;
-  }
-
-#ifndef ENABLE_ARM
-  auto tmp_data1 = reinterpret_cast<float *>(malloc(channel_in * input_unit_ * kernel_unit_ * sizeof(float)));
-  if (tmp_data1 == nullptr) {
-    free(tmp_data);
-    free(trans_out_data);
-    MS_LOG(ERROR) << "malloc tmp_data1 failed.";
-    return RET_MEMORY_FAILED;
-  }
-  auto trans_out_data1 = reinterpret_cast<float *>(malloc(channel_in * input_unit_ * input_unit_ * sizeof(float)));
-  if (trans_out_data1 == nullptr) {
-    free(tmp_data);
-    free(tmp_data1);
-    free(trans_out_data);
-    MS_LOG(ERROR) << "malloc trans_out_data1 failed.";
-    return RET_MEMORY_FAILED;
-  }
-#endif
-
-  int input_oz_offset = kernel_unit_ * kernel_unit_ * channel_in;
-  for (int i = 0; i < channel_out; i++) {
-    int out_c_block = i / oc_block;
-    int out_c_res = i % oc_block;
-    int output_oz_offset = out_c_block * block_stride + out_c_res;
-
-#ifndef ENABLE_ARM
-    // tmp_data = g * GT
-    MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit_, kernel_unit_,
-                           input_unit_, channel_in, channel_in * 4);
-    // tmp_data1 = (tmp_data)T
-    PackHWCToWHC(tmp_data, tmp_data1, kernel_unit_, input_unit_, channel_in);
-    // trans_out_data1 = tmp * GT
-    MatrixMultiplyWinograd(tmp_data1, matrix_gt, trans_out_data1, input_unit_, kernel_unit_, input_unit_, channel_in,
-                           channel_in * 4);
-    // trans_out_data = (trans_out_data1)T
-    PackHWCToWHC(trans_out_data1, trans_out_data, input_unit_, input_unit_, channel_in);
-#else
-    // tmp = (g * GT)T
-    MatrixMultiplyWinograd(weight_data + i * input_oz_offset, matrix_gt, tmp_data, kernel_unit_, kernel_unit_,
-                           input_unit_, channel_in, channel_in * 4);
-    // trans = (tmp * GT)T
-    MatrixMultiplyWinograd(tmp_data, matrix_gt, trans_out_data, input_unit_, kernel_unit_, input_unit_, channel_in,
-                           channel_in * 4);
-#endif
-
-    int in_offset = 0;
-    for (int j = 0; j < input_unit_; ++j) {
-      for (int k = 0; k < input_unit_; ++k) {
-        for (int c = 0; c < channel_in; ++c) {
-          *(trans_weight_ + output_oz_offset + c * oc_block) = trans_out_data[in_offset + c];
-        }
-        in_offset += channel_in;
-        output_oz_offset += block_num_stride;
-      }
-    }
-  }
-#ifndef ENABLE_ARM
-  free(tmp_data1);
-  free(trans_out_data1);
-#endif
-  free(tmp_data);
-  free(trans_out_data);
-  return RET_OK;
+  return WinogradWeightTransform(weight_data, trans_weight_, matrix_g, matrix_gt, oc_block, input_unit_, kernel_unit_,
+                                 conv_param_->input_channel_, conv_param_->output_channel_, true);
 }
 
 int ConvolutionWinogradCPUKernel::InitWeightBias() {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
index d219b44def..d3b65cab7c 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc
@@ -15,6 +15,7 @@
  */
 
 #include "src/runtime/kernel/arm/fp32/deconvolution.h"
+#include "src/runtime/kernel/arm/fp32/deconvolution_winograd.h"
 #include "src/runtime/runtime_api.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
@@ -125,9 +126,9 @@ int DeConvolutionCPUKernel::DoDeconv(int task_id) {
             matmul_param_->col_, OutType_C8);
 #endif
 
-  DeConvPostFp32C12x8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
-                      reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id * C8NUM,
-                      output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
+  DeConvPostFp32C8(tmp_buffer, pack_output_ + task_id * thread_stride_ * C8NUM * output_plane_,
+                   reinterpret_cast<float *>(bias_data_) + thread_stride_ * task_id * C8NUM,
+                   output_ptr_ + task_id * thread_stride_ * C8NUM, oc_res, conv_param_);
   return RET_OK;
 }
 
@@ -246,7 +247,17 @@ kernel::LiteKernel *CpuDeConvFp32KernelCreator(const std::vector<lite::Tensor *>
     }
     weight_tensor->SetData(dequant_weight);
   }
-  auto kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
+
+  kernel::LiteKernel *kernel;
+  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
+  if ((conv_param->stride_h_ != 1 || conv_param->stride_w_ != 1) &&
+      (conv_param->dilation_w_ == 1 && conv_param->dilation_h_ == 1)) {
+    /* DeConvolutionWinogradCPUKernel */
+    kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  } else {
+    kernel = new (std::nothrow) kernel::DeConvolutionCPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  }
+
   if (kernel == nullptr) {
     MS_LOG(ERROR) << "kernel is nullptr.";
     if (weight_tensor->data_type() == kNumberTypeInt8) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc
new file mode 100644
index 0000000000..da47475d34
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.cc
@@ -0,0 +1,368 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/arm/fp32/deconvolution_winograd.h"
+#include "src/runtime/runtime_api.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_DeConv2D;
+using mindspore::schema::Format::Format_NHWC;
+
+namespace mindspore::kernel {
+
+DeConvolutionWinogradCPUKernel::~DeConvolutionWinogradCPUKernel() {
+  FreeResizeBuf();
+  FreeDeconvParam();
+  return;
+}
+
+void DeConvolutionWinogradCPUKernel::FreeResizeBuf() {
+  for (int i = 0; i < deconv_param_->compute_size_; i++) {
+    DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
+    if (unit.tmp_buffer_ != nullptr) {
+      free(unit.tmp_buffer_);
+      unit.tmp_buffer_ = nullptr;
+    }
+
+    if (unit.use_winograd_) {
+      if (unit.winograd_.b_buffer_ != nullptr) {
+        free(unit.winograd_.b_buffer_);
+        unit.winograd_.b_buffer_ = nullptr;
+      }
+    }
+  }
+
+  for (int i = 0; i < DECONV_WINOGRAD_BUFFER_COUNT; i++) {
+    DeConvWgABuffer &wg = deconv_param_->a_buffer_[i];
+    if (wg.buf_init_) {
+      if (wg.dest_buffer_ != nullptr) {
+        free(wg.dest_buffer_);
+        wg.dest_buffer_ = nullptr;
+      }
+      if (wg.middle_buffer_ != nullptr) {
+        free(wg.middle_buffer_);
+        wg.middle_buffer_ = nullptr;
+      }
+    }
+    wg.buf_init_ = false;
+  }
+
+  if (nc4hw4_output_ != nullptr) {
+    free(nc4hw4_output_);
+    nc4hw4_output_ = nullptr;
+  }
+
+  if (tile_input_ != nullptr) {
+    free(tile_input_);
+    tile_input_ = nullptr;
+  }
+
+  if (tile_output_ != nullptr) {
+    free(tile_output_);
+    tile_output_ = nullptr;
+  }
+  return;
+}
+
+void DeConvolutionWinogradCPUKernel::FreeDeconvParam() {
+  for (int i = 0; i < deconv_param_->compute_size_; i++) {
+    DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
+
+    if (unit.weight_ != nullptr) {
+      free(unit.weight_);
+      unit.weight_ = nullptr;
+    }
+
+    if (unit.use_winograd_) {
+      if (unit.winograd_.AT_ != nullptr) {
+        free(unit.winograd_.AT_);
+        unit.winograd_.AT_ = nullptr;
+      }
+      if (unit.winograd_.BT_ != nullptr) {
+        free(unit.winograd_.BT_);
+        unit.winograd_.BT_ = nullptr;
+      }
+    }
+  }
+
+  if (deconv_param_ != nullptr) {
+    delete (deconv_param_);
+    deconv_param_ = nullptr;
+  }
+  return;
+}
+
+int DeConvolutionWinogradCPUKernel::InitParameter() {
+  deconv_param_->input_plane_ = conv_param_->input_h_ * conv_param_->input_w_;
+  deconv_param_->output_plane_ = conv_param_->output_h_ * conv_param_->output_w_;
+
+  nc4hw4_output_ =
+    reinterpret_cast<float *>(malloc(deconv_param_->oc_up4_ * deconv_param_->output_plane_ * sizeof(float)));
+
+  deconv_param_->in_tile_w_count_ = UP_DIV(conv_param_->input_w_, DECONV_WINOGRAD_DEFAULT_UNIT);
+  deconv_param_->in_tile_h_count_ = UP_DIV(conv_param_->input_h_, DECONV_WINOGRAD_DEFAULT_UNIT);
+
+  deconv_param_->in_tile_count_ =
+    UP_DIV(deconv_param_->in_tile_w_count_ * deconv_param_->in_tile_h_count_, DECONV_WINOGRAD_DEFAULT_TILE);
+  deconv_param_->thread_num_ = MSMAX(1, op_parameter_->thread_num_);
+  deconv_param_->thread_num_ = MSMIN(deconv_param_->thread_num_, deconv_param_->in_tile_count_);
+
+  thread_num_hw_ = MSMIN(op_parameter_->thread_num_, deconv_param_->output_plane_);
+  thread_stride_hw_ = UP_DIV(deconv_param_->output_plane_, thread_num_hw_);
+
+  int size = deconv_param_->thread_num_ * DECONV_WINOGRAD_DEFAULT_UNIT * DECONV_WINOGRAD_DEFAULT_UNIT *
+             DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_;
+  tile_input_ = reinterpret_cast<float *>(malloc(size * sizeof(float)));
+  memset(tile_input_, 0, size * sizeof(float));
+
+  deconv_param_->out_tile_w_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_w_ + conv_param_->kernel_w_;
+  deconv_param_->out_tile_h_ = (DECONV_WINOGRAD_DEFAULT_UNIT - 1) * conv_param_->stride_h_ + conv_param_->kernel_h_;
+  size = deconv_param_->thread_num_ * deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ *
+         DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->oc_up4_;
+  tile_output_ = reinterpret_cast<float *>(malloc(size * sizeof(float)));
+
+  for (int i = 0; i < deconv_param_->compute_size_; i++) {
+    DeConvComputeUnit &unit = deconv_param_->compute_units_[i];
+    if (unit.use_winograd_) {
+      if (deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ == false) {
+        deconv_param_->a_buffer_[unit.winograd_.kh_].buf_init_ = true;
+        deconv_param_->a_buffer_[unit.winograd_.kh_].trans_formed_ = false;
+
+        size = unit.winograd_.kh_ * unit.winograd_.kw_ * DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_;
+        deconv_param_->a_buffer_[unit.winograd_.kh_].middle_buffer_ =
+          malloc(deconv_param_->thread_num_ * size * sizeof(float));
+        deconv_param_->a_buffer_[unit.winograd_.kh_].dest_buffer_ =
+          malloc(deconv_param_->thread_num_ * size * sizeof(float));
+      }
+
+      unit.winograd_.b_buffer_ = malloc(deconv_param_->thread_num_ * unit.winograd_.kh_ * unit.winograd_.kw_ *
+                                        deconv_param_->oc_up4_ * DECONV_WINOGRAD_DEFAULT_TILE * sizeof(float));
+      unit.tmp_buffer_ = malloc(deconv_param_->thread_num_ * unit.winograd_.kh_ * unit.winograd_.kw_ *
+                                deconv_param_->oc_div4_ * DECONV_WINOGRAD_DEFAULT_TILE * C4NUM * sizeof(float));
+
+    } else {
+      unit.tmp_buffer_ = malloc(deconv_param_->thread_num_ * deconv_param_->oc_div4_ * unit.w_size_ * unit.h_size_ *
+                                DECONV_WINOGRAD_DEFAULT_TILE * C4NUM * sizeof(float));
+    }
+  }
+
+  return RET_OK;
+}
+
+int DeConvWgFp32Run(void *cdata, int task_id) {
+  auto deconvWg = reinterpret_cast<DeConvolutionWinogradCPUKernel *>(cdata);
+  deconvWg->DoDeconv(task_id);
+  return RET_OK;
+}
+
+int DeConvWgPostFp32Run(void *cdata, int task_id) {
+  auto deconvWg = reinterpret_cast<DeConvolutionWinogradCPUKernel *>(cdata);
+  deconvWg->DeDeconvPost(task_id);
+  return RET_OK;
+}
+
+int DeConvolutionWinogradCPUKernel::InitComputeParam() {
+  auto weight_tensor = in_tensors_[1];
+
+  conv_param_->input_channel_ = weight_tensor->Batch();
+  conv_param_->output_channel_ = weight_tensor->Channel();
+  conv_param_->kernel_w_ = weight_tensor->Width();
+  conv_param_->kernel_h_ = weight_tensor->Height();
+
+  deconv_param_->kernel_plane_ = conv_param_->kernel_w_ * conv_param_->kernel_h_;
+  deconv_param_->ic_div4_ = UP_DIV(conv_param_->input_channel_, C4NUM);
+  deconv_param_->oc_div4_ = UP_DIV(conv_param_->output_channel_, C4NUM);
+  deconv_param_->ic_up4_ = deconv_param_->ic_div4_ * C4NUM;
+  deconv_param_->oc_up4_ = deconv_param_->oc_div4_ * C4NUM;
+
+  deconv_param_->compute_size_ = 0;
+  for (int si_h = 0; si_h < conv_param_->stride_h_; si_h++) {
+    for (int si_w = 0; si_w < conv_param_->stride_w_; si_w++) {
+      if (si_h < conv_param_->kernel_h_ && si_w < conv_param_->kernel_w_) {
+        deconv_param_->compute_size_++;
+      }
+    }
+  }
+
+  int size = deconv_param_->compute_size_ * sizeof(DeConvComputeUnit);
+  deconv_param_->compute_units_ = reinterpret_cast<DeConvComputeUnit *>(malloc(size));
+  if (deconv_param_->compute_units_ == nullptr) {
+    return RET_NULL_PTR;
+  }
+  int cur_count = 0;
+  for (int si_h = 0; si_h < conv_param_->stride_h_; si_h++) {
+    if (si_h >= conv_param_->kernel_h_) {
+      continue;
+    }
+    for (int si_w = 0; si_w < conv_param_->stride_w_; si_w++) {
+      if (si_w >= conv_param_->kernel_w_) {
+        continue;
+      }
+
+      int h_size = 1 + (conv_param_->kernel_h_ - si_h - 1) / conv_param_->stride_h_;
+      int w_size = 1 + (conv_param_->kernel_w_ - si_w - 1) / conv_param_->stride_w_;
+
+      DeConvComputeUnit unit;
+
+      unit.h_start_ = si_h;
+      unit.w_start_ = si_w;
+      unit.h_size_ = h_size;
+      unit.w_size_ = w_size;
+
+      if (h_size == w_size) {
+        unit.use_winograd_ = true;
+
+        unit.winograd_.k_ = unit.h_size_;
+        unit.winograd_.i_ = DECONV_WINOGRAD_DEFAULT_UNIT;
+        unit.winograd_.o_ = DECONV_WINOGRAD_DEFAULT_UNIT + unit.h_size_ - 1;
+        unit.winograd_.kh_ = unit.h_size_ + DECONV_WINOGRAD_DEFAULT_UNIT - 1;
+        unit.winograd_.kw_ = unit.w_size_ + DECONV_WINOGRAD_DEFAULT_UNIT - 1;
+
+        unit.winograd_.b_buffer_ = nullptr;
+        unit.weight_ = malloc(unit.winograd_.kh_ * unit.winograd_.kw_ * deconv_param_->oc_up4_ *
+                              deconv_param_->ic_up4_ * sizeof(float));
+      } else {
+        unit.use_winograd_ = false;
+        unit.weight_ = malloc(h_size * w_size * deconv_param_->ic_up4_ * deconv_param_->oc_up4_ * sizeof(float));
+      }
+      unit.tmp_buffer_ = nullptr;
+      deconv_param_->compute_units_[cur_count] = unit;
+      cur_count++;
+    }
+  }
+  return RET_OK;
+}
+
+int DeConvolutionWinogradCPUKernel::InitDataParam() {
+  auto weight_tensor = in_tensors_.at(kWeightIndex);
+  float *nhwc_weight = reinterpret_cast<float *>(weight_tensor->data_c());
+
+  /* unit data : weight & winograd data*/
+  for (int i = 0; i < deconv_param_->compute_size_; i++) {
+    DeConvComputeUnit *unit = &deconv_param_->compute_units_[i];
+    int ret = PackDeConvWgDataFp32(nhwc_weight, unit, conv_param_, deconv_param_);
+    if (ret != RET_OK) {
+      return ret;
+    }
+  }
+
+  /* bias */
+  auto bias_tensor = in_tensors_.at(kBiasIndex);
+  bias_data_ = malloc(deconv_param_->oc_up4_ * sizeof(float));
+  memset(bias_data_, 0, deconv_param_->oc_up4_ * sizeof(float));
+  if (in_tensors_.size() == 3) {
+    memcpy(bias_data_, bias_tensor->data_c(), conv_param_->output_channel_ * sizeof(float));
+  }
+  return RET_OK;
+}
+
+int DeConvolutionWinogradCPUKernel::ReSize() {
+  FreeResizeBuf();
+  ConvolutionBaseCPUKernel::Init();
+  InitParameter();
+  return RET_OK;
+}
+
+int DeConvolutionWinogradCPUKernel::Init() {
+  int error_code = InitComputeParam();
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "InitComputeParam error! ret: " << error_code;
+    return error_code;
+  }
+
+  error_code = InitDataParam();
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "InitWeightBias error! ret: " << error_code;
+    return error_code;
+  }
+
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+
+  return ReSize();
+}
+
+int DeConvolutionWinogradCPUKernel::DoDeconv(int task_id) {
+  for (int tile_index = task_id; tile_index < deconv_param_->in_tile_count_; tile_index += deconv_param_->thread_num_) {
+    float *tile_in = tile_input_ + task_id * DECONV_WINOGRAD_DEFAULT_UNIT * DECONV_WINOGRAD_DEFAULT_UNIT *
+                                     DECONV_WINOGRAD_DEFAULT_TILE * deconv_param_->ic_up4_;
+    int size = deconv_param_->out_tile_w_ * deconv_param_->out_tile_h_ * DECONV_WINOGRAD_DEFAULT_TILE *
+               deconv_param_->oc_div4_ * C4NUM;
+    float *tile_out = tile_output_ + task_id * size;
+    memset(tile_out, 0, size * sizeof(float));
+
+    int start_index = tile_index * DECONV_WINOGRAD_DEFAULT_TILE;
+    int calculate_count = MSMIN(DECONV_WINOGRAD_DEFAULT_TILE,
+                                deconv_param_->in_tile_w_count_ * deconv_param_->in_tile_h_count_ - start_index);
+
+    for (int i = 0; i < DECONV_WINOGRAD_BUFFER_COUNT; i++) {
+      deconv_param_->a_buffer_[i].trans_formed_ = false;
+    }
+    DeconvWg(nhwc_input_, tile_in, tile_out, start_index, calculate_count, conv_param_, deconv_param_, task_id);
+
+    std::unique_lock<std::mutex> merge_lock(lock_);
+    DeconvWgPost(tile_out, nc4hw4_output_, conv_param_, deconv_param_, calculate_count, tile_index);
+  }
+  return RET_OK;
+}
+
+int DeConvolutionWinogradCPUKernel::DeDeconvPost(int task_id) {
+  int rest_plane = deconv_param_->output_plane_ - task_id * thread_stride_hw_;
+  int current_plane = MSMIN(rest_plane, thread_stride_hw_);
+  if (current_plane <= 0) {
+    return RET_OK;
+  }
+
+  PostConvFuncFp32C4(nc4hw4_output_ + task_id * thread_stride_hw_ * C4NUM,
+                     nhwc_output_ + task_id * thread_stride_hw_ * conv_param_->output_channel_,
+                     reinterpret_cast<float *>(bias_data_), conv_param_->output_channel_, current_plane,
+                     deconv_param_->output_plane_, conv_param_->act_type_ == ActType_Relu,
+                     conv_param_->act_type_ == ActType_Relu6);
+  return RET_OK;
+}
+
+int DeConvolutionWinogradCPUKernel::Run() {
+  auto prepare_ret = Prepare();
+  if (prepare_ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
+    return prepare_ret;
+  }
+  float *src_in = reinterpret_cast<float *>(in_tensors_[0]->data_c());
+  float *src_out = reinterpret_cast<float *>(out_tensors_[0]->data_c());
+
+  for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) {
+    nhwc_input_ = src_in + batch_index * deconv_param_->input_plane_ * conv_param_->input_channel_;
+    nhwc_output_ = src_out + batch_index * deconv_param_->output_plane_ * conv_param_->output_channel_;
+
+    ::memset(nc4hw4_output_, 0, deconv_param_->output_plane_ * deconv_param_->oc_div4_ * C4NUM * sizeof(float));
+    for (int i = 0; i < deconv_param_->thread_num_; i++) {
+      DoDeconv(i);
+    }
+    //    ParallelLaunch(this->context_->thread_pool_, DeConvWgFp32Run, this, deconv_param_->thread_num_);
+
+    /*post bias activate and nhwc */
+    ParallelLaunch(this->context_->thread_pool_, DeConvWgPostFp32Run, this, thread_num_hw_);
+  }
+
+  return RET_OK;
+}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h
new file mode 100644
index 0000000000..17205a78f2
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_winograd.h
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_WINOGRAD_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_WINOGRAD_H_
+
+#include <float.h>
+#include <vector>
+#include "src/lite_kernel.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "schema/model_generated.h"
+#include "nnacl/fp32/matmul.h"
+#include "nnacl/fp32/deconv_winograd.h"
+#include "src/runtime/kernel/arm/base/convolution_base.h"
+
+namespace mindspore::kernel {
+class DeConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
+ public:
+  DeConvolutionWinogradCPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
+                                 const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
+                                 const mindspore::lite::PrimitiveC *primitive)
+      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {
+    deconv_param_ = new DeConvParam();
+    for (auto &wg : deconv_param_->a_buffer_) {
+      wg.buf_init_ = false;
+    }
+  }
+  ~DeConvolutionWinogradCPUKernel() override;
+  int Init() override;
+  int Run() override;
+  int ReSize() override;
+
+ public:
+  int DoDeconv(int task_id);
+  int DeDeconvPost(int task_id);
+
+ private:
+  int InitComputeParam();
+  int InitDataParam();
+  int InitParameter();
+  void FreeDeconvParam();
+  void FreeResizeBuf();
+
+ private:
+  DeConvParam *deconv_param_;
+  float *nhwc_input_ = nullptr;
+  float *nhwc_output_ = nullptr;
+  float *nc4hw4_output_ = nullptr;
+  float *tile_input_ = nullptr;
+  float *tile_output_ = nullptr;
+  std::mutex lock_;
+  int thread_num_hw_;
+  int thread_stride_hw_;
+};
+}  // namespace mindspore::kernel
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_DECONVOLUTION_WINOGRAD_H_