!5086 [MS][LITE]optimize winograd

Merge pull request !5086 from fuzhiye/tmp
5 years ago · 8fbde8e349
--- a/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
+++ b/mindspore/lite/nnacl/assembly/arm64/MatmulFp32Opt.S
@@ -47,7 +47,7 @@
 /////////////////////////////////////////////////////////////////////////////////
 //
 // void MatmulFloatNeon64(const float *a, const float *b, float *c, const float *bias, int act_type, int depth
 //                        int row, int col, size_t stride, size_t writeNhwc, size_t writeC4)
 //                        int row, int col, size_t stride, size_t writeNhwc, size_t WriteWino)
 // x0: a
 // x1: b
 // x2: c
@@ -64,11 +64,20 @@ MatmulFloatNeon64Opt:
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
  ldr x9, [sp, #8]
  ldr x14, [sp, #16]
  mov w18, #32 // sizeof(float) * 8
  mul w15, w5, w18 // block stride of lhs/rhs: sizeof(float) * 8 * depth
  mov x11, x3 // bias flag
  mov x18, #4
  ldr x17, [sp]
  cbz x14, NoWinoSteps
  mul x8, x7, x17
  mov x11, #8
  mul x11, x11, x17
  mul x8, x8, x18
  mul x11, x11, x18
 NoWinoSteps:
  mul x17, x17, x18
 L1:
@@ -79,7 +88,6 @@ L1:
 L2:
  mov x16, x1 // reload rhs ptr
  mov w13, w5 // reload depth
  mov x14, x3 // reload bias ptr
  dup v8.4s, wzr
  dup v9.4s, wzr
  dup v10.4s, wzr
@@ -150,7 +158,7 @@ Loop:
  fmla v11.4s, v4.4s, v0.s[1]
  fmla v13.4s, v4.4s, v0.s[2]
  fmla v15.4s, v4.4s, v0.s[3]
  subs w13, w13, #1
  bgt Loop
@@ -173,9 +181,10 @@ LoopEnd:
  fmla v31.4s, v4.4s, v2.s[3]
 Bias:
  cbz x11, Activation
  ld1 {v0.4s}, [x14], #16
  ld1 {v1.4s}, [x14], #16
  cbz x3, Activation
  ld1 {v0.4s}, [x3], #16
  ld1 {v1.4s}, [x3]
  sub x3, x3, #16
  fadd v8.4s, v8.4s, v0.4s
  fadd v9.4s, v9.4s, v1.4s
  fadd v10.4s, v10.4s, v0.4s
@@ -265,10 +274,8 @@ Relu:
  fmax v31.4s, v31.4s, v3.4s
 Write:
  ldr w8, [sp, #8]
  cbz w8, WriteC8
  ldr w8, [sp, #16]
  cbnz w8, WriteC4
  cbnz x14, WriteWino
  cbz x9, WriteC8
  cmp w7, #1
  beq Write1
  cmp w7, #2
@@ -721,39 +728,26 @@ Write7:
  st1 {v31.s}[2], [x16], x17
  b WriteEnd
 WriteC8:
  st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x2], #64
  st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [x2], #64
  st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
  st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
  st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x2], #64
  st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x2], #64
  st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x2], #64
  st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64
  st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x2], #64
  st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64
  st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x2], #64
  st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x2], #64
  b WriteEnd
 WriteC4:
  st1 {v8.8h}, [x2], #16
  st1 {v10.8h}, [x2], #16
  st1 {v12.8h}, [x2], #16
  st1 {v14.8h}, [x2], #16
  st1 {v16.8h}, [x2], #16
  st1 {v18.8h}, [x2], #16
  st1 {v20.8h}, [x2], #16
  st1 {v22.8h}, [x2], #16
  st1 {v24.8h}, [x2], #16
  st1 {v26.8h}, [x2], #16
  st1 {v28.8h}, [x2], #16
  st1 {v30.8h}, [x2], #16
  add x18, x2, x17
  st1 {v9.8h}, [x18], #16
  st1 {v11.8h}, [x18], #16
  st1 {v13.8h}, [x18], #16
  st1 {v15.8h}, [x18], #16
  st1 {v17.8h}, [x18], #16
  st1 {v19.8h}, [x18], #16
  st1 {v21.8h}, [x18], #16
  st1 {v23.8h}, [x18], #16
  st1 {v25.8h}, [x18], #16
  st1 {v27.8h}, [x18], #16
  st1 {v29.8h}, [x18], #16
  st1 {v31.8h}, [x18], #16
 WriteWino:
  st1 {v8.4s, v9.4s}, [x18], x8
  st1 {v10.4s, v11.4s}, [x18], x8
  st1 {v12.4s, v13.4s}, [x18], x8
  st1 {v14.4s, v15.4s}, [x18], x8
  st1 {v16.4s, v17.4s}, [x18], x8
  st1 {v18.4s, v19.4s}, [x18], x8
  st1 {v20.4s, v21.4s}, [x18], x8
  st1 {v22.4s, v23.4s}, [x18], x8
  st1 {v24.4s, v25.4s}, [x18], x8
  st1 {v26.4s, v27.4s}, [x18], x8
  st1 {v28.4s, v29.4s}, [x18], x8
  st1 {v30.4s, v31.4s}, [x18], x8
  b WriteEnd
 Write8:
  st1 {v8.4s, v9.4s}, [x18], x17
@@ -798,15 +792,15 @@ WriteEnd:
 End2:
  subs w7, w7, #8 // rhs col - 8
  add x1, x1, x15 // rhs ptr + stride
  cbz x3, NoBiasStep
  add x3, x3, #32 // bias ptr + stride
  ldr w8, [sp, #8]
  cbz w8, NoDstStep
  ldr w8, [sp, #16]
  cbnz w8, C4DstStep
 NoBiasStep:
  cbnz x14, WinoDstStep
  cbz x9, NoDstStep
  add x2, x2, #32 // dst ptr + stride
  b NoDstStep
 C4DstStep:
  add x2, x2, x17
 WinoDstStep:
  add x2, x2, x11
 NoDstStep:
  bgt L1
--- a/mindspore/lite/nnacl/conv_parameter.h
+++ b/mindspore/lite/nnacl/conv_parameter.h
@@ -32,8 +32,6 @@ typedef struct ConvParameter {
  int stride_w_;
  int dilation_h_;
  int dilation_w_;
  int pad_h_;
  int pad_w_;
  int pad_u_;
  int pad_d_;
  int pad_l_;
@@ -51,8 +49,7 @@ typedef struct ConvParameter {
  int thread_num_;
  int input_unit_;
  int output_unit_;
  bool is_relu_;
  bool is_relu6_;
  ActType act_type_;
 } ConvParameter;
 typedef struct SlidingWindowParam {
--- a/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_depthwise_fp16.c
@@ -53,16 +53,18 @@ void DepthwiseBorderPixelFp16(float16_t *dst, const float16_t *src, const float1
 void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
                         int bottom, int left, int right, const ConvParameter *conv_param,
                         const SlidingWindowParam *sliding) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  float16_t *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const float16_t *src_h = src + ih * sliding->in_h_step_;
    float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const float16_t *src_w = src_h + iw * sliding->block_channel_;
@@ -72,11 +74,10 @@ void DepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float16_t *
 #ifdef ENABLE_ARM64
      ConvDwFp16Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                       sliding->in_kh_step_ * sizeof(float16_t), sliding->in_kw_step_ * sizeof(float16_t),
                       conv_param->kernel_w_ * C8NUM * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
                       conv_param->kernel_w_ * C8NUM * sizeof(float16_t), relu, relu6);
 #else
      DepthwiseBorderPixelFp16(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM,
                               conv_param->is_relu_, conv_param->is_relu6_);
                               sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C8NUM, relu, relu6);
 #endif
      dst_kernel += sliding->block_channel_;
    }  // width loop
@@ -139,6 +140,8 @@ void DepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float16_t *
 void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const float16_t *weight_data,
                  const float16_t *bias_data, const ConvParameter *conv_param, const SlidingWindowParam *sliding,
                  int task_id) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  const float16_t *src = input_data;
  float16_t *dst = output_data;
  for (int b = 0; b < conv_param->output_batch_; b++) {
@@ -157,8 +160,8 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
                          conv_param->output_w_, conv_param, sliding);
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        const float16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
        float16_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
 #ifdef ENABLE_ARM64
@@ -166,12 +169,12 @@ void ConvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const flo
                         conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float16_t),
                         sliding->block_channel_ * sizeof(float16_t), sliding->in_sh_step_ * sizeof(float16_t),
                         sliding->in_sw_step_ * sizeof(float16_t), sliding->in_kh_step_ * sizeof(float16_t),
                         sliding->in_kw_step_ * sizeof(float16_t), conv_param->is_relu_, conv_param->is_relu6_);
                         sliding->in_kw_step_ * sizeof(float16_t), relu, relu6);
 #else
        DepthwiseCenterFp16(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_,
                            sliding->right_ - sliding->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                            sliding->out_h_step_, sliding->block_channel_, sliding->in_sh_step_, sliding->in_sw_step_,
                            sliding->in_kh_step_, sliding->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
                            sliding->in_kh_step_, sliding->in_kw_step_, relu, relu6);
 #endif
      }
    }  // output C8 loop
@@ -210,14 +213,14 @@ void DeconvDepthwiseBorderFp16(float16_t *dst, const float16_t *src, const float
                               const SlidingWindowParam *sliding) {
  const float16_t *src_h = src + top * sliding->out_h_step_;
  for (int ih = top; ih < bottom; ih++) {
    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
    int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
    float16_t *dst_h = dst + oh * sliding->in_h_step_;
    const float16_t *src_kernel = src_h + left * sliding->block_channel_;
    for (int iw = left; iw < right; iw++) {
      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
      float16_t *dst_w = dst_h + ow * sliding->block_channel_;
@@ -282,12 +285,14 @@ void DeconvDepthwiseCenterFp16(float16_t *dst, const float16_t *src, const float
 void DeconvDepthwisePostFuncFp16(float16_t *dst, const float16_t *bias, int block_channel,
                                 const ConvParameter *conv_param) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  float16_t *dst_k = dst;
  for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
    for (int c = 0; c < C8NUM; c++) {
      dst_k[c] += bias[c];
      dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
      dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
      dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
      dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
    }
    dst_k += block_channel;
  }
@@ -315,8 +320,8 @@ void DeconvDwC8Fp16(float16_t *output_data, const float16_t *input_data, const f
                                conv_param->input_w_, conv_param, sliding);
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        float16_t *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
        const float16_t *in_t =
          src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
--- a/mindspore/lite/nnacl/fp16/conv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/conv_fp16.c
@@ -173,16 +173,18 @@ void SWBorderPixel(float16_t *dst, const float16_t *src, const float16_t *weight
 void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight, const float16_t *bias, int top,
                  int bottom, int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  float16_t *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const float16_t *src_h = src + ih * sliding->in_h_step_;
    float16_t *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const float16_t *src_w = src_h + iw * sliding->ic4_channel_;
@@ -192,7 +194,7 @@ void SWBorderFp16(float16_t *dst, const float16_t *src, const float16_t *weight,
      SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                    sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_,
                    sliding->ic4_channel_, conv_param->is_relu_, conv_param->is_relu6_);
                    sliding->ic4_channel_, relu, relu6);
      dst_kernel += sliding->block_channel_;
    }  // width loop
@@ -273,6 +275,8 @@ void SWCenterFp16(float16_t *dst, const float16_t *src, const float16_t *weight,
 void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, const float16_t *bias_data,
                float16_t *tmp_out_block, float16_t *output_data, int task_id, ConvParameter *conv_param,
                SlidingWindowParam *slidingWindow_param) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  int oc4_res = conv_param->output_channel_ % C4NUM;
  const float16_t *src = input_data;
  float16_t *dst;
@@ -299,8 +303,8 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con
      if (slidingWindow_param->right_ > slidingWindow_param->left_ &&
          slidingWindow_param->bottom_ > slidingWindow_param->top_) {
        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        const float16_t *in_t =
          src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
        float16_t *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
@@ -310,7 +314,7 @@ void ConvSWFp16(const float16_t *input_data, const float16_t *packed_weight, con
                     conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_,
                     slidingWindow_param->ic4_channel_, slidingWindow_param->in_sh_step_,
                     slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
                     slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
                     slidingWindow_param->in_kw_step_, relu, relu6);
      }
    }  // output C4 loop
    src += slidingWindow_param->in_step_;
@@ -330,8 +334,8 @@ void ConvFp16(float16_t *input_data, float16_t *packed_input, float16_t *packed_
  int out_h = conv_param->output_h_;
  int out_w = conv_param->output_w_;
  int out_channel = conv_param->output_channel_;
  bool relu = conv_param->is_relu_;
  bool relu6 = conv_param->is_relu6_;
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  int thread_count = conv_param->thread_num_;
  const int tile_n = 16;
  int output_count = out_h * out_w;
--- a/mindspore/lite/nnacl/fp16/deconv_fp16.c
+++ b/mindspore/lite/nnacl/fp16/deconv_fp16.c
@@ -73,8 +73,8 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,
    for (int ih = 0; ih < conv_param->input_h_; ih++) {
      for (int iw = 0; iw < conv_param->input_w_; iw++) {
        int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
        int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
        int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
        int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
@@ -112,7 +112,7 @@ int DeConvPostFp16(const float16_t *src, float16_t *tmp, const float16_t *bias,
    }       /*ih*/
  }         /*oc8*/
  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
                     conv_param->is_relu6_);
  PostConvFuncFp16C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
                     conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp16/pack_fp16.c
+++ b/mindspore/lite/nnacl/fp16/pack_fp16.c
@@ -21,14 +21,14 @@
 void Conv1x1InputPackFp16(const float16_t *src, float16_t *dst, ConvParameter *conv_param) {
  /* support nhwc */
  for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
    if (src_h < 0 || src_h >= conv_param->input_h_) {
      continue;
    }
    const float16_t *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_;
    float16_t *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_;
    for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
      if (src_w < 0 || src_w >= conv_param->input_w_) {
        continue;
      }
@@ -46,8 +46,8 @@ void Im2ColPackUnitFp16(float16_t *input_data, ConvParameter *conv_param, float1
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
--- a/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
+++ b/mindspore/lite/nnacl/fp16/winograd_transform_fp16.c
@@ -230,8 +230,8 @@ void Conv3x3Fp16InputTransform(const float16_t *input_data, float16_t *trans_inp
  int input_channel = conv_param->input_channel_;
  int input_width = conv_param->input_w_;
  int input_height = conv_param->input_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_l_;
  int pad_h = conv_param->pad_u_;
  int ic8 = UP_DIV(input_channel, C8NUM);
  if (out_w_block == 0) {
    return;
@@ -576,8 +576,8 @@ void WinogradInputTransformFp16(const float16_t *input_data, float16_t *trans_in
  int output_unit = conv_param->output_unit_;
  int in_channel = conv_param->input_channel_;
  int ic8 = UP_DIV(in_channel, C8NUM);
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int input_h = conv_param->input_h_;
  int input_w = conv_param->input_w_;
  if (out_w_block_num == 0) {
--- a/mindspore/lite/nnacl/fp32/conv.c
+++ b/mindspore/lite/nnacl/fp32/conv.c
@@ -18,6 +18,7 @@
 #include <string.h>
 #include "nnacl/fp32/common_func.h"
 #include "nnacl/winograd_transform.h"
 #include "nnacl/fp32/matmul.h"
 void SWBorderPixel(float *dst, const float *src, const float *weight, const float *bias, int height, int width,
                   int in_kh_step, int in_kw_step, int kernel_h, int kernel_w, int ic4, bool is_relu, bool is_relu6) {
@@ -57,16 +58,18 @@ void SWBorderPixel(float *dst, const float *src, const float *weight, const floa
 void SWBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom, int left,
              int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
  int ic4 = sliding->ic4_channel_ / C4NUM;
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  float *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const float *src_h = src + ih * sliding->in_h_step_;
    float *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const float *src_w = src_h + iw * sliding->ic4_channel_;
@@ -75,8 +78,8 @@ void SWBorder(float *dst, const float *src, const float *weight, const float *bi
      const float *weight_kernel = weight + (start_kh * conv_param->kernel_w_ + start_kw) * sliding->ic4_channel_;
      SWBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                    sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4,
                    conv_param->is_relu_, conv_param->is_relu6_);
                    sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_h_, conv_param->kernel_w_, ic4, relu,
                    relu6);
      dst_kernel += sliding->block_channel_;
    }  // width loop
@@ -144,6 +147,8 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
                float *output_data, int task_id, ConvParameter *conv_param, SlidingWindowParam *slidingWindow_param) {
  int ic4 = slidingWindow_param->ic4_channel_ / C4NUM;
  int oc4_res = conv_param->output_channel_ % C4NUM;
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  const float *src = input_data;
  float *dst = NULL;
  if (oc4_res == 0) {
@@ -169,28 +174,26 @@ void ConvSWFp32(const float *input_data, const float *packed_weight, const float
      if (slidingWindow_param->right_ > slidingWindow_param->left_ &&
          slidingWindow_param->bottom_ > slidingWindow_param->top_) {
        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int in_h_start = slidingWindow_param->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int in_w_start = slidingWindow_param->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        const float *in_t =
          src_data + in_h_start * slidingWindow_param->in_h_step_ + in_w_start * slidingWindow_param->ic4_channel_;
        float *out_t = dst_data + slidingWindow_param->top_ * slidingWindow_param->out_h_step_ +
                       slidingWindow_param->left_ * slidingWindow_param->block_channel_;
 #ifdef ENABLE_ARM64
        ConvSwFp32Center(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
                         slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
                         conv_param->kernel_w_, slidingWindow_param->out_h_step_ * sizeof(float),
                         slidingWindow_param->block_channel_ * sizeof(float), ic4,
                         slidingWindow_param->in_sh_step_ * sizeof(float),
                         slidingWindow_param->in_sw_step_ * sizeof(float),
                         slidingWindow_param->in_kh_step_ * sizeof(float),
                         slidingWindow_param->in_kw_step_ * sizeof(float),
                         conv_param->is_relu_, conv_param->is_relu6_);
        ConvSwFp32Center(
          out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
          slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
          slidingWindow_param->out_h_step_ * sizeof(float), slidingWindow_param->block_channel_ * sizeof(float), ic4,
          slidingWindow_param->in_sh_step_ * sizeof(float), slidingWindow_param->in_sw_step_ * sizeof(float),
          slidingWindow_param->in_kh_step_ * sizeof(float), slidingWindow_param->in_kw_step_ * sizeof(float), relu,
          relu6);
 #else
        SWCenter(out_t, in_t, weight, bias, slidingWindow_param->bottom_ - slidingWindow_param->top_,
                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_,
                 conv_param->kernel_w_, slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
                 slidingWindow_param->right_ - slidingWindow_param->left_, conv_param->kernel_h_, conv_param->kernel_w_,
                 slidingWindow_param->out_h_step_, slidingWindow_param->block_channel_, ic4,
                 slidingWindow_param->in_sh_step_, slidingWindow_param->in_sw_step_, slidingWindow_param->in_kh_step_,
                 slidingWindow_param->in_kw_step_, conv_param->is_relu_, conv_param->is_relu6_);
                 slidingWindow_param->in_kw_step_, relu, relu6);
 #endif
      }
    }  // output C4 loop
@@ -219,6 +222,8 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
  int kernel_plane = kernel_h * kernel_w;
  int unit_size = kernel_plane * ic4 * C4NUM;
  int packed_input_size = output_tile_count * TILE_NUM * unit_size;
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  // we accumulate 4 channels per time for input blocks
  int conv_depth = kernel_h * kernel_w;
@@ -240,11 +245,11 @@ void ConvFp32(float *input_data, float *packed_input, float *packed_weight, cons
      if (real_cal_num == TILE_NUM) {
        float *gemm_output = output_data + out_offset;
        gemm_func(gemm_output, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0, 0,
                  conv_param->is_relu_, conv_param->is_relu6_);
                  relu, relu6);
      } else {
        // res part
        gemm_func(tmp_out_block, gemm_input, packed_weight, bias_data, conv_depth, ic4, out_channel, output_offset, 0,
                  0, conv_param->is_relu_, conv_param->is_relu6_);
                  0, relu, relu6);
        memcpy(output_data + out_offset, tmp_out_block, real_cal_num * out_channel * sizeof(float));
      }
    }
@@ -264,34 +269,42 @@ void ConvWinogardFp32(float *input_data, float *trans_weight, const float *bias_
  int out_w_block = UP_DIV(conv_param->output_w_, out_unit);
  int out_h_block = UP_DIV(conv_param->output_h_, out_unit);
  int output_count = out_w_block * out_h_block;
  int output_tile_count = UP_DIV(output_count, TILE_NUM);
  int output_tile_count = UP_DIV(output_count, C12NUM);
  int out_channel = conv_param->output_channel_;
  int oc4 = UP_DIV(out_channel, C4NUM);
  int oc8 = UP_DIV(out_channel, C8NUM);
  int input_unit_square = input_unit * input_unit;
  size_t output_offset = oc4 * C4NUM * input_unit_square * sizeof(float);
  float *trans_input = buffer_list[0];
  float *gemm_out = buffer_list[1];
  float *tmp_out_data = buffer_list[2];
  float *tmp_data = buffer_list[3];
  int trans_input_offset = TILE_NUM * input_unit_square * ic4 * C4NUM;
  int gemm_out_offset = TILE_NUM * input_unit_square * oc4 * C4NUM;
  float *col_buffer = buffer_list[4];
  int trans_input_offset = C12NUM * input_unit_square * ic4 * C4NUM;
  int gemm_out_offset = C12NUM * input_unit_square * oc8 * C8NUM;
  int tmp_data_offset = input_unit_square * C4NUM;
  int col_buffer_offset = C12NUM * ic4 * C4NUM;
  // step 1 : filter transform (pre-processed offline)
  // step 2 : input transform (online)
  for (int b = 0; b < in_batch; b++) {
    int in_batch_offset = b * ic4 * C4NUM * conv_param->input_h_ * conv_param->input_w_;
    int tmp_out_batch_offset = b * out_w_block * out_h_block * out_unit * out_unit * oc4 * C4NUM;
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_num) {
      int out_tile_index = thread_id * TILE_NUM;
      int cal_num = output_count - thread_id * TILE_NUM;
      cal_num = cal_num > TILE_NUM ? TILE_NUM : cal_num;
      int out_tile_index = thread_id * C12NUM;
      int cal_num = output_count - thread_id * C12NUM;
      cal_num = cal_num > C12NUM ? C12NUM : cal_num;
      WinogradInputTransform(input_data + in_batch_offset, trans_input + task_id * trans_input_offset,
                             tmp_data + task_id * tmp_data_offset, cal_num, out_tile_index, out_w_block, conv_param,
                             input_trans_func);
      // step 3 : gemm
      gemm_func(gemm_out + task_id * gemm_out_offset, trans_input + task_id * trans_input_offset, trans_weight, NULL,
                input_unit_square, ic4, oc4 * C4NUM, output_offset, 1, 1, 0, 0);
      float *src_ptr = trans_input + task_id * trans_input_offset;
      float *dst_ptr = gemm_out + task_id * gemm_out_offset;
      float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
      for (int i = 0; i < input_unit_square; ++i) {
        RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM);
        MatMulOpt(tmp_col_ptr, trans_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0, ic4 * C4NUM,
                  C12NUM, oc8 * C8NUM, input_unit_square, 2);
      }
      // step 4 : output transform
      WinogradOutputTransform(gemm_out + task_id * gemm_out_offset, tmp_out_data + tmp_out_batch_offset, bias_data,
@@ -442,18 +455,21 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
  int ic4 = UP_DIV(conv_param->input_channel_, C4NUM);
  int output_channel = conv_param->output_channel_;
  int oc4 = UP_DIV(output_channel, C4NUM);
  int oc8 = UP_DIV(output_channel, C8NUM);
  int out_w_block = UP_DIV(conv_param->output_w_, OUPUT_UNIT);
  int out_h_block = UP_DIV(conv_param->output_h_, OUPUT_UNIT);
  int output_count = out_w_block * out_h_block;
  int output_tile_count = UP_DIV(output_count, TILE_NUM);
  int output_tile_count = UP_DIV(output_count, C12NUM);
  const int input_unit_square = 4 * 4;
  float *tile_buffer = buffer_list[0];
  float *block_unit_buffer = buffer_list[1];
  float *tmp_dst_buffer = buffer_list[2];
  float *nc4hw4_out = buffer_list[3];
  int tile_buffer_offset = TILE_NUM * input_unit_square * ic4 * C4NUM;
  float *col_buffer = buffer_list[4];
  int tile_buffer_offset = C12NUM * input_unit_square * ic4 * C4NUM;
  int block_unit_buffer_offset = input_unit_square * C4NUM;
  int tmp_dst_buffer_offset = TILE_NUM * input_unit_square * oc4 * C4NUM;
  int tmp_dst_buffer_offset = C12NUM * input_unit_square * oc8 * C8NUM;
  int col_buffer_offset = C12NUM * ic4 * C4NUM;
  int input_batch = conv_param->input_batch_;
  for (int batch = 0; batch < input_batch; batch++) {
@@ -461,15 +477,20 @@ void Conv3x3Fp32(float *input_data, float *transed_weight, const float *bias_dat
    int nc4hw4_buffer_offset = batch * oc4 * C4NUM * conv_param->output_h_ * conv_param->output_w_;
    for (int thread_id = task_id; thread_id < output_tile_count; thread_id += thread_count) {
      int start_index = thread_id * TILE_NUM;
      int real_cal_num = (output_count - start_index) < TILE_NUM ? (output_count - start_index) : TILE_NUM;
      int start_index = thread_id * C12NUM;
      int real_cal_num = (output_count - start_index) < C12NUM ? (output_count - start_index) : C12NUM;
      Conv3x3Fp32InputTransform(input_data + in_batch_offset, tile_buffer + task_id * tile_buffer_offset,
                                block_unit_buffer + task_id * block_unit_buffer_offset, start_index, real_cal_num,
                                out_w_block, conv_param);
      gemm_func(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, tile_buffer + task_id * tile_buffer_offset,
                transed_weight, NULL, input_unit_square, ic4, oc4 * C4NUM,
                oc4 * C4NUM * input_unit_square * sizeof(float), 1, 1, 0, 0);
      float *src_ptr = tile_buffer + task_id * tile_buffer_offset;
      float *tmp_col_ptr = col_buffer + task_id * col_buffer_offset;
      float *dst_ptr = tmp_dst_buffer + task_id * tmp_dst_buffer_offset;
      for (int i = 0; i < input_unit_square; ++i) {
        RowMajor2Col12Major(src_ptr + i * C12NUM * ic4 * C4NUM, tmp_col_ptr, C12NUM, ic4 * C4NUM);
        MatMulOpt(tmp_col_ptr, transed_weight + i * ic4 * C4NUM * oc8 * C8NUM, dst_ptr + i * C8NUM, NULL, 0,
                  ic4 * C4NUM, C12NUM, oc8 * C8NUM, input_unit_square, 2);
      }
      Conv3x3Fp32OutputTransform(tmp_dst_buffer + task_id * tmp_dst_buffer_offset, nc4hw4_out + nc4hw4_buffer_offset,
                                 bias_data, start_index, real_cal_num, out_w_block, conv_param);
--- a/mindspore/lite/nnacl/fp32/conv_depthwise.c
+++ b/mindspore/lite/nnacl/fp32/conv_depthwise.c
@@ -38,13 +38,15 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
  int h_start = h_step * task_id;
  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  for (int b = 0; b < conv_param->output_batch_; b++) {
    const float *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
    float *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
    for (int oh = h_start; oh < h_end; oh++) {
      float *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;
      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_h_;
      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));
@@ -60,13 +62,13 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
        int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;
        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
          int out_w_start = MSMAX(
            0, (conv_param->pad_w_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_w_ -
            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
                                                         conv_param->stride_w_);
          float *dst_w = dst_data + out_w_start * conv_param->output_channel_;
          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_w_ + conv_param->dilation_w_ * kw;
          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;
          const float *src_kw = src_kh + iw_origin * conv_param->input_channel_;
          int num_pixels = out_w_end - out_w_start;
@@ -75,10 +77,10 @@ void ConvDw(float *output_data, const float *input_data, const float *weight_dat
          weight_kh += conv_param->output_channel_;
        }
      }
      if (conv_param->is_relu_) {
      if (relu) {
        ReluFp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
      }
      if (conv_param->is_relu6_) {
      if (relu6) {
        Relu6Fp32(dst_data, dst_data, conv_param->output_w_ * conv_param->output_channel_);
      }
    }
@@ -91,16 +93,16 @@ void InitSlidingParam(SlidingWindowParam *sliding, const ConvParameter *conv_par
  int top = 0;
  int bottom = conv_param->output_h_;
  for (; left * conv_param->stride_w_ < conv_param->pad_w_; left++) {
  for (; left * conv_param->stride_w_ < conv_param->pad_l_; left++) {
  }
  for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_w_ + conv_param->kernel_w_ * conv_param->dilation_w_ >
  for (; (right - 1) * conv_param->stride_w_ - conv_param->pad_l_ + conv_param->kernel_w_ * conv_param->dilation_w_ >
           conv_param->input_w_ &&
         right > left;
       right--) {
  }
  for (; top * conv_param->stride_h_ < conv_param->pad_h_; top++) {
  for (; top * conv_param->stride_h_ < conv_param->pad_u_; top++) {
  }
  for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_h_ + conv_param->kernel_h_ * conv_param->dilation_h_ >
  for (; (bottom - 1) * conv_param->stride_h_ - conv_param->pad_u_ + conv_param->kernel_h_ * conv_param->dilation_h_ >
           conv_param->input_h_ &&
         bottom > top;
       bottom--) {
@@ -181,16 +183,18 @@ void DepthwiseBorderPixel(float *dst, const float *src, const float *weight, con
 void DepthwiseBorder(float *dst, const float *src, const float *weight, const float *bias, int top, int bottom,
                     int left, int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  float *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const float *src_h = src + ih * sliding->in_h_step_;
    float *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const float *src_w = src_h + iw * sliding->block_channel_;
@@ -201,11 +205,10 @@ void DepthwiseBorder(float *dst, const float *src, const float *weight, const fl
 #ifdef ENABLE_ARM64
      ConvDwFp32Border(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                       sliding->in_kh_step_ * sizeof(float), sliding->in_kw_step_ * sizeof(float),
                       conv_param->kernel_w_ * C4NUM * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
                       conv_param->kernel_w_ * C4NUM * sizeof(float), relu, relu6);
 #else
      DepthwiseBorderPixel(dst_kernel, src_kernel, weight_kernel, bias, end_kh - start_kh, end_kw - start_kw,
                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM,
                           conv_param->is_relu_, conv_param->is_relu6_);
                           sliding->in_kh_step_, sliding->in_kw_step_, conv_param->kernel_w_ * C4NUM, relu, relu6);
 #endif
      dst_kernel += sliding->block_channel_;
    }  // width loop
@@ -259,6 +262,8 @@ void DepthwiseCenter(float *dst, const float *src, const float *weight, const fl
 // conv depthwise fp32: sliding window
 void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weight_data, const float *bias_data,
                  const ConvParameter *conv_param, const SlidingWindowParam *sliding, int task_id) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  const float *src = input_data;
  float *dst = output_data;
  for (int b = 0; b < conv_param->output_batch_; b++) {
@@ -277,8 +282,8 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
                      conv_param->output_w_, conv_param, sliding);
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        const float *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
        float *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
 #ifdef ENABLE_ARM64
@@ -286,12 +291,12 @@ void ConvDwC4Fp32(float *output_data, const float *input_data, const float *weig
                         conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_ * sizeof(float),
                         sliding->block_channel_ * sizeof(float), sliding->in_sh_step_ * sizeof(float),
                         sliding->in_sw_step_ * sizeof(float), sliding->in_kh_step_ * sizeof(float),
                         sliding->in_kw_step_ * sizeof(float), conv_param->is_relu_, conv_param->is_relu6_);
                         sliding->in_kw_step_ * sizeof(float), relu, relu6);
 #else
        DepthwiseCenter(out_t, in_t, weight, bias, sliding->bottom_ - sliding->top_, sliding->right_ - sliding->left_,
                        conv_param->kernel_h_, conv_param->kernel_w_, sliding->out_h_step_, sliding->block_channel_,
                        sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_,
                        conv_param->is_relu_, conv_param->is_relu6_);
                        sliding->in_sh_step_, sliding->in_sw_step_, sliding->in_kh_step_, sliding->in_kw_step_, relu,
                        relu6);
 #endif
      }
    }  // output C4 loop
@@ -454,11 +459,11 @@ void ConvDw3x3Fp32InputTrans(const float *input_data, float *trans_input, float
  memset(trans_input, 0, out_h_block * out_h_block * 16 * C4NUM * sizeof(float));
  for (int oh = 0; oh < out_h_block; oh++) {
    int ih = oh * 2 - conv_param->pad_h_;
    int ih = oh * 2 - conv_param->pad_u_;
    int real_h_start = ih > 0 ? 0 : -ih;
    int real_h_end = (ih + input_unit) < conv_param->input_h_ ? input_unit : (conv_param->input_h_ - ih);
    for (int ow = 0; ow < out_w_block; ow++) {
      int iw = ow * 2 - conv_param->pad_w_;
      int iw = ow * 2 - conv_param->pad_l_;
      int real_w_start = iw > 0 ? 0 : -iw;
      int real_w_end = (iw + input_unit) < conv_param->input_w_ ? input_unit : (conv_param->input_w_ - iw);
@@ -642,6 +647,8 @@ void ConvDw3x3Fp32OutputUnit(float *src_buf, float *dst_output, const float *bia
 void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const float *bias, int out_h_block,
                              int out_w_block, const ConvParameter *conv_param) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  int oc4 = UP_DIV(conv_param->output_channel_, C4NUM);
  bool h_in_range = true;
  for (int oh = 0; oh < out_h_block; oh++) {
@@ -661,8 +668,8 @@ void ConvDw3x3Fp32OutputTrans(float *trans_buffer, float *output_data, const flo
      float *buf_ow = buf_oh + ow * 16 * C4NUM;
      float *output_ow = output_oh + real_ow * oc4 * C4NUM;
      ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range,
                              conv_param->is_relu_, conv_param->is_relu6_);
      ConvDw3x3Fp32OutputUnit(buf_ow, output_ow, bias, oc4 * C4NUM, conv_param->output_w_, h_in_range, w_in_range, relu,
                              relu6);
    }
  }
 }
@@ -727,14 +734,14 @@ void DeconvDepthwiseBorder(float *dst, const float *src, const float *weight, in
                           const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
  const float *src_h = src + top * sliding->out_h_step_;
  for (int ih = top; ih < bottom; ih++) {
    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
    int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
    float *dst_h = dst + oh * sliding->in_h_step_;
    const float *src_kernel = src_h + left * sliding->block_channel_;
    for (int iw = left; iw < right; iw++) {
      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
      float *dst_w = dst_h + ow * sliding->block_channel_;
@@ -790,12 +797,14 @@ void DeconvDepthwiseCenter(float *dst, const float *src, const float *weight, in
 #endif
 void DeconvDepthwisePostFunc(float *dst, const float *bias, int block_channel, const ConvParameter *conv_param) {
  bool relu = conv_param->act_type_ == ActType_Relu;
  bool relu6 = conv_param->act_type_ == ActType_Relu6;
  float *dst_k = dst;
  for (int k = 0; k < conv_param->output_h_ * conv_param->output_w_; k++) {
    for (int c = 0; c < C4NUM; c++) {
      dst_k[c] += bias[c];
      dst_k[c] = (conv_param->is_relu_) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
      dst_k[c] = (conv_param->is_relu6_) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
      dst_k[c] = (relu) ? (MSMAX(0, dst_k[c])) : (dst_k[c]);
      dst_k[c] = (relu6) ? (MSMIN(6, MSMAX(0, dst_k[c]))) : (dst_k[c]);
    }
    dst_k += block_channel;
  }
@@ -821,8 +830,8 @@ void DeconvDwC4Fp32(float *output_data, const float *input_data, const float *we
                            conv_param->input_w_, conv_param, sliding);
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        float *out_t = dst_data + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
        const float *in_t = src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
--- a/mindspore/lite/nnacl/fp32/deconv.c
+++ b/mindspore/lite/nnacl/fp32/deconv.c
@@ -57,8 +57,8 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
    for (int ih = 0; ih < conv_param->input_h_; ih++) {
      for (int iw = 0; iw < conv_param->input_w_; iw++) {
        int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
        int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
        int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
        int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
@@ -97,7 +97,7 @@ int DeConvPostFp32C12x8(const float *src, float *tmp, const float *bias, float *
    }       /*ih*/
  }         /*oc8*/
  PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_, conv_param->is_relu_,
                     conv_param->is_relu6_);
  PostConvFuncFp32C8(tmp, dst, bias, output_channel, output_plane, conv_param->output_channel_,
                     conv_param->act_type_ == ActType_Relu, conv_param->act_type_ == ActType_Relu6);
  return NNACL_OK;
 }
--- a/mindspore/lite/nnacl/fp32/matmul.c
+++ b/mindspore/lite/nnacl/fp32/matmul.c
@@ -356,7 +356,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
        dst[ci] = value;
      }
    }
  } else {
  } else if (out_type == OutType_C8) {
    /*  col8-major * row8-major => col12x8-major  */
    int col_8 = UP_ROUND(col, C8NUM);
    int row_12 = UP_ROUND(row, C12NUM);
@@ -364,9 +364,7 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
      for (int c = 0; c < col_8; c++) {
        int r12div = r / C12NUM, r12mod = r % C12NUM;
        int c8div = c / C8NUM, c8mod = c % C8NUM;
        int c4div = c / C4NUM, c4mod = c % C4NUM;
        size_t ci = (out_type == OutType_C4) ? (c4div * C4NUM * row_12 + r * C4NUM + c4mod)
                                             : (c8div * C8NUM * row_12 + r * C8NUM + c8mod);
        size_t ci = (c8div * C8NUM * row_12 + r * C8NUM + c8mod);
        float value = 0;
        for (int d = 0; d < deep; d++) {
          size_t ai = r12div * deep * C12NUM + d * C12NUM + r12mod;
@@ -379,6 +377,25 @@ void MatMul12x8(const float *a, const float *b, float *dst, const float *bias, A
        dst[ci] = value;
      }
    }
  } else {
    for (int i = 0; i < row; ++i) {
      int src_r_offset = i;
      int dst_r_offset = i * col * stride;
      for (int j = 0; j < col; ++j) {
        int c8div = j / 8, c8mod = j % 8;
        size_t ci = dst_r_offset + c8div * 8 * stride + c8mod;
        float value = 0;
        for (int d = 0; d < deep; ++d) {
          size_t ai = src_r_offset + d * row;
          size_t bi = c8div * deep * 8 + d * 8 + c8mod;
          value = value + a[ai] * b[bi];
        }
        if (bias != NULL) value += bias[j];
        if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
        if (act_type != ActType_No) value = MSMAX(0.0f, value);
        dst[ci] = value;
      }
    }
  }
  return;
 }
@@ -387,7 +404,7 @@ void MatMulOpt(const float *a, const float *b, float *c, const float *bias, ActT
               int col, size_t stride, int out_type) {
 #ifdef ENABLE_ARM64
  MatmulFloatNeon64Opt(a, b, c, bias, (int)act_type, deep, row, col, stride, (int)(out_type == OutType_Nhwc),
                       (int)(out_type == OutType_C4));
                       (int)(out_type == OutType_TileC8));
 #else
  MatMul12x8(a, b, c, bias, act_type, deep, row, col, stride, out_type);
 #endif
--- a/mindspore/lite/nnacl/fp32_grad/pack_ext.c
+++ b/mindspore/lite/nnacl/fp32_grad/pack_ext.c
@@ -20,9 +20,9 @@
 static int is_a_ge_zero_and_a_lt_b(int a, int b) { return (unsigned)(a) < (unsigned)(b); }
 void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) {
  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_;
  // const int pad_right =  /*conv_param->pad_r_*/conv_param->pad_w_;
  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_;
  // const int pad_down =   /*conv_param->pad_d/*/conv_param->pad_h_;
  const int stride_h = conv_param->stride_h_;
@@ -72,9 +72,9 @@ void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param
 // output matrix is (kernel_h*kernel_w*channels)X(output_h*output_w)
 void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) {
  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_;
  const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_;
  // const int pad_right =  /*conv_param->pad_r_*/conv_param->pad_w_;
  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_;
  const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_;
  // const int pad_down =   /*conv_param->pad_d/*/conv_param->pad_h_;
  const int stride_h = conv_param->stride_h_;
--- a/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
+++ b/mindspore/lite/nnacl/int8/conv_depthwise_int8.c
@@ -68,14 +68,14 @@ void DepthwiseBorderInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
                         bool per_channel) {
  int8_t *dst_h = dst + top * sliding->out_h_step_;
  for (int oh = top; oh < bottom; oh++) {
    int ih = oh * conv_param->stride_h_ - conv_param->pad_h_;
    int ih = oh * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-ih, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih, conv_param->dilation_h_));
    const int16_t *src_h = src + ih * sliding->in_h_step_;
    int8_t *dst_kernel = dst_h + left * sliding->block_channel_;
    for (int ow = left; ow < right; ow++) {
      int iw = ow * conv_param->stride_w_ - conv_param->pad_w_;
      int iw = ow * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-iw, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->input_w_ - iw, conv_param->dilation_w_));
      const int16_t *src_w = src_h + iw * sliding->block_channel_;
@@ -186,8 +186,8 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
                          per_channel);
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int in_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int in_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        const int16_t *in_t = src_data + in_h_start * sliding->in_h_step_ + in_w_start * sliding->block_channel_;
        int8_t *out_t = dst_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
 #ifdef ENABLE_ARM64
@@ -241,14 +241,14 @@ void DeconvDepthwiseBorderInt8(int32_t *dst, const int16_t *src, const int16_t *
                               int right, const ConvParameter *conv_param, const SlidingWindowParam *sliding) {
  const int16_t *src_h = src + top * sliding->out_h_step_;
  for (int ih = top; ih < bottom; ih++) {
    int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
    int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
    int start_kh = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
    int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
    int32_t *dst_h = dst + oh * sliding->in_h_step_;
    const int16_t *src_kernel = src_h + left * sliding->block_channel_;
    for (int iw = left; iw < right; iw++) {
      int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
      int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
      int start_kw = MSMAX(0, UP_DIV(-ow, conv_param->dilation_w_));
      int end_kw = MSMIN(conv_param->kernel_w_, UP_DIV(conv_param->output_w_ - ow, conv_param->dilation_w_));
      int32_t *dst_w = dst_h + ow * C4NUM;
@@ -341,8 +341,8 @@ void DeconvDwInt8(int8_t *output_data, int32_t *output_buffer, const int16_t *in
                                conv_param->input_w_, conv_param, sliding);
      if (sliding->right_ > sliding->left_ && sliding->bottom_ > sliding->top_) {
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_h_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_w_;
        int oh_h_start = sliding->top_ * conv_param->stride_h_ - conv_param->pad_u_;
        int oh_w_start = sliding->left_ * conv_param->stride_w_ - conv_param->pad_l_;
        int32_t *out_t = output_buffer + oh_h_start * sliding->in_h_step_ + oh_w_start * sliding->block_channel_;
        const int16_t *in_t =
          src_data + sliding->top_ * sliding->out_h_step_ + sliding->left_ * sliding->block_channel_;
--- a/mindspore/lite/nnacl/int8/deconv.c
+++ b/mindspore/lite/nnacl/int8/deconv.c
@@ -33,8 +33,8 @@ int DeConvPostInt8C8(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
    for (int ih = 0; ih < conv_param->input_h_; ih++) {
      for (int iw = 0; iw < conv_param->input_w_; iw++) {
        int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
        int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
        int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
        int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
@@ -88,8 +88,8 @@ int DeConvPostInt8C4(const int32_t *src, const int32_t *bias, int32_t *tmp, int8
    for (int ih = 0; ih < conv_param->input_h_; ih++) {
      for (int iw = 0; iw < conv_param->input_w_; iw++) {
        int oh = ih * conv_param->stride_h_ - conv_param->pad_h_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_w_;
        int oh = ih * conv_param->stride_h_ - conv_param->pad_u_;
        int ow = iw * conv_param->stride_w_ - conv_param->pad_l_;
        int kh_start = MSMAX(0, UP_DIV(-oh, conv_param->dilation_h_));
        int kh_end = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->output_h_ - oh, conv_param->dilation_h_));
--- a/mindspore/lite/nnacl/matmul_parameter.h
+++ b/mindspore/lite/nnacl/matmul_parameter.h
@@ -29,9 +29,7 @@ typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst,
 typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col);
 typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
 typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_C4 = 2 } OutType;
 typedef enum OutType { OutType_C8 = 0, OutType_Nhwc = 1, OutType_TileC8 = 2 } OutType;
 typedef struct MatMulParameter {
  OpParameter op_parameter_;
--- a/mindspore/lite/nnacl/op_base.h
+++ b/mindspore/lite/nnacl/op_base.h
@@ -25,7 +25,6 @@
 #define C8NUM 8
 #define C12NUM 12
 #define C16NUM 16
 #define BLOCK 4
 #define TILE_NUM 8
 #define MSMIN(x, y) ((x) < (y) ? (x) : (y))
@@ -62,4 +61,6 @@ typedef struct OpParameter {
  int thread_num_;
 } OpParameter;
 typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType;
 #endif  // MINDSPORE_LITE_NNACL_OP_BASE_H_
--- a/mindspore/lite/nnacl/pack.c
+++ b/mindspore/lite/nnacl/pack.c
@@ -158,14 +158,14 @@ void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_pa
  char *src = (char *)src_ptr;
  char *dst = (char *)dst_ptr;
  for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) {
    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_;
    int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_u_;
    if (src_h < 0 || src_h >= conv_param->input_h_) {
      continue;
    }
    const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size;
    char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size;
    for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) {
      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_;
      int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_l_;
      if (src_w < 0 || src_w >= conv_param->input_w_) {
        continue;
      }
@@ -296,8 +296,8 @@ void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, floa
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
@@ -348,8 +348,8 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
@@ -419,8 +419,8 @@ void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int r
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  int in_channel = conv_param->input_channel_;
--- a/mindspore/lite/nnacl/winograd_transform.c
+++ b/mindspore/lite/nnacl/winograd_transform.c
@@ -24,8 +24,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
  int output_unit = conv_param->output_unit_;
  int in_channel = conv_param->input_channel_;
  int ic4 = UP_DIV(in_channel, C4NUM);
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_u_;
  int pad_w = conv_param->pad_l_;
  int input_h = conv_param->input_h_;
  int input_w = conv_param->input_w_;
  if (out_w_block_num == 0) {
@@ -42,7 +42,7 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
    int interval_y_e = src_y_e < input_h ? input_unit : (input_h - src_y_s);
    int src_plane_offset = ic4 * C4NUM * (src_y_s * input_w + src_x_s);
    int dst_plane_offset = c * C4NUM;
    int dst_plane_offset = c * C4NUM * ic4;
    for (int ic = 0; ic < ic4; ic++) {
      // clear tmp buffer
      memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
@@ -67,8 +67,8 @@ void WinogradInputTransform(const float *input_data, float *trans_input, float *
        }
      }
      // input transform
      int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM;
      size_t dst_step = ic4 * C4NUM * TILE_NUM;
      int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
      size_t dst_step = C12NUM * ic4 * C4NUM;
      float *trans_input_ptr = trans_input + dst_ic4_offset;
      input_trans_func(tmp_data, trans_input_ptr, C4NUM, dst_step);
    }
@@ -86,6 +86,7 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
  int output_h_unit_block = UP_DIV(output_h, output_unit);
  int output_channel = conv_param->output_channel_;
  int oc4 = UP_DIV(output_channel, C4NUM);
  int oc8 = UP_DIV(output_channel, C8NUM);
  int input_unit = conv_param->input_unit_;
  if (output_unit_num == 0) {
    return;
@@ -93,17 +94,19 @@ void WinogradOutputTransform(const float *gemm_out, float *tmp_out_data, const f
  for (int i = 0; i < cal_num; i++) {
    int dst_x_s = out_tile_index % output_unit_num;
    int dst_y_s = out_tile_index / output_unit_num;
    int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
    int dst_tile_offset = C4NUM * output_unit * (dst_x_s + dst_y_s * output_w_unit_block * output_unit);
    for (int j = 0; j < oc4; j++) {
      int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
      int c8_block = j / 2;
      int c8_res = j % 2;
      int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
      int dst_oc4_offset =
        dst_tile_offset + j * C4NUM * output_h_unit_block * output_w_unit_block * output_unit * output_unit;
      const float *src_ptr = gemm_out + src_oc4_offset;
      const float *bias_ptr = bias_data + j * C4NUM;
      float *dst_ptr = tmp_out_data + dst_oc4_offset;
      output_trans_func(src_ptr, dst_ptr, bias_ptr, C4NUM, output_w_unit_block * output_unit);
      output_trans_func(src_ptr, dst_ptr, bias_ptr, C8NUM, output_w_unit_block * output_unit);
    }
    out_tile_index++;
  }
@@ -283,8 +286,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
  int input_channel = conv_param->input_channel_;
  int input_width = conv_param->input_w_;
  int input_height = conv_param->input_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_l_;
  int pad_h = conv_param->pad_u_;
  int ic4 = UP_DIV(input_channel, C4NUM);
  const int input_unit = 4;
  if (out_w_block == 0) {
@@ -300,7 +303,7 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
    int real_y_end = (origin_y + input_unit) < input_height ? input_unit : (input_height - origin_y);
    int src_plane_offset = ic4 * C4NUM * (origin_y * input_width + origin_x);
    int dst_plane_offset = cal_id * C4NUM;
    int dst_plane_offset = cal_id * C4NUM * ic4;
    for (int ic = 0; ic < ic4; ic++) {
      // clear tmp buffer
      memset(tmp_data, 0, input_unit * input_unit * C4NUM * sizeof(float));
@@ -326,8 +329,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
      }
      // input transform
      int dst_ic4_offset = dst_plane_offset + ic * TILE_NUM * C4NUM;
      size_t dst_step = ic4 * C4NUM * TILE_NUM;
      int dst_ic4_offset = dst_plane_offset + ic * C4NUM;
      size_t dst_step = C12NUM * ic4 * C4NUM;
      float *trans_input_ptr = trans_input + dst_ic4_offset;
      Conv3x3Fp32InputUnit(tmp_data, trans_input_ptr, dst_step);
    }
@@ -336,8 +339,8 @@ void Conv3x3Fp32InputTransform(const float *input_data, float *trans_input, floa
 void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4, int output_channel, int kernel_plane,
                                int oc_block) {
  const int input_unit = 4;
  int dst_step = iC4 * C4NUM * oc_block;
  int oc_plane_block = UP_DIV(output_channel, oc_block);
  int dst_step = iC4 * C4NUM * oc_block * oc_plane_block;
  if (oc_block == 0) {
    return;
  }
@@ -345,7 +348,7 @@ void Conv3x3Fp32FilterTransform(float *weight_data, float *trans_weight, int iC4
    int oc_block_num = o / oc_block;
    int oc_block_rem = o % oc_block;
    int src_oc_offset = o * iC4 * C4NUM * kernel_plane;
    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM * input_unit * input_unit + oc_block_rem;
    int dst_oc_offset = oc_block_num * oc_block * iC4 * C4NUM + oc_block_rem;
    for (int i = 0; i < iC4; i++) {
      float *src_ic4_ptr = weight_data + src_oc_offset + i * kernel_plane * C4NUM;
      float *dst_ic4_ptr = trans_weight + dst_oc_offset + i * oc_block * C4NUM;
@@ -559,24 +562,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float
  float32x4_t bias_ptr = vld1q_f32(bias_data);
  float32x4_t s00 = vld1q_f32(gemm_out);
  float32x4_t s01 = vld1q_f32(gemm_out + 4);
  float32x4_t s02 = vld1q_f32(gemm_out + 8);
  float32x4_t s03 = vld1q_f32(gemm_out + 12);
  float32x4_t s01 = vld1q_f32(gemm_out + 8);
  float32x4_t s02 = vld1q_f32(gemm_out + 16);
  float32x4_t s03 = vld1q_f32(gemm_out + 24);
  float32x4_t s10 = vld1q_f32(gemm_out + 16);
  float32x4_t s11 = vld1q_f32(gemm_out + 20);
  float32x4_t s12 = vld1q_f32(gemm_out + 24);
  float32x4_t s13 = vld1q_f32(gemm_out + 28);
  float32x4_t s10 = vld1q_f32(gemm_out + 32);
  float32x4_t s11 = vld1q_f32(gemm_out + 40);
  float32x4_t s12 = vld1q_f32(gemm_out + 48);
  float32x4_t s13 = vld1q_f32(gemm_out + 56);
  float32x4_t s20 = vld1q_f32(gemm_out + 32);
  float32x4_t s21 = vld1q_f32(gemm_out + 36);
  float32x4_t s22 = vld1q_f32(gemm_out + 40);
  float32x4_t s23 = vld1q_f32(gemm_out + 44);
  float32x4_t s20 = vld1q_f32(gemm_out + 64);
  float32x4_t s21 = vld1q_f32(gemm_out + 72);
  float32x4_t s22 = vld1q_f32(gemm_out + 80);
  float32x4_t s23 = vld1q_f32(gemm_out + 88);
  float32x4_t s30 = vld1q_f32(gemm_out + 48);
  float32x4_t s31 = vld1q_f32(gemm_out + 52);
  float32x4_t s32 = vld1q_f32(gemm_out + 56);
  float32x4_t s33 = vld1q_f32(gemm_out + 60);
  float32x4_t s30 = vld1q_f32(gemm_out + 96);
  float32x4_t s31 = vld1q_f32(gemm_out + 104);
  float32x4_t s32 = vld1q_f32(gemm_out + 112);
  float32x4_t s33 = vld1q_f32(gemm_out + 120);
  float32x4_t t00 = vaddq_f32(vaddq_f32(s00, s10), s20);
  float32x4_t t01 = vaddq_f32(vaddq_f32(s01, s11), s21);
@@ -609,24 +612,24 @@ void Conv3x3Fp32OutputUnit(const float *gemm_out, const float *bias_data, float
    const float *bias_ptr = bias_data + i;
    float s00 = local_ptr[0];
    float s01 = (local_ptr + 4)[0];
    float s02 = (local_ptr + 8)[0];
    float s03 = (local_ptr + 12)[0];
    float s01 = (local_ptr + 8)[0];
    float s02 = (local_ptr + 16)[0];
    float s03 = (local_ptr + 24)[0];
    float s10 = (local_ptr + 16)[0];
    float s11 = (local_ptr + 20)[0];
    float s12 = (local_ptr + 24)[0];
    float s13 = (local_ptr + 28)[0];
    float s10 = (local_ptr + 32)[0];
    float s11 = (local_ptr + 40)[0];
    float s12 = (local_ptr + 48)[0];
    float s13 = (local_ptr + 56)[0];
    float s20 = (local_ptr + 32)[0];
    float s21 = (local_ptr + 36)[0];
    float s22 = (local_ptr + 40)[0];
    float s23 = (local_ptr + 44)[0];
    float s20 = (local_ptr + 64)[0];
    float s21 = (local_ptr + 72)[0];
    float s22 = (local_ptr + 80)[0];
    float s23 = (local_ptr + 88)[0];
    float s30 = (local_ptr + 48)[0];
    float s31 = (local_ptr + 52)[0];
    float s32 = (local_ptr + 56)[0];
    float s33 = (local_ptr + 60)[0];
    float s30 = (local_ptr + 96)[0];
    float s31 = (local_ptr + 104)[0];
    float s32 = (local_ptr + 112)[0];
    float s33 = (local_ptr + 120)[0];
    float t00 = s00 + s10 + s20;
    float t01 = s01 + s11 + s21;
@@ -663,6 +666,7 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl
  int output_w = conv_param->output_w_;
  int output_h = conv_param->output_h_;
  int oc4 = UP_DIV(output_channel, C4NUM);
  int oc8 = UP_DIV(output_channel, C8NUM);
  const int input_unit = 4;
  if (out_w_block == 0) {
    return;
@@ -670,11 +674,13 @@ void Conv3x3Fp32OutputTransform(const float *gemm_out, float *out_data, const fl
  for (int i = 0; i < real_cal_num; i++) {
    int out_w_index = (start_index + i) % out_w_block;
    int out_h_index = (start_index + i) / out_w_block;
    int src_tile_offset = i * oc4 * C4NUM * input_unit * input_unit;
    int src_tile_offset = i * oc8 * C8NUM * input_unit * input_unit;
    int dst_tile_offset = C4NUM * (out_w_index * OUPUT_UNIT + out_h_index * OUPUT_UNIT * output_w);
    for (int j = 0; j < oc4; j++) {
      int src_oc4_offset = src_tile_offset + j * input_unit * input_unit * C4NUM;
      int c8_block = j / 2;
      int c8_res = j % 2;
      int src_oc4_offset = src_tile_offset + c8_block * input_unit * input_unit * C8NUM + c8_res * C4NUM;
      int dst_oc4_offset = dst_tile_offset + j * C4NUM * output_h * output_w;
      const float *src_ptr = gemm_out + src_oc4_offset;
      const float *bias_ptr = bias_data + j * C4NUM;
@@ -864,8 +870,8 @@ void Conv3x3Uint8InputTransform(const int16_t *input_data, int16_t *trans_input,
  int input_channel = conv_param->input_channel_;
  int input_width = conv_param->input_w_;
  int input_height = conv_param->input_h_;
  int pad_w = conv_param->pad_w_;
  int pad_h = conv_param->pad_h_;
  int pad_w = conv_param->pad_l_;
  int pad_h = conv_param->pad_u_;
  ConvQuantArg quant_arg = conv_param->conv_quant_arg_;
  int input_zp = quant_arg.input_quant_args_[0].zp_;
  const int ic8 = UP_DIV(input_channel, C8NUM);
@@ -1221,9 +1227,9 @@ void Conv3x3Uint8OutputUnit(const int32_t *gemm_out, const int32_t *bias_data, i
  int32x4_t ls;
  int32x4_t rs;
  if ((conv_param->conv_quant_arg_.per_channel_ & FILTER_PER_CHANNEL)) {
    out_multiplier = vld1q_s32(quant_multiplier);
    ls = vld1q_s32(left_shift);
    rs = vld1q_s32(right_shift);
    out_multiplier = vld1q_s32(quant_multiplier + oc_start);
    ls = vld1q_s32(left_shift + oc_start);
    rs = vld1q_s32(right_shift + oc_start);
  } else {
    out_multiplier = vdupq_n_s32(quant_multiplier[0]);
    ls = vdupq_n_s32(left_shift[0]);
--- a/mindspore/lite/nnacl/winograd_utils.c
+++ b/mindspore/lite/nnacl/winograd_utils.c
@@ -4649,43 +4649,41 @@ void OutputTransform8x7Unit(const float *src_data, float *dst_data, const float
 // Utilize cost model to compute performance gain.
 // If the gain is greater than got from Im2col, winograd algorithm will be chosen.
 int SelectOutputUnit(ConvParameter *conv_param) {
  int input_batch = conv_param->input_batch_;
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int in_channel = conv_param->input_channel_;
  int out_h = conv_param->output_h_;
  int in_c = conv_param->input_channel_;
  int out_w = conv_param->output_w_;
  int out_channel = conv_param->output_channel_;
  int out_plane = out_h * out_w;
  int max_unit = sqrt((float)(out_plane));
  max_unit = max_unit > MIN_UNIT ? max_unit : MIN_UNIT;
  max_unit = max_unit < MAX_UNIT ? max_unit : MAX_UNIT;
  int output_unit = 1;
  float ratio = 0.0f;
  // cost of conventional convolution multiplications
  float ori_cost = out_plane * out_channel * in_channel * kernel_h * kernel_w;
  for (int u = MIN_UNIT; u < max_unit; u++) {
    int input_unit = u + kernel_h - 1;
    if (input_unit != 4 && input_unit != 8) {
  int out_h = conv_param->output_h_;
  int out_c = conv_param->output_channel_;
  int unit2 = UP_DIV(out_w * out_h, C12NUM * conv_param->op_parameter_.thread_num_);
  int max_out_unit = (int)(sqrtf((float)unit2));
  max_out_unit = max_out_unit < MAX_UNIT ? MAX_UNIT : max_out_unit;
  max_out_unit = max_out_unit > MIN_UNIT ? max_out_unit : MIN_UNIT;
  int unit = 0;
  float max_rate = 0.0f;
  float common_cost = (float)out_h * out_w * in_c * out_c * kernel_h * kernel_w;
  for (int i = MIN_UNIT; i <= max_out_unit; ++i) {
    int input_unit = i + kernel_w - 1;
    OutputTransformUnitFunc output_trans_func = GetOutputTransFunc(input_unit, i);
    if (output_trans_func == NULL) {
      continue;
    }
    // don't count filter transform cost, because it can be processed once offline.
    const float input_trans_unit_cost = 2 * input_unit * input_unit * input_unit * in_channel;
    float gemm_unit_cost = input_unit * input_unit * in_channel * out_channel;
    float output_trans_unit_cost = input_unit * u * (u + input_unit) * out_channel;
    // equation (23) in papar
    float winograd_cost = (input_trans_unit_cost + gemm_unit_cost + output_trans_unit_cost) *
                          (UP_DIV(out_w, u) * (UP_DIV(out_h, u))) * input_batch;
    float reduce_rate = ori_cost / winograd_cost;
    if (reduce_rate > ratio && reduce_rate > 1) {
      ratio = reduce_rate;
      output_unit = u;
    float penalty = ((float)input_unit * input_unit) / ((float)kernel_h * kernel_w) * 0.12f;
    float wino_cost = ((2 + out_c) * (float)input_unit * input_unit * in_c + ((float)input_unit + i) * i * out_c) *
                      UP_DIV(out_w, i) * UP_DIV(out_h, i);
    float reduce_rate = common_cost / wino_cost - penalty;
    if (reduce_rate > max_rate) {
      max_rate = reduce_rate;
      unit = i;
    }
  }
  if (max_rate < 1.0f) {
    return 1;
  }
  // If output_unit is 1, then it is conventional convolution
  return output_unit;
  return unit;
 }
 InputTransformUnitFunc GetInputTransFunc(int input_unit) {
@@ -4719,17 +4717,6 @@ void CheckIfUseWinograd(bool *use_winograd, int *output_unit, ConvParameter *con
    *output_unit = SelectOutputUnit(conv_param);
    if (*output_unit > 1) {
      *use_winograd = true;
      int input_unit = conv_param->kernel_h_ + *output_unit - 1;
      input_trans_func = GetInputTransFunc(input_unit);
      if (input_trans_func == NULL) {
        *use_winograd = false;
      }
      output_trans_func = GetOutputTransFunc(input_unit, *output_unit);
      if (output_trans_func == NULL) {
        *use_winograd = false;
      }
    } else {
      *use_winograd = false;
    }
  } else {
    *use_winograd = false;
--- a/mindspore/lite/src/ops/conv2d.cc
+++ b/mindspore/lite/src/ops/conv2d.cc
@@ -376,10 +376,18 @@ void Conv2D::ConvInferShape(int input_h, int input_w, int *output_h, int *output
    *output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(stride_h));
    auto pad_h_all = ((*output_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - input_h);
    auto pad_w_all = ((*output_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - input_w);
    pad_u_ = pad_h_all / 2;
    pad_d_ = pad_h_all - pad_u_;
    pad_l_ = pad_w_all / 2;
    pad_r_ = pad_w_all - pad_l_;
    if (pad_h_all < 0) {
      pad_u_ = pad_d_ = 0;
    } else {
      pad_u_ = pad_h_all / 2;
      pad_d_ = pad_h_all - pad_u_;
    }
    if (pad_w_all < 0) {
      pad_l_ = pad_r_ = 0;
    } else {
      pad_l_ = pad_w_all / 2;
      pad_r_ = pad_w_all - pad_l_;
    }
  } else {
    *output_w = std::ceil((static_cast<float>(input_w) + pad_l_ + pad_r_ -
                           (static_cast<float>(kernel_w) - 1) * static_cast<float>(dilate_w)) /
--- a/mindspore/lite/src/ops/deconv2d.cc
+++ b/mindspore/lite/src/ops/deconv2d.cc
@@ -126,14 +126,12 @@ int DeConv2D::InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vecto
  output->set_shape(out_shape);
  if (pad_mode == schema::PadMode_SAME) {
    pad_h_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
    pad_w_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
    pad_u_ = ((input_h - 1) * stride_h + (kernel_h - 1) * dilate_h + 1 - output_h) / 2;
    pad_l_ = ((input_w - 1) * stride_w + (kernel_w - 1) * dilate_w + 1 - output_w) / 2;
  } else if (pad_mode == schema::PadMode_VALID) {
    pad_h_ = 0;
    pad_w_ = 0;
    pad_u_ = 0;
    pad_l_ = 0;
  } else if (pad_mode == schema::PadMode_CAFFE) {
    pad_h_ = pad_u_;
    pad_w_ = pad_l_;
  } else {
    MS_LOG(ERROR) << "unsupported pad mode for deconv";
  }
--- a/mindspore/lite/src/ops/deconv2d.h
+++ b/mindspore/lite/src/ops/deconv2d.h
@@ -74,16 +74,12 @@ class DeConv2D : public PrimitiveC {
  int PadDown() const { return this->pad_d_; }
  int PadLeft() const { return this->pad_l_; }
  int PadRight() const { return this->pad_r_; }
  int PadH() const { return this->pad_h_; }
  int PadW() const { return this->pad_w_; }
 protected:
  int pad_u_ = 0;
  int pad_d_ = 0;
  int pad_l_ = 0;
  int pad_r_ = 0;
  int pad_h_ = 0;
  int pad_w_ = 0;
 };
 }  // namespace lite
 }  // namespace mindspore
--- a/mindspore/lite/src/ops/pooling.cc
+++ b/mindspore/lite/src/ops/pooling.cc
@@ -170,10 +170,18 @@ int Pooling::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tenso
    output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(GetStrideH()));
    auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h);
    auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w);
    pad_u_ = pad_h_all / 2;
    pad_d_ = pad_h_all - pad_u_;
    pad_l_ = pad_w_all / 2;
    pad_r_ = pad_w_all - pad_l_;
    if (pad_h_all < 0) {
      pad_u_ = pad_d_ = 0;
    } else {
      pad_u_ = pad_h_all / 2;
      pad_d_ = pad_h_all - pad_u_;
    }
    if (pad_w_all < 0) {
      pad_l_ = pad_r_ = 0;
    } else {
      pad_l_ = pad_w_all / 2;
      pad_r_ = pad_w_all - pad_l_;
    }
  } else {
    auto round_mode = (schema::RoundMode)GetRoundMode();
    if (round_mode == schema::RoundMode_FLOOR) {
--- a/mindspore/lite/src/populate_parameter.cc
+++ b/mindspore/lite/src/populate_parameter.cc
@@ -376,8 +376,6 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive)
  conv_param->pad_d_ = conv2d_lite_primitive->PadDown();
  conv_param->pad_l_ = conv2d_lite_primitive->PadLeft();
  conv_param->pad_r_ = conv2d_lite_primitive->PadRight();
  conv_param->pad_h_ = conv2d_lite_primitive->PadUp();
  conv_param->pad_w_ = conv2d_lite_primitive->PadLeft();
  conv_param->dilation_h_ = conv_primitive->GetDilateH();
  conv_param->dilation_w_ = conv_primitive->GetDilateW();
  conv_param->input_channel_ = conv_primitive->GetChannelIn();
@@ -386,16 +384,13 @@ OpParameter *PopulateConvParameter(const mindspore::lite::PrimitiveC *primitive)
  auto act_type = conv_primitive->GetActivationType();
  switch (act_type) {
    case schema::ActivationType_RELU:
      conv_param->is_relu_ = true;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_Relu;
      break;
    case schema::ActivationType_RELU6:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = true;
      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
@@ -422,23 +417,18 @@ OpParameter *PopulateConvDwParameter(const mindspore::lite::PrimitiveC *primitiv
  conv_param->pad_d_ = convdw_lite_primitive->PadDown();
  conv_param->pad_l_ = convdw_lite_primitive->PadLeft();
  conv_param->pad_r_ = convdw_lite_primitive->PadRight();
  conv_param->pad_h_ = convdw_lite_primitive->PadUp();
  conv_param->pad_w_ = convdw_lite_primitive->PadLeft();
  conv_param->dilation_h_ = conv_primitive->GetDilateH();
  conv_param->dilation_w_ = conv_primitive->GetDilateW();
  auto act_type = conv_primitive->GetActivationType();
  switch (act_type) {
    case schema::ActivationType_RELU:
      conv_param->is_relu_ = true;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_Relu;
      break;
    case schema::ActivationType_RELU6:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = true;
      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
@@ -464,23 +454,18 @@ OpParameter *PopulateDeconvDwParameter(const mindspore::lite::PrimitiveC *primit
  conv_param->pad_d_ = deconvdw_lite_primitive->PadDown();
  conv_param->pad_l_ = deconvdw_lite_primitive->PadLeft();
  conv_param->pad_r_ = deconvdw_lite_primitive->PadRight();
  conv_param->pad_h_ = deconvdw_lite_primitive->PadUp();
  conv_param->pad_w_ = deconvdw_lite_primitive->PadLeft();
  conv_param->dilation_h_ = conv_primitive->GetDilateH();
  conv_param->dilation_w_ = conv_primitive->GetDilateW();
  auto act_type = conv_primitive->GetActivationType();
  switch (act_type) {
    case schema::ActivationType_RELU:
      conv_param->is_relu_ = true;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_Relu;
      break;
    case schema::ActivationType_RELU6:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = true;
      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
@@ -506,23 +491,18 @@ OpParameter *PopulateDeconvParameter(const mindspore::lite::PrimitiveC *primitiv
  conv_param->pad_d_ = deconv_lite_primitive->PadDown();
  conv_param->pad_l_ = deconv_lite_primitive->PadLeft();
  conv_param->pad_r_ = deconv_lite_primitive->PadRight();
  conv_param->pad_h_ = deconv_lite_primitive->PadH();
  conv_param->pad_w_ = deconv_lite_primitive->PadW();
  conv_param->dilation_h_ = conv_primitive->GetDilateH();
  conv_param->dilation_w_ = conv_primitive->GetDilateW();
  auto act_type = conv_primitive->GetActivationType();
  switch (act_type) {
    case schema::ActivationType_RELU:
      conv_param->is_relu_ = true;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_Relu;
      break;
    case schema::ActivationType_RELU6:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = true;
      conv_param->act_type_ = ActType_Relu6;
      break;
    default:
      conv_param->is_relu_ = false;
      conv_param->is_relu6_ = false;
      conv_param->act_type_ = ActType_No;
      break;
  }
  return reinterpret_cast<OpParameter *>(conv_param);
--- a/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/convolution_base.cc
@@ -322,10 +322,12 @@ int ConvolutionBaseCPUKernel::SetQuantParam() {
    return ret;
  }
  // now only consider per tensor for output
  CalculateActivationRangeQuantized(
    conv_param_->is_relu_, conv_param_->is_relu6_, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
    conv_param_->conv_quant_arg_.output_quant_args_[0].scale_, &conv_param_->conv_quant_arg_.out_act_min_[0],
    &conv_param_->conv_quant_arg_.out_act_max_[0]);
  bool relu = conv_param_->act_type_ == ActType_Relu;
  bool relu6 = conv_param_->act_type_ == ActType_Relu6;
  CalculateActivationRangeQuantized(relu, relu6, conv_param_->conv_quant_arg_.output_quant_args_[0].zp_,
                                    conv_param_->conv_quant_arg_.output_quant_args_[0].scale_,
                                    &conv_param_->conv_quant_arg_.out_act_min_[0],
                                    &conv_param_->conv_quant_arg_.out_act_max_[0]);
  return RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc
@@ -38,8 +38,7 @@ int Convolution1x1FP16CPUKernel::InitMatmulParam() {
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->row_16_ = UP_ROUND(matmul_param_->row_, C16NUM);
  matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
  matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No;
  matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_;
  matmul_param_->act_type_ = conv_param_->act_type_;
  return RET_OK;
 }
@@ -57,7 +56,7 @@ Convolution1x1FP16CPUKernel::~Convolution1x1FP16CPUKernel() {
 }
 int Convolution1x1FP16CPUKernel::InitConv1x1Param() {
  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                      conv_param_->stride_w_ != 1);
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc
@@ -237,8 +237,8 @@ int Convolution3x3FP16CPUKernel::Run() {
  }
  // get real output
  bool relu = conv_param_->is_relu_;
  bool relu6 = conv_param_->is_relu6_;
  bool relu = conv_param_->act_type_ == ActType_Relu;
  bool relu6 = conv_param_->act_type_ == ActType_Relu6;
  if (relu) {
    UnPack3x3ReluOutputFp16(tmp_out_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
                            conv_param_->output_w_, conv_param_->output_channel_);
--- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc
@@ -391,10 +391,10 @@ int ConvolutionWinogradFP16CPUKernel::Run() {
  }
  // get real output
  if (conv_param_->is_relu_) {
  if (conv_param_->act_type_ == ActType_Relu) {
    UnPackWinogradReluOutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
                                 conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
  } else if (conv_param_->is_relu6_) {
  } else if (conv_param_->act_type_ == ActType_Relu6) {
    UnPackWinogradRelu6OutputFp16(tmp_out_data_, execute_output_, conv_param_->output_batch_, conv_param_->output_h_,
                                  conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
  } else {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc
@@ -232,34 +232,31 @@ kernel::LiteKernel *CpuConvFp32KernelCreator(const std::vector<lite::tensor::Ten
  auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter);
  int kernel_h = conv_param->kernel_h_;
  int kernel_w = conv_param->kernel_w_;
  int stride_h = conv_param->stride_h_;
  int stride_w = conv_param->stride_w_;
  int dilation_h = conv_param->dilation_h_;
  int dilation_w = conv_param->dilation_w_;
  conv_param->input_h_ = inputs.front()->Height();
  conv_param->input_w_ = inputs.front()->Width();
  conv_param->input_channel_ = inputs.front()->Channel();
  conv_param->output_h_ = outputs.front()->Height();
  conv_param->output_w_ = outputs.front()->Width();
  conv_param->output_channel_ = outputs.front()->Channel();
  conv_param->op_parameter_.thread_num_ = ctx->thread_num_;
  bool use_winograd = false;
  bool use_sw = false;
  int out_unit;
  InputTransformUnitFunc input_trans_func = nullptr;
  OutputTransformUnitFunc output_trans_func = nullptr;
  if (primitive != nullptr && primitive->GetInferFlag()) {
    CheckIfUseWinograd(&use_winograd, &out_unit, conv_param, input_trans_func, output_trans_func);
    use_sw = CheckIfUseSlideWindow(conv_param);
  }
  kernel::LiteKernel *kernel;
  if (kernel_h == 1 && kernel_w == 1) {
    kernel = new (std::nothrow) kernel::Convolution1x1CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) {
    kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  } else if (use_winograd) {
    kernel =
      new (std::nothrow) kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
  } else if (use_sw) {
    kernel = new (std::nothrow) kernel::ConvolutionSWCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
    if (kernel_h == 3 && kernel_w == 3 && out_unit == 2) {
      kernel = new (std::nothrow) kernel::Convolution3x3CPUKernel(op_parameter, inputs, outputs, ctx, primitive);
    } else {
      kernel = new (std::nothrow)
        kernel::ConvolutionWinogradCPUKernel(op_parameter, inputs, outputs, ctx, primitive, out_unit);
    }
  } else {
    kernel = new (std::nothrow) kernel::ConvolutionCPUKernel(op_parameter, inputs, outputs, ctx, primitive);
  }
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc
@@ -65,8 +65,7 @@ void Convolution1x1CPUKernel::InitConv1x1MatmulParam() {
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->row_12_ = UP_ROUND(matmul_param_->row_, C12NUM);
  matmul_param_->col_8_ = UP_ROUND(matmul_param_->col_, C8NUM);
  matmul_param_->act_type_ = (conv_param_->is_relu6_) ? ActType_Relu6 : ActType_No;
  matmul_param_->act_type_ = (conv_param_->is_relu_) ? ActType_Relu : matmul_param_->act_type_;
  matmul_param_->act_type_ = conv_param_->act_type_;
  return;
 }
@@ -98,7 +97,7 @@ int Convolution1x1CPUKernel::InitConv1x1BiasWeight() {
 }
 int Convolution1x1CPUKernel::InitConv1x1Param() {
  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                      conv_param_->stride_w_ != 1);
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C8NUM));
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc
@@ -94,7 +94,9 @@ int Convolution3x3CPUKernel::InitWeightBias() {
 }
 int Convolution3x3CPUKernel::InitTmpBuffer() {
  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
  int oC4 = UP_DIV(conv_param_->output_channel_, C4NUM);
  int oC8 = UP_DIV(conv_param_->output_channel_, C8NUM);
  const int k_plane = 16;
  MS_ASSERT(ctx_->allocator != nullptr);
@@ -105,13 +107,20 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
    return RET_ERROR;
  }
  size_t tmp_dst_buffer_size = thread_count_ * TILE_NUM * k_plane * oC4 * C4NUM * sizeof(float);
  size_t tmp_dst_buffer_size = thread_count_ * C12NUM * k_plane * oC8 * C8NUM * sizeof(float);
  tmp_dst_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(tmp_dst_buffer_size));
  if (tmp_dst_buffer_ == nullptr) {
    MS_LOG(ERROR) << "malloc tmp_dst_buffer_ failed.";
    return RET_ERROR;
  }
  size_t col_buffer_size = thread_count_ * C12NUM * C4NUM * ic4 * sizeof(float);
  col_buffer_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(col_buffer_size));
  if (col_buffer_ == nullptr) {
    MS_LOG(ERROR) << "malloc col_buffer_ failed.";
    return RET_ERROR;
  }
  size_t nc4hw4_out_size =
    oC4 * C4NUM * conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * sizeof(float);
  nc4hw4_out_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(nc4hw4_out_size));
@@ -124,6 +133,7 @@ int Convolution3x3CPUKernel::InitTmpBuffer() {
  tmp_buffer_address_list_[1] = block_unit_buffer_;
  tmp_buffer_address_list_[2] = tmp_dst_buffer_;
  tmp_buffer_address_list_[3] = nc4hw4_out_;
  tmp_buffer_address_list_[4] = col_buffer_;
  return RET_OK;
 }
@@ -182,7 +192,7 @@ int Convolution3x3CPUKernel::ReSize() {
  }
  memset(nhwc4_input_, 0, nhwc4_input_size);
  size_t tile_buffer_size = thread_count_ * TILE_NUM * C16NUM * iC4 * C4NUM * sizeof(float);
  size_t tile_buffer_size = thread_count_ * C12NUM * C16NUM * iC4 * C4NUM * sizeof(float);
  tile_buffer_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
  if (tile_buffer_ == nullptr) {
    MS_LOG(ERROR) << "malloc tile buffer failed.";
@@ -237,8 +247,8 @@ int Convolution3x3CPUKernel::Run() {
    return RET_ERROR;
  }
  auto is_relu = conv_param_->is_relu_;
  auto is_relu6 = conv_param_->is_relu6_;
  auto is_relu = conv_param_->act_type_ == ActType_Relu;
  auto is_relu6 = conv_param_->act_type_ == ActType_Relu6;
  auto output_addr = reinterpret_cast<float *>(out_tensors_.at(kOutputIndex)->Data());
  if (is_relu) {
    PackNC4HW4ToNHWCReluFp32(nc4hw4_out_, output_addr, conv_param_->output_batch_,
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.h
@@ -60,14 +60,19 @@ class Convolution3x3CPUKernel : public ConvolutionBaseCPUKernel {
      ctx_->allocator->Free(nc4hw4_out_);
      nc4hw4_out_ = nullptr;
    }
    if (col_buffer_ != nullptr) {
      ctx_->allocator->Free(col_buffer_);
      col_buffer_ = nullptr;
    }
  }
  float *transformed_filter_addr_ = nullptr;
  float *tile_buffer_ = nullptr;
  float *block_unit_buffer_ = nullptr;
  float *tmp_dst_buffer_ = nullptr;
  float *col_buffer_ = nullptr;
  float *nc4hw4_out_ = nullptr;
  TmpBufferAddress tmp_buffer_address_list_[4];
  TmpBufferAddress tmp_buffer_address_list_[5];
  GEMM_FUNC_FP32 gemm_func_ = nullptr;
 };
 void ProcessFilter(float *origin_weight, float *dst_weight, ConvParameter *conv_param, int oc_block, int oc_block_num);
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc
@@ -76,7 +76,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
    int out_c_block = i / oc_block;
    int out_c_res = i % oc_block;
    int input_oz_offset = i * kernel_unit * kernel_unit * channel_in;
    int output_oz_offset = out_c_block * strides[1] * input_unit * input_unit + out_c_res;
    int output_oz_offset = out_c_block * strides[1] + out_c_res;
    for (int j = 0; j < channel_in; j++) {
      int ic4_block = j / C4NUM;
      int ic4_res = j % C4NUM;
@@ -93,7 +93,7 @@ int WinogradFilterTransform(const float *weight_data, Matrix *trans_weight, int
      MatrixMultiply(tmp_data, matrix_gt_data, trans_out_data, input_unit, kernel_unit, input_unit, row);
      for (int z = 0; z < input_unit_square; z++) {
        int output_xy_offset = output_iz_offset + z * strides[1];
        int output_xy_offset = output_iz_offset + z * strides[0];
        *(trans_weight_data + output_xy_offset) = trans_out_data[z];
      }
    }
@@ -151,7 +151,7 @@ int ConvolutionWinogradCPUKernel::InitWeightBias() {
 int ConvolutionWinogradCPUKernel::MallocFilterMatrix(int oc_block, int oc_block_num) {
  int channel_in = conv_param_->input_channel_;
  int ic4 = UP_DIV(channel_in, BLOCK);
  int ic4 = UP_DIV(channel_in, C4NUM);
  // set data
  auto trans_matrix_data_size = input_unit_ * input_unit_ * ic4 * C4NUM * oc_block_num * oc_block * sizeof(float);
@@ -196,10 +196,12 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
  int output_h = conv_param_->output_h_;
  int output_w = conv_param_->output_w_;
  int oc4 = UP_DIV(channel_out, C4NUM);
  int oc8 = UP_DIV(channel_out, C8NUM);
  int ic4 = UP_DIV(conv_param_->input_channel_, C4NUM);
  MS_ASSERT(ctx_->allocator != nullptr);
  gemm_out_ = reinterpret_cast<float *>(
    ctx_->allocator->Malloc(thread_count_ * TILE_NUM * input_unit_ * input_unit_ * oc4 * C4NUM * sizeof(float)));
    ctx_->allocator->Malloc(thread_count_ * C12NUM * input_unit_ * input_unit_ * oc8 * C8NUM * sizeof(float)));
  if (gemm_out_ == nullptr) {
    MS_LOG(ERROR) << "malloc gemm_out_ failed.";
    return RET_ERROR;
@@ -222,10 +224,18 @@ int ConvolutionWinogradCPUKernel::InitTmpBuffer() {
    return RET_ERROR;
  }
  col_buffer_ =
    reinterpret_cast<float *>(ctx_->allocator->Malloc(thread_count_ * C12NUM * ic4 * C4NUM * sizeof(float)));
  if (col_buffer_ == nullptr) {
    MS_LOG(ERROR) << "malloc col_buffer_ failed.";
    return RET_ERROR;
  }
  tmp_buffer_address_list_[0] = trans_input_;
  tmp_buffer_address_list_[1] = gemm_out_;
  tmp_buffer_address_list_[2] = tmp_out_data_;
  tmp_buffer_address_list_[3] = tmp_data_;
  tmp_buffer_address_list_[4] = col_buffer_;
  return RET_OK;
 }
@@ -306,7 +316,7 @@ int ConvolutionWinogradCPUKernel::ReSize() {
  }
  memset(nhwc4_input_, 0, nhwc4_input_size);
  size_t tile_buffer_size = thread_count_ * TILE_NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
  size_t tile_buffer_size = thread_count_ * C12NUM * input_unit_ * input_unit_ * ic4 * C4NUM * sizeof(float);
  trans_input_ = reinterpret_cast<float *>(malloc(tile_buffer_size));
  if (trans_input_ == nullptr) {
    MS_LOG(ERROR) << "malloc trans_input_ failed.";
@@ -370,10 +380,10 @@ int ConvolutionWinogradCPUKernel::Run() {
  // get real output
  auto out_tensor = out_tensors_.front();
  auto out_data = reinterpret_cast<float *>(out_tensor->Data());
  if (conv_param_->is_relu_) {
  if (conv_param_->act_type_ == ActType_Relu) {
    UnPackWinogradReluOutput(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
                             conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
  } else if (conv_param_->is_relu6_) {
  } else if (conv_param_->act_type_ == ActType_Relu6) {
    UnPackWinogradRelu6Output(tmp_out_data_, out_data, conv_param_->output_batch_, conv_param_->output_h_,
                              conv_param_->output_w_, conv_param_->output_channel_, output_unit_);
  } else {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.h
@@ -66,6 +66,10 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
      ctx_->allocator->Free(tmp_out_data_);
      tmp_out_data_ = nullptr;
    }
    if (col_buffer_ != nullptr) {
      ctx_->allocator->Free(col_buffer_);
      col_buffer_ = nullptr;
    }
  }
  int kernel_unit_;
  int input_unit_;
@@ -74,6 +78,7 @@ class ConvolutionWinogradCPUKernel : public ConvolutionBaseCPUKernel {
  float *trans_input_ = nullptr;
  float *gemm_out_ = nullptr;
  float *tmp_out_data_ = nullptr;
  float *col_buffer_ = nullptr;
  Matrix *trans_weight_ = nullptr;
  InputTransformUnitFunc input_trans_func_;
  OutputTransformUnitFunc output_trans_func_;
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -146,7 +146,7 @@ int Convolution1x1Int8CPUKernel::Init() {
 }
 int Convolution1x1Int8CPUKernel::InitParam() {
  pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 ||
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                      conv_param_->stride_w_ != 1);
  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -36,7 +36,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
    MS_LOG(ERROR) << "only support kh=kw=2 and stride_h=stride_w=2.";
    return RET_ERROR;
  }
  if (param->pad_h_ != 0 || param->pad_w_ != 0) {
  if (param->pad_u_ != 0 || param->pad_l_ != 0) {
    MS_LOG(ERROR) << "only support pad =0.";
    return RET_ERROR;
  }
@@ -170,7 +170,7 @@ int Conv2dTransposeOpenCLKernel::Run() {
  int co = out_tensors_[0]->Channel();
  int kh = param->kernel_h_;
  int kw = param->kernel_w_;
  int pad = param->pad_h_;
  int pad = param->pad_u_;
  int oh = out_tensors_[0]->Height();
  int ow = out_tensors_[0]->Width();
  int h = in_tensors_[0]->Height();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -382,9 +382,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
    "    }\n\n";
  code += "    FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";
  if (param->is_relu_) {
  if (param->act_type_ == ActType_Relu) {
    code += "    out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
  } else if (param->is_relu6_) {
  } else if (param->act_type_ == ActType_Relu6) {
    code += "    out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
  }
@@ -609,9 +609,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
    "        acc += bias[slice];\n";
  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
  if (param->is_relu_) {
  if (param->act_type_ == ActType_Relu) {
    code += "    acc = max(acc, (float4)(0.0f));\n";
  } else if (param->is_relu6_) {
  } else if (param->act_type_ == ActType_Relu6) {
    code += "    acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n";
  }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -163,7 +163,7 @@ int DepthwiseConv2dOpenCLKernel::Run() {
  float relu_clip1 = 6.0;
  cl_int2 kernel_size = {parameter->kernel_h_, parameter->kernel_w_};
  cl_int2 stride = {parameter->stride_h_, parameter->stride_w_};
  cl_int2 padding = {-parameter->pad_h_, -parameter->pad_w_};
  cl_int2 padding = {-parameter->pad_u_, -parameter->pad_l_};
  cl_int2 dilation = {parameter->dilation_h_, parameter->dilation_w_};
  cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
  cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/common/pack_tests.cc
@@ -47,8 +47,8 @@ void InitConvParamPack(ConvParameter *conv_param) {
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  conv_param->pad_u_ = 1;
  conv_param->pad_l_ = 1;
 }
 TEST_F(TestPack, PackInputFp32) {
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp16/convolution_fp16_tests.cc
@@ -50,8 +50,8 @@ void InitConvParamGroup1Fp16(ConvParameter *conv_param) {
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  conv_param->pad_u_ = 1;
  conv_param->pad_l_ = 1;
  conv_param->thread_num_ = 1;
 }
@@ -75,8 +75,8 @@ void InitConvParamGroup2Fp16(ConvParameter *conv_param) {
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  conv_param->pad_u_ = 1;
  conv_param->pad_l_ = 1;
  conv_param->thread_num_ = 1;
 }
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/conv1x1_fp32_tests.cc
@@ -50,7 +50,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) {
  conv_param->output_h_ = 4;
  conv_param->output_w_ = 5;
  conv_param->stride_h_ = conv_param->stride_w_ = 4;
  conv_param->pad_h_ = conv_param->pad_w_ = 2;
  conv_param->pad_u_ = conv_param->pad_l_ = 2;
  float out[20] = {0};
  Conv1x1InputPack(in, out, conv_param, sizeof(float));
@@ -91,7 +91,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) {
  conv_param->output_h_ = 7;
  conv_param->output_w_ = 4;
  conv_param->stride_h_ = conv_param->stride_w_ = 3;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  float out[28] = {0};
  Conv1x1InputPack(in, out, conv_param, sizeof(float));
@@ -105,7 +105,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) {
  conv_param->input_h_ = conv_param->input_w_ = 3;
  conv_param->output_h_ = conv_param->output_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  float in[] = {1.6767339, 12.25904,  19.018835, 3.0790641,  -9.252135, -8.685675, 3.6115494, 3.2282279, 17.025112,
                -5.052577, 12.750252, 12.701241, -8.9477215, -9.080522, 19.03931,  -6.501229, -4.122992, 9.540845};
@@ -124,7 +124,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) {
  conv_param->input_h_ = conv_param->input_w_ = 3;
  conv_param->output_h_ = conv_param->output_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  float in[] = {4.1795, 13.142, -3.593, 16.505, 19.899, 8.5562, 19.969, -6.235, -2.380, -9.027, 9.5542,
                18.974, 23.622, 8.3608, 47.325, -14.36, 15.370, 4.3049, -0.784, 37.925, -0.081, 6.1298,
                0.6721, -1.517, 37.998, 13.719, 11.029, 1.7127, -1.770, 41.903, 9.0560, 14.988, 3.1866,
@@ -281,8 +281,8 @@ int Conv1x1TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<l
  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  conv_param->act_type_ = ActType_No;
  return out_t->ElementsNum();
 }
@@ -348,9 +348,8 @@ int Conv1x1TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<l
  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = false;
  conv_param->is_relu6_ = false;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  conv_param->act_type_ = ActType_No;
  return out_t->ElementsNum();
 }
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_depthwise_fp32_tests.cc
@@ -47,8 +47,8 @@ void InitConvDwParam(ConvParameter *conv_param) {
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  conv_param->pad_u_ = 1;
  conv_param->pad_l_ = 1;
 }
 void InitConvDwCreator(std::vector<lite::tensor::Tensor *> *inputs, std::vector<lite::tensor::Tensor *> *outputs,
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/deconvolution_fp32_tests.cc
@@ -468,7 +468,7 @@ int DeConvTestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  return out_t->ElementsNum();
 }
@@ -537,7 +537,7 @@ int DeConvTestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  return out_t->ElementsNum();
 }
@@ -616,7 +616,7 @@ int DeConvTestInit3(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
  conv_param->kernel_h_ = conv_param->kernel_w_ = 2;
  conv_param->stride_h_ = conv_param->stride_w_ = 3;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 2;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  return out_t->ElementsNum();
 }
@@ -685,8 +685,8 @@ int DeConvTestInit4(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<li
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  conv_param->act_type_ = ActType_No;
  return out_t->ElementsNum();
 }
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/convolution_grad_fp32_tests.cc
@@ -52,12 +52,11 @@ void InitConvParamGroup1FP32(ConvParameter *conv_param) {
  conv_param->dilation_h_ = 1;
  conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = 1;
  conv_param->pad_w_ = 1;
  conv_param->pad_u_ = 1;
  conv_param->pad_l_ = 1;
  conv_param->group_ = 1;
  conv_param->is_relu_ = false;
  conv_param->is_relu6_ = false;
  conv_param->act_type_ = ActType_No;
  conv_param->thread_num_ = 1;
 }
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/conv_1x1_int8_tests.cc
@@ -34,7 +34,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack1) {
  conv_param->input_h_ = conv_param->input_w_ = 3;
  conv_param->output_h_ = conv_param->output_w_ = 3;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  int8_t in[] = {4,  13,  -3, 16, 19, 8,  19, -6, -2, -9, 9,  18, 23, 8,  47, -14, 15, 4,
                 -0, 37,  -0, 6,  0,  -1, 37, 13, 11, 1,  -1, 41, 9,  14, 3,  0,   8,  9,
                 14, -14, -8, -8, -8, 7,  19, 17, 13, 3,  9,  18, -1, -0, 18, 0,   4,  -2};
@@ -61,7 +61,7 @@ TEST_F(TestConv1x1Int8, Input1x1PrePack2) {
  conv_param->output_h_ = 4;
  conv_param->output_w_ = 5;
  conv_param->stride_h_ = conv_param->stride_w_ = 4;
  conv_param->pad_h_ = conv_param->pad_w_ = 2;
  conv_param->pad_u_ = conv_param->pad_l_ = 2;
  int8_t out[20] = {0};
  Conv1x1InputPack(in, out, conv_param, sizeof(int8_t));
@@ -111,8 +111,8 @@ int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_
  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  conv_param->act_type_ = ActType_No;
  return out_t->ElementsNum();
 }
@@ -178,8 +178,8 @@ int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vect
  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  conv_param->act_type_ = ActType_No;
  return out_t->ElementsNum();
 }
@@ -253,8 +253,8 @@ int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vect
  conv_param->kernel_h_ = conv_param->kernel_w_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 1;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  conv_param->pad_h_ = conv_param->pad_w_ = 0;
  conv_param->is_relu_ = conv_param->is_relu6_ = false;
  conv_param->pad_u_ = conv_param->pad_l_ = 0;
  conv_param->act_type_ = ActType_No;
  return out_t->ElementsNum();
 }
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/deconv_int8_tests.cc
@@ -343,7 +343,7 @@ int DeConvInt8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vecto
  PackNCHWToNHWCInt8(co_nchw, *correct, out_t->Batch(), out_t->Width() * out_t->Height(), out_t->Channel());
  conv_param->kernel_h_ = conv_param->kernel_w_ = 3;
  conv_param->pad_h_ = conv_param->pad_w_ = 1;
  conv_param->pad_u_ = conv_param->pad_l_ = 1;
  conv_param->stride_h_ = conv_param->stride_w_ = 2;
  conv_param->dilation_h_ = conv_param->dilation_w_ = 1;
  return out_t->ElementsNum();
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/conv2d_transpose_tests.cc
@@ -119,8 +119,8 @@ void RunTestCase(const std::vector<int> shape, const std::vector<std::string> fi
  opParameter->kernel_w_ = kw;
  opParameter->stride_h_ = 2;
  opParameter->stride_w_ = 2;
  opParameter->pad_h_ = pad;
  opParameter->pad_w_ = pad;
  opParameter->pad_u_ = pad;
  opParameter->pad_l_ = pad;
  opParameter->input_channel_ = ci;
  opParameter->output_channel_ = co;
  auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>(
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
@@ -169,8 +169,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNC4HW4Fp32) {
    conv_param->stride_w_ = 1;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
    conv_param->pad_h_ = 0;
    conv_param->pad_w_ = 0;
    conv_param->pad_u_ = 0;
    conv_param->pad_l_ = 0;
  }
  // nhwc
@@ -214,8 +214,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNC4HW4Fp32) {
    conv_param->stride_w_ = 1;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
    conv_param->pad_h_ = 1;
    conv_param->pad_w_ = 1;
    conv_param->pad_u_ = 1;
    conv_param->pad_l_ = 1;
  }
  // nhwc
@@ -286,8 +286,8 @@ TEST_F(TestConvolutionDwOpenCL, NoPadNHWC4Fp32) {
    conv_param->stride_w_ = 1;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
    conv_param->pad_h_ = 0;
    conv_param->pad_w_ = 0;
    conv_param->pad_u_ = 0;
    conv_param->pad_l_ = 0;
  }
  // nhwc
@@ -331,8 +331,8 @@ TEST_F(TestConvolutionDwOpenCL, PadNHWC4Fp32) {
    conv_param->stride_w_ = 1;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
    conv_param->pad_h_ = 1;
    conv_param->pad_w_ = 1;
    conv_param->pad_u_ = 1;
    conv_param->pad_l_ = 1;
  }
  // nhwc
@@ -405,8 +405,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwNoPadFp32) {
    conv_param->stride_w_ = 1;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
    conv_param->pad_h_ = 0;
    conv_param->pad_w_ = 0;
    conv_param->pad_u_ = 0;
    conv_param->pad_l_ = 0;
  }
  // nhwc
@@ -529,8 +529,8 @@ TEST_F(TestConvolutionDwOpenCL, ConvDwPadFp32) {
    conv_param->stride_w_ = 1;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
    conv_param->pad_h_ = 1;
    conv_param->pad_w_ = 1;
    conv_param->pad_u_ = 1;
    conv_param->pad_l_ = 1;
  }
  // nhwc
@@ -724,8 +724,8 @@ TEST_F(TestConvolutionDwOpenCL, ProfilingMobilenetv2) {
        conv_param->kernel_w_ = filter_shape[i][2];
        conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
        conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
        conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2;
        conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2;
        conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
        conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
        conv_param->dilation_h_ = 1;
        conv_param->dilation_w_ = 1;
      }
@@ -774,8 +774,8 @@ TEST_F(TestConvolutionDwOpenCL, Buffer2Image) {
    conv_param->kernel_w_ = filter_shape[2];
    conv_param->stride_h_ = conv_param->output_h_ / conv_param->input_h_;
    conv_param->stride_w_ = conv_param->output_w_ / conv_param->input_w_;
    conv_param->pad_h_ = (conv_param->kernel_h_ - 1) / 2;
    conv_param->pad_w_ = (conv_param->kernel_w_ - 1) / 2;
    conv_param->pad_u_ = (conv_param->kernel_h_ - 1) / 2;
    conv_param->pad_l_ = (conv_param->kernel_w_ - 1) / 2;
    conv_param->dilation_h_ = 1;
    conv_param->dilation_w_ = 1;
  }