fix pad/conv2d/winograd bugs

5 years ago · 779d60faeb
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
@@ -91,11 +91,11 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
    out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
  }

  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
  } else {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
  }
 #ifndef EXCEDD_MAX_IMAGE2D_WIDTH
  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
 #else
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
 #endif
 }

 __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -172,17 +172,17 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
    out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
  }

  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
    }  // end if (oh1 < OH)
  } else {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
    }  // end (oh1 < OH)
  }
 #ifndef EXCEDD_MAX_IMAGE2D_WIDTH
  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
  }  // end if (oh1 < OH)
 #else
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
  }  // end (oh1 < OH)
 #endif
 }

 __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -283,29 +283,27 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
    out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
  }

  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
 #ifndef EXCEDD_MAX_IMAGE2D_WIDTH
  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
  }  // end if (oh1 < OH)
  if (co_slice1 < CO_SLICES) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
    }  // end if (oh1 < OH)
    if (co_slice1 < CO_SLICES) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
      if (oh1 < OH) {
        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
      }  // end if (oh1 < OH)
    }    // end if (co_slice1 < CO_SLICES)
  } else {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
    }  // end (oh1 < OH)
    if (co_slice1 < CO_SLICES) {
      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
      if (oh1 < OH) {
        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
      }  // end if (oh1 < OH)
    }    // end if (co_slice1 < CO_SLICES)
  }
  }    // end if (co_slice1 < CO_SLICES)
 #else
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
  }  // end (oh1 < OH)
  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
  }  // end if (oh1 < OH)
 #endif
 }

 __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -456,37 +454,35 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
    out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
  }

  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
 #ifndef EXCEDD_MAX_IMAGE2D_WIDTH
  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
  }  // end if (oh1 < OH)
  if (co_slice1 < CO_SLICES) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
    }  // end if (oh1 < OH)
    if (co_slice1 < CO_SLICES) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
      if (oh1 < OH) {
        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
        WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
      }  // end if (oh1 < OH)
    }    // end if (co_slice1 < CO_SLICES)
  } else {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
    }  // end (oh1 < OH)
    if (co_slice1 < CO_SLICES) {
      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
      if (oh1 < OH) {
        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
      }  // end if (oh1 < OH)
    }    // end if (co_slice1 < CO_SLICES)
  }
  }    // end if (co_slice1 < CO_SLICES)
 #else
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
  }  // end (oh1 < OH)
  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
  }  // end if (oh1 < OH)
 #endif
 }

 __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2d_t output,
@@ -644,35 +640,33 @@ __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2
    out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
  }

  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
 #ifndef EXCEDD_MAX_IMAGE2D_WIDTH
  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
  }  // end if (oh1 < OH)
  if (co_slice1 < CO_SLICES) {
    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
    }  // end if (oh1 < OH)
    if (co_slice1 < CO_SLICES) {
      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
      if (oh1 < OH) {
        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
        WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
      }  // end if (oh1 < OH)
    }    // end if (co_slice1 < CO_SLICES)
  } else {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
    if (oh1 < OH) {
      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
    }  // end (oh1 < OH)
    if (co_slice1 < CO_SLICES) {
      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
      if (oh1 < OH) {
        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
      }  // end if (oh1 < OH)
    }    // end if (co_slice1 < CO_SLICES)
  }
  }    // end if (co_slice1 < CO_SLICES)
 #else
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
  }  // end (oh1 < OH)
  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
  if (oh1 < OH) {
    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
  }  // end if (oh1 < OH)
 #endif
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
@@ -35,29 +35,79 @@ __kernel void Winograd4x4To36(__read_only image2d_t input,    // height=N*H
  FLT4 BtD_row[6] = {0};
  int h = tile_h * 4 - pad;
  int w = tile_w * 4 - pad;
  for (int y = 0; y < 6; y++) {
    int x_idx = w * CI_SLICES + ci_slice;
    for (int x = 0; x < 6; x++) {
      // no need to check w: because ci_slice is in [0, CI_SLICES). when w<0, x_idx<0; w>=W, x_idx>=W*CI_SLICES
      // if (w < 0 || w >= W) { continue; }
      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, h));
      x_idx += CI_SLICES;
    }
    h++;
  }

  int y_idx = ci_slice * 36 + row * 6;
  for (int y = 0; y < 6; y++) {
    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
    for (int x = 0; x < 6; x++) {
      acc += BtD_row[x] * Bt[y * 6 + x];
    }
  int x_idx = w * CI_SLICES + ci_slice;
  FLT bt0 = Bt_row[0], bt1 = Bt_row[1], bt2 = Bt_row[2], bt3 = Bt_row[3], bt4 = Bt_row[4], bt5 = Bt_row[5];
  BtD_row[0] =
    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
  x_idx += CI_SLICES;
  BtD_row[1] =
    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
  x_idx += CI_SLICES;
  BtD_row[2] =
    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
  x_idx += CI_SLICES;
  BtD_row[3] =
    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
  x_idx += CI_SLICES;
  BtD_row[4] =
    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
  x_idx += CI_SLICES;
  BtD_row[5] =
    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));

 #if FP16_ENABLE
    acc = min(acc, HALF_MAX);
    acc = max(acc, -HALF_MAX);
 #ifndef HALF_MAX  // adreno not exist
 #define HALF_MAX 0x1.ffcp15h
 #endif
    WRITE_IMAGE(output, (int2)(tile_hw, y_idx + y), acc);
  }
 #define LimitAcc()          \
  acc = min(acc, HALF_MAX); \
  acc = max(acc, -HALF_MAX);
 #else
 #define LimitAcc() \
  {}
 #endif

  int y_idx = ci_slice * 36 + row * 6;
  FLT4 acc = BtD_row[0] + (FLT)(-2.5f) * BtD_row[2] + BtD_row[4];
  LimitAcc();
  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

  FLT4 tmp0 = (FLT)(0.9428091049f) * BtD_row[1] + (FLT)(-0.4714044929f) * BtD_row[3];
  FLT4 tmp1 = (FLT)(1.3333333731f) * BtD_row[2] + (FLT)(-0.6666667461f) * BtD_row[4];
  acc = tmp0 + tmp1;
  LimitAcc();
  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

  acc = -tmp0 + tmp1;
  LimitAcc();
  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

  tmp0 = (FLT)(-0.1178511307f) * BtD_row[1] + (FLT)(0.2357022613f) * BtD_row[3];
  tmp1 = (FLT)(-0.0833333358f) * BtD_row[2] + (FLT)(0.1666666865f) * BtD_row[4];
  acc = tmp0 + tmp1;
  LimitAcc();
  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

  acc = -tmp0 + tmp1;
  LimitAcc();
  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

  acc = BtD_row[1] + (FLT)(-2.5f) * BtD_row[3] + BtD_row[5];
  LimitAcc();
  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
 }

 __kernel void WinogradConv2D(__read_only image2d_t input,    // height=CI_SLICES*36    width=TILE_HW
@@ -181,6 +231,22 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f,  1.000000000
                       0.0000000000f, 0.4999999702f, 0.4999999702f,  1.9999998808f, 1.9999998808f,  0.0000000000f,
                       0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};

 #define UpdateAcc()                                   \
  if (bias != 0) acc += bias[co_slice];               \
  if (act_type == ActivationType_RELU) {              \
    acc = max(acc, (FLT4)(0.0f));                     \
  } else if (act_type == ActivationType_RELU6) {      \
    acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));     \
  } else if (act_type == ActivationType_TANH) {       \
    FLT4 exp0 = exp(acc);                             \
    FLT4 exp1 = exp(-acc);                            \
    acc = (exp0 - exp1) / (exp0 + exp1);              \
  } else if (act_type == ActivationType_LEAKY_RELU) { \
    DO_LEAKY_RELU(acc, alpha);                        \
  } else if (act_type == ActivationType_SIGMOID) {    \
    acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));    \
  }

 __kernel void Winograd36To4x4(__read_only image2d_t input,    // height=CO_SLICES*36    width=TILE_HW
                              __write_only image2d_t output,  // height=N*H             width=W*CO_SLICES
                              __global FLT4 *bias,
@@ -198,11 +264,49 @@ __kernel void Winograd36To4x4(__read_only image2d_t input,    // height=CO_SLICE

  constant FLT *At_row = At + row * 6;
  FLT4 AtM_row[6] = {0};
  for (int y = 0, idx = co_slice * 36; y < 6; y++) {
    for (int x = 0; x < 6; x++, idx++) {
      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx));
    }
  }
  int idx = co_slice * 36;
  FLT at = At_row[0];
  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  at = At_row[1];
  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  at = At_row[2];
  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  at = At_row[3];
  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  at = At_row[4];
  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  at = At_row[5];
  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));

  int TILE_W = UP_DIV(W, 4);
  int tile_w = tile_hw % TILE_W;
@@ -210,30 +314,24 @@ __kernel void Winograd36To4x4(__read_only image2d_t input,    // height=CO_SLICE
  int h = tile_h * 4 + row;
  int w = tile_w * 4;
  int x_idx = w * CO_SLICES + co_slice;
  for (int x = 0, idx = 0; x < 4; x++) {
    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
    for (int y = 0; y < 6; y++, idx++) {
      acc += AtM_row[y] * At[idx];
    }

    if (bias != 0) {
      acc += bias[co_slice];
    }

    if (act_type == ActivationType_RELU) {
      acc = max(acc, (FLT4)(0.0f));
    } else if (act_type == ActivationType_RELU6) {
      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
    } else if (act_type == ActivationType_TANH) {
      FLT4 exp0 = exp(acc);
      FLT4 exp1 = exp(-acc);
      acc = (exp0 - exp1) / (exp0 + exp1);
    } else if (act_type == ActivationType_LEAKY_RELU) {
      DO_LEAKY_RELU(acc, alpha);
    } else if (act_type == ActivationType_SIGMOID) {
      acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
    }
    WRITE_IMAGE(output, (int2)(x_idx, h), acc);
    x_idx += CO_SLICES;
  }

  FLT4 acc = AtM_row[0] + AtM_row[1] + AtM_row[2] + AtM_row[3] + AtM_row[4];
  UpdateAcc();
  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
  x_idx += CO_SLICES;

  acc = (FLT)(0.7071067691f) * (AtM_row[1] - AtM_row[2]) + (FLT)(1.4142135382f) * (AtM_row[3] - AtM_row[4]);
  UpdateAcc();
  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
  x_idx += CO_SLICES;

  acc = (FLT)(0.5f) * (AtM_row[1] + AtM_row[2]) + (FLT)(2.0f) * (AtM_row[3] + AtM_row[4]);
  UpdateAcc();
  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
  x_idx += CO_SLICES;

  acc =
    (FLT)(0.3535533845f) * (AtM_row[1] - AtM_row[2]) + (FLT)(2.8284270763f) * (AtM_row[3] - AtM_row[4]) + AtM_row[5];
  UpdateAcc();
  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -144,7 +144,9 @@ void Conv2DOpenCLKernel::BuildKernel() {
    kernel_name << "_Img";
  }
  ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str());
  std::string build_option =
    (OW_ * CO_SLICES_ <= ocl_runtime_->GetMaxImage2DWidth()) ? "" : " -DEXCEDD_MAX_IMAGE2D_WIDTH";
  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), {build_option});
 }

 void Conv2DOpenCLKernel::SetBlockSize() {
@@ -436,13 +438,6 @@ OpParameter *CreateFcParam(const ConvParameter *conv_param, const std::vector<li

 bool UseWinograd4x4To6x6(const ConvParameter *param, const std::vector<lite::Tensor *> &inputs,
                         const std::vector<lite::Tensor *> &outputs) {
  // not use winograd on adreno gpu
  lite::opencl::OpenCLRuntimeWrapper runtime_wrap;
  lite::opencl::OpenCLRuntime *runtime = runtime_wrap.GetInstance();
  if (runtime->GetGpuInfo().type == lite::opencl::GpuType::ADRENO) {
    return false;
  }

  if (!(inputs.size() == 2 || inputs.size() == 3) || outputs.empty()) {
    return false;
  }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
@@ -62,6 +62,11 @@ int PadOpenCLKernel::CheckSpecs() {
    MS_LOG(ERROR) << "Pad only support CONSTANT MODE.";
    return RET_ERROR;
  }
  auto pad_shape = in_tensors_.at(1)->shape();
  if (pad_shape.size() != 2 || pad_shape[0] != in_ndim || pad_shape[1] != 2) {
    MS_LOG(ERROR) << "pad tensor shape invalid.";
    return RET_ERROR;
  }
  return RET_OK;
 }

@@ -86,8 +91,9 @@ void PadOpenCLKernel::SetConstArgs() {
  int ndim = in_tensors_.front()->shape().size();
  std::vector<int> pad_before_ori;
  pad_before_ori.reserve(ndim);
  auto paddings = reinterpret_cast<int32_t *>(in_tensors_.at(1)->data_c());
  for (size_t i = 0; i < ndim; i++) {
    pad_before_ori.push_back(param_->paddings_[MAX_PAD_SIZE - 2 * ndim + 2 * i]);
    pad_before_ori.push_back(paddings[2 * i]);
  }
  cl_int4 pad_before;
  Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
@@ -15,7 +15,9 @@
 */

 #include "src/runtime/kernel/opencl/kernel/winograd.h"
 #include <memory>
 #include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
 #include "nnacl/base/minimal_filtering_generator.h"

 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
@@ -30,13 +32,15 @@ void Align(const std::vector<int> &global, const std::vector<int> &local, cl::ND
    cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
 }

 constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
                        0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
                        0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};

 #ifndef ENABLE_ARM64
 constexpr float G[] = {1.0000000000, 0.0000000000,  0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
                       1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
                       1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
 std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, size_t CI) {
  constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
                          0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
                          0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};
  constexpr float G[] = {1.0000000000, 0.0000000000,  0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
                         1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
                         1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
  auto src_fp32 = reinterpret_cast<float *>(src);
  auto src_fp16 = reinterpret_cast<float16_t *>(src);
  std::function<float(int)> access_func;
@@ -71,6 +75,8 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si
  }
  return dst;
 }
 #endif

 }  // namespace

 void WinogradOpenCLKernel::BuildKernel() {
@@ -106,9 +112,17 @@ void WinogradOpenCLKernel::InitFilter() {

  // rearrange filter
  auto filter_tensor = in_tensors_.at(1);
 #ifndef ENABLE_ARM64
  auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);

  void *src_data = winograd_filter.data();
 #else
  std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
  WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()),
                          reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);

  void *src_data = winograd_filter.get();
 #endif

  auto src_dtype = kNumberTypeFloat32;
  auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
  std::vector<char> tmp(size, 0);
@@ -173,8 +187,8 @@ void WinogradOpenCLKernel::SetConstArgs() {

 void WinogradOpenCLKernel::SetGlobalLocal() {
  Align({TILE_HW_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_);
  Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_range_, &local_range_);
  Align({TILE_HW_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_);
  Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 3, 8}, &global_range_, &local_range_);
  Align({TILE_HW_, 4, CO_SLICES_}, {4, 4, 8}, &global_36to4x4_, &local_36to4x4_);
 }

 int WinogradOpenCLKernel::Run() {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
@@ -29,7 +29,6 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
                       const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
      : Conv2DOpenCLKernel(parameter, inputs, outputs, ctx) {
    use_winograd_ = true;
    filter_type_ = MemType::BUF;
  }

  ~WinogradOpenCLKernel() override = default;
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
@@ -243,10 +243,11 @@ void TryMergePadXxx(LiteKernel *node, std::set<LiteKernel *> *removed_set, std::

  auto *conv_param = reinterpret_cast<ParamType *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter());
  MS_ASSERT(conv_param);
  conv_param->pad_u_ += pad_param->paddings_[2];
  conv_param->pad_d_ += pad_param->paddings_[3];
  conv_param->pad_l_ += pad_param->paddings_[4];
  conv_param->pad_r_ += pad_param->paddings_[5];
  auto paddings = reinterpret_cast<int32_t *>(pad->in_tensors().at(1)->data_c());
  conv_param->pad_u_ += paddings[2];
  conv_param->pad_d_ += paddings[3];
  conv_param->pad_l_ += paddings[4];
  conv_param->pad_r_ += paddings[5];
  pad->set_in_tensors({pad->in_tensors().front()});
  MergeRemoveA(pad, node, removed_set);
  MS_LOG(DEBUG) << "Merge Pad and " + GetTypeName(node) + " success";