diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
index 674bc64c91..f31b4045c5 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
@@ -91,11 +91,11 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
     out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
   }
 
-  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
-    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
-  } else {
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
-  }
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+#endif
 }
 
 __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -172,17 +172,17 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
     out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
   }
 
-  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
-    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
-    if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
-    }  // end if (oh1 < OH)
-  } else {
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
-    if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
-    }  // end (oh1 < OH)
-  }
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+  }  // end if (oh1 < OH)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+  }  // end (oh1 < OH)
+#endif
 }
 
 __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -283,29 +283,27 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
     out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
   }
 
-  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
-    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+  }  // end if (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
     if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
     }  // end if (oh1 < OH)
-    if (co_slice1 < CO_SLICES) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
-      if (oh1 < OH) {
-        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
-      }  // end if (oh1 < OH)
-    }    // end if (co_slice1 < CO_SLICES)
-  } else {
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
-    if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
-    }  // end (oh1 < OH)
-    if (co_slice1 < CO_SLICES) {
-      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
-      if (oh1 < OH) {
-        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
-      }  // end if (oh1 < OH)
-    }    // end if (co_slice1 < CO_SLICES)
-  }
+  }    // end if (co_slice1 < CO_SLICES)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+  }  // end (oh1 < OH)
+  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
+  }  // end if (oh1 < OH)
+#endif
 }
 
 __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -456,37 +454,35 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
     out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
   }
 
-  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
-    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
-    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+  }  // end if (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
     if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
-      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
     }  // end if (oh1 < OH)
-    if (co_slice1 < CO_SLICES) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
-      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
-      if (oh1 < OH) {
-        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
-        WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
-      }  // end if (oh1 < OH)
-    }    // end if (co_slice1 < CO_SLICES)
-  } else {
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
-    if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
-      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
-    }  // end (oh1 < OH)
-    if (co_slice1 < CO_SLICES) {
-      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
-      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
-      if (oh1 < OH) {
-        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
-        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
-      }  // end if (oh1 < OH)
-    }    // end if (co_slice1 < CO_SLICES)
-  }
+  }    // end if (co_slice1 < CO_SLICES)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
+  }  // end (oh1 < OH)
+  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
+  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
+  }  // end if (oh1 < OH)
+#endif
 }
 
 __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2d_t output,
@@ -644,35 +640,33 @@ __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2
     out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
   }
 
-  if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
-    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
-    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
+  WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+  }  // end if (oh1 < OH)
+  if (co_slice1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
     if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
-      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
     }  // end if (oh1 < OH)
-    if (co_slice1 < CO_SLICES) {
-      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
-      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
-      if (oh1 < OH) {
-        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
-        WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
-      }  // end if (oh1 < OH)
-    }    // end if (co_slice1 < CO_SLICES)
-  } else {
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
-    WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
-    if (oh1 < OH) {
-      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
-      WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
-    }  // end (oh1 < OH)
-    if (co_slice1 < CO_SLICES) {
-      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
-      WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
-      if (oh1 < OH) {
-        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
-        WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
-      }  // end if (oh1 < OH)
-    }    // end if (co_slice1 < CO_SLICES)
-  }
+  }    // end if (co_slice1 < CO_SLICES)
+#else
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
+  WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
+    WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
+  }  // end (oh1 < OH)
+  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
+  WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
+  if (oh1 < OH) {
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
+    WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
+  }  // end if (oh1 < OH)
+#endif
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
index 6eb7f96a9f..ca3b53af42 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
@@ -35,29 +35,79 @@ __kernel void Winograd4x4To36(__read_only image2d_t input,    // height=N*H
   FLT4 BtD_row[6] = {0};
   int h = tile_h * 4 - pad;
   int w = tile_w * 4 - pad;
-  for (int y = 0; y < 6; y++) {
-    int x_idx = w * CI_SLICES + ci_slice;
-    for (int x = 0; x < 6; x++) {
-      // no need to check w: because ci_slice is in [0, CI_SLICES). when w<0, x_idx<0; w>=W, x_idx>=W*CI_SLICES
-      // if (w < 0 || w >= W) { continue; }
-      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, h));
-      x_idx += CI_SLICES;
-    }
-    h++;
-  }
 
-  int y_idx = ci_slice * 36 + row * 6;
-  for (int y = 0; y < 6; y++) {
-    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    for (int x = 0; x < 6; x++) {
-      acc += BtD_row[x] * Bt[y * 6 + x];
-    }
+  int x_idx = w * CI_SLICES + ci_slice;
+  FLT bt0 = Bt_row[0], bt1 = Bt_row[1], bt2 = Bt_row[2], bt3 = Bt_row[3], bt4 = Bt_row[4], bt5 = Bt_row[5];
+  BtD_row[0] =
+    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
+    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
+    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
+  x_idx += CI_SLICES;
+  BtD_row[1] =
+    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
+    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
+    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
+  x_idx += CI_SLICES;
+  BtD_row[2] =
+    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
+    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
+    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
+  x_idx += CI_SLICES;
+  BtD_row[3] =
+    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
+    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
+    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
+  x_idx += CI_SLICES;
+  BtD_row[4] =
+    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
+    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
+    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
+  x_idx += CI_SLICES;
+  BtD_row[5] =
+    bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
+    bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
+    bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
+
 #if FP16_ENABLE
-    acc = min(acc, HALF_MAX);
-    acc = max(acc, -HALF_MAX);
+#ifndef HALF_MAX  // adreno not exist
+#define HALF_MAX 0x1.ffcp15h
 #endif
-    WRITE_IMAGE(output, (int2)(tile_hw, y_idx + y), acc);
-  }
+#define LimitAcc()          \
+  acc = min(acc, HALF_MAX); \
+  acc = max(acc, -HALF_MAX);
+#else
+#define LimitAcc() \
+  {}
+#endif
+
+  int y_idx = ci_slice * 36 + row * 6;
+  FLT4 acc = BtD_row[0] + (FLT)(-2.5f) * BtD_row[2] + BtD_row[4];
+  LimitAcc();
+  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
+
+  FLT4 tmp0 = (FLT)(0.9428091049f) * BtD_row[1] + (FLT)(-0.4714044929f) * BtD_row[3];
+  FLT4 tmp1 = (FLT)(1.3333333731f) * BtD_row[2] + (FLT)(-0.6666667461f) * BtD_row[4];
+  acc = tmp0 + tmp1;
+  LimitAcc();
+  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
+
+  acc = -tmp0 + tmp1;
+  LimitAcc();
+  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
+
+  tmp0 = (FLT)(-0.1178511307f) * BtD_row[1] + (FLT)(0.2357022613f) * BtD_row[3];
+  tmp1 = (FLT)(-0.0833333358f) * BtD_row[2] + (FLT)(0.1666666865f) * BtD_row[4];
+  acc = tmp0 + tmp1;
+  LimitAcc();
+  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
+
+  acc = -tmp0 + tmp1;
+  LimitAcc();
+  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
+
+  acc = BtD_row[1] + (FLT)(-2.5f) * BtD_row[3] + BtD_row[5];
+  LimitAcc();
+  WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
 }
 
 __kernel void WinogradConv2D(__read_only image2d_t input,    // height=CI_SLICES*36    width=TILE_HW
@@ -181,6 +231,22 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f,  1.000000000
                        0.0000000000f, 0.4999999702f, 0.4999999702f,  1.9999998808f, 1.9999998808f,  0.0000000000f,
                        0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};
 
+#define UpdateAcc()                                   \
+  if (bias != 0) acc += bias[co_slice];               \
+  if (act_type == ActivationType_RELU) {              \
+    acc = max(acc, (FLT4)(0.0f));                     \
+  } else if (act_type == ActivationType_RELU6) {      \
+    acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));     \
+  } else if (act_type == ActivationType_TANH) {       \
+    FLT4 exp0 = exp(acc);                             \
+    FLT4 exp1 = exp(-acc);                            \
+    acc = (exp0 - exp1) / (exp0 + exp1);              \
+  } else if (act_type == ActivationType_LEAKY_RELU) { \
+    DO_LEAKY_RELU(acc, alpha);                        \
+  } else if (act_type == ActivationType_SIGMOID) {    \
+    acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));    \
+  }
+
 __kernel void Winograd36To4x4(__read_only image2d_t input,    // height=CO_SLICES*36    width=TILE_HW
                               __write_only image2d_t output,  // height=N*H             width=W*CO_SLICES
                               __global FLT4 *bias,
@@ -198,11 +264,49 @@ __kernel void Winograd36To4x4(__read_only image2d_t input,    // height=CO_SLICE
 
   constant FLT *At_row = At + row * 6;
   FLT4 AtM_row[6] = {0};
-  for (int y = 0, idx = co_slice * 36; y < 6; y++) {
-    for (int x = 0; x < 6; x++, idx++) {
-      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx));
-    }
-  }
+  int idx = co_slice * 36;
+  FLT at = At_row[0];
+  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  at = At_row[1];
+  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  at = At_row[2];
+  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  at = At_row[3];
+  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  at = At_row[4];
+  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  at = At_row[5];
+  AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
+  AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
 
   int TILE_W = UP_DIV(W, 4);
   int tile_w = tile_hw % TILE_W;
@@ -210,30 +314,24 @@ __kernel void Winograd36To4x4(__read_only image2d_t input,    // height=CO_SLICE
   int h = tile_h * 4 + row;
   int w = tile_w * 4;
   int x_idx = w * CO_SLICES + co_slice;
-  for (int x = 0, idx = 0; x < 4; x++) {
-    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    for (int y = 0; y < 6; y++, idx++) {
-      acc += AtM_row[y] * At[idx];
-    }
-
-    if (bias != 0) {
-      acc += bias[co_slice];
-    }
-
-    if (act_type == ActivationType_RELU) {
-      acc = max(acc, (FLT4)(0.0f));
-    } else if (act_type == ActivationType_RELU6) {
-      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
-    } else if (act_type == ActivationType_TANH) {
-      FLT4 exp0 = exp(acc);
-      FLT4 exp1 = exp(-acc);
-      acc = (exp0 - exp1) / (exp0 + exp1);
-    } else if (act_type == ActivationType_LEAKY_RELU) {
-      DO_LEAKY_RELU(acc, alpha);
-    } else if (act_type == ActivationType_SIGMOID) {
-      acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
-    }
-    WRITE_IMAGE(output, (int2)(x_idx, h), acc);
-    x_idx += CO_SLICES;
-  }
+
+  FLT4 acc = AtM_row[0] + AtM_row[1] + AtM_row[2] + AtM_row[3] + AtM_row[4];
+  UpdateAcc();
+  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
+  x_idx += CO_SLICES;
+
+  acc = (FLT)(0.7071067691f) * (AtM_row[1] - AtM_row[2]) + (FLT)(1.4142135382f) * (AtM_row[3] - AtM_row[4]);
+  UpdateAcc();
+  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
+  x_idx += CO_SLICES;
+
+  acc = (FLT)(0.5f) * (AtM_row[1] + AtM_row[2]) + (FLT)(2.0f) * (AtM_row[3] + AtM_row[4]);
+  UpdateAcc();
+  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
+  x_idx += CO_SLICES;
+
+  acc =
+    (FLT)(0.3535533845f) * (AtM_row[1] - AtM_row[2]) + (FLT)(2.8284270763f) * (AtM_row[3] - AtM_row[4]) + AtM_row[5];
+  UpdateAcc();
+  WRITE_IMAGE(output, (int2)(x_idx, h), acc);
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index 8eef95289a..aa745b6613 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -144,7 +144,9 @@ void Conv2DOpenCLKernel::BuildKernel() {
     kernel_name << "_Img";
   }
   ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str());
+  std::string build_option =
+    (OW_ * CO_SLICES_ <= ocl_runtime_->GetMaxImage2DWidth()) ? "" : " -DEXCEDD_MAX_IMAGE2D_WIDTH";
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), {build_option});
 }
 
 void Conv2DOpenCLKernel::SetBlockSize() {
@@ -436,13 +438,6 @@ OpParameter *CreateFcParam(const ConvParameter *conv_param, const std::vector<li
 
 bool UseWinograd4x4To6x6(const ConvParameter *param, const std::vector<lite::Tensor *> &inputs,
                          const std::vector<lite::Tensor *> &outputs) {
-  // not use winograd on adreno gpu
-  lite::opencl::OpenCLRuntimeWrapper runtime_wrap;
-  lite::opencl::OpenCLRuntime *runtime = runtime_wrap.GetInstance();
-  if (runtime->GetGpuInfo().type == lite::opencl::GpuType::ADRENO) {
-    return false;
-  }
-
   if (!(inputs.size() == 2 || inputs.size() == 3) || outputs.empty()) {
     return false;
   }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
index 49b5ca0d7c..0ab92e2b1e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
@@ -62,6 +62,11 @@ int PadOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "Pad only support CONSTANT MODE.";
     return RET_ERROR;
   }
+  auto pad_shape = in_tensors_.at(1)->shape();
+  if (pad_shape.size() != 2 || pad_shape[0] != in_ndim || pad_shape[1] != 2) {
+    MS_LOG(ERROR) << "pad tensor shape invalid.";
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -86,8 +91,9 @@ void PadOpenCLKernel::SetConstArgs() {
   int ndim = in_tensors_.front()->shape().size();
   std::vector<int> pad_before_ori;
   pad_before_ori.reserve(ndim);
+  auto paddings = reinterpret_cast<int32_t *>(in_tensors_.at(1)->data_c());
   for (size_t i = 0; i < ndim; i++) {
-    pad_before_ori.push_back(param_->paddings_[MAX_PAD_SIZE - 2 * ndim + 2 * i]);
+    pad_before_ori.push_back(paddings[2 * i]);
   }
   cl_int4 pad_before;
   Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0);
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
index 6256ec1356..96508fe0cc 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
@@ -15,7 +15,9 @@
  */
 
 #include "src/runtime/kernel/opencl/kernel/winograd.h"
+#include <memory>
 #include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
+#include "nnacl/base/minimal_filtering_generator.h"
 
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
@@ -30,13 +32,15 @@ void Align(const std::vector<int> &global, const std::vector<int> &local, cl::ND
     cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
 }
 
+constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
+                        0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
+                        0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};
+
+#ifndef ENABLE_ARM64
+constexpr float G[] = {1.0000000000, 0.0000000000,  0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
+                       1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
+                       1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
 std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, size_t CI) {
-  constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000,  1.0000000000, 1.0000000000,  0.0000000000,
-                          0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
-                          0.0000000000, 0.4999999702, 0.4999999702,  1.9999998808, 1.9999998808,  1.0000000000};
-  constexpr float G[] = {1.0000000000, 0.0000000000,  0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
-                         1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
-                         1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
   auto src_fp32 = reinterpret_cast<float *>(src);
   auto src_fp16 = reinterpret_cast<float16_t *>(src);
   std::function<float(int)> access_func;
@@ -71,6 +75,8 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si
   }
   return dst;
 }
+#endif
+
 }  // namespace
 
 void WinogradOpenCLKernel::BuildKernel() {
@@ -106,9 +112,17 @@ void WinogradOpenCLKernel::InitFilter() {
 
   // rearrange filter
   auto filter_tensor = in_tensors_.at(1);
+#ifndef ENABLE_ARM64
   auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);
-
   void *src_data = winograd_filter.data();
+#else
+  std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
+  WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()),
+                          reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);
+
+  void *src_data = winograd_filter.get();
+#endif
+
   auto src_dtype = kNumberTypeFloat32;
   auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
   std::vector<char> tmp(size, 0);
@@ -173,8 +187,8 @@ void WinogradOpenCLKernel::SetConstArgs() {
 
 void WinogradOpenCLKernel::SetGlobalLocal() {
   Align({TILE_HW_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_);
-  Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_range_, &local_range_);
-  Align({TILE_HW_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_);
+  Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 3, 8}, &global_range_, &local_range_);
+  Align({TILE_HW_, 4, CO_SLICES_}, {4, 4, 8}, &global_36to4x4_, &local_36to4x4_);
 }
 
 int WinogradOpenCLKernel::Run() {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
index e2a48d4c63..edff463cbb 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
@@ -29,7 +29,6 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
                        const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
       : Conv2DOpenCLKernel(parameter, inputs, outputs, ctx) {
     use_winograd_ = true;
-    filter_type_ = MemType::BUF;
   }
 
   ~WinogradOpenCLKernel() override = default;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
index 8302a2b456..5a97caf666 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
@@ -243,10 +243,11 @@ void TryMergePadXxx(LiteKernel *node, std::set<LiteKernel *> *removed_set, std::
 
   auto *conv_param = reinterpret_cast<ParamType *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter());
   MS_ASSERT(conv_param);
-  conv_param->pad_u_ += pad_param->paddings_[2];
-  conv_param->pad_d_ += pad_param->paddings_[3];
-  conv_param->pad_l_ += pad_param->paddings_[4];
-  conv_param->pad_r_ += pad_param->paddings_[5];
+  auto paddings = reinterpret_cast<int32_t *>(pad->in_tensors().at(1)->data_c());
+  conv_param->pad_u_ += paddings[2];
+  conv_param->pad_d_ += paddings[3];
+  conv_param->pad_l_ += paddings[4];
+  conv_param->pad_r_ += paddings[5];
   pad->set_in_tensors({pad->in_tensors().front()});
   MergeRemoveA(pad, node, removed_set);
   MS_LOG(DEBUG) << "Merge Pad and " + GetTypeName(node) + " success";