diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index e6f3c9da2e..943b84c04d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -310,3 +310,13 @@ __kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data,
   }
   dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
 }
+__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
+                                                int4 shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {
+    return;
+  }
+  dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index a7f0ea66cd..725c8817ab 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -21,24 +21,31 @@
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
-#include "nnacl/fp32/common_func.h"
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
+using mindspore::schema::Format::Format_NC4HW4;
+using mindspore::schema::Format::Format_NCHW;
+using mindspore::schema::Format::Format_NHWC;
+using mindspore::schema::Format::Format_NHWC4;
 namespace mindspore::kernel {
+constexpr size_t CI_TILE = C4NUM;
+constexpr size_t CO_TILE = C4NUM;
+
 int ConvolutionOpenCLKernel::Init() {
   static int init_count = 0;
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   std::set<std::string> build_options;
   init_count++;
   use_fp16_ = ocl_runtime->GetFp16Enable();
-  if (op_format_ != schema::Format::Format_NHWC4 && op_format_ != schema::Format::Format_NC4HW4) {
+  if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
     MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
   }
   in_ori_format_ = in_tensors_[0]->GetFormat();
@@ -46,19 +53,21 @@ int ConvolutionOpenCLKernel::Init() {
   in_tensors_[0]->SetFormat(op_format_);
   out_tensors_[0]->SetFormat(op_format_);
-  CI = in_tensors_[0]->Channel();
-  IH = in_tensors_[0]->Height();
-  IW = in_tensors_[0]->Width();
-  CO = out_tensors_[0]->Channel();
-  OH = out_tensors_[0]->Height();
-  OW = out_tensors_[0]->Width();
-  CI_SLICES = UP_DIV(CI, C4NUM);
-  CO_SLICES = UP_DIV(CO, C4NUM);
+  CI_ = in_tensors_[0]->Channel();
+  IH_ = in_tensors_[0]->Height();
+  IW_ = in_tensors_[0]->Width();
+  CO_ = out_tensors_[0]->Channel();
+  OH_ = out_tensors_[0]->Height();
+  OW_ = out_tensors_[0]->Width();
+  CI_SLICES_ = UP_DIV(CI_, C4NUM);
+  CO_SLICES_ = UP_DIV(CO_, C4NUM);
+  KH_ = param->kernel_h_;
+  KW_ = param->kernel_w_;
   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
-  TILES_X = UP_DIV(OW, 4);
-  TILES_Y = UP_DIV(OH, 4);
-  TILES_XY = TILES_X * TILES_Y;
+  TILES_X_ = UP_DIV(OW_, 4);
+  TILES_Y_ = UP_DIV(OH_, 4);
+  TILES_XY_ = TILES_X_ * TILES_Y_;
   use_winograd_ = UseWinograd4x4To6x6();
   // build kernel
@@ -67,36 +76,34 @@ int ConvolutionOpenCLKernel::Init() {
     std::string program_name;
     program_name = "Winograd4x4To36" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
-    ocl_runtime->BuildKernel(kernel_4x4to36, program_name, "Winograd4x4To36", build_options);
+    ocl_runtime->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
     program_name = "WinogradConvolution" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
-    ocl_runtime->BuildKernel(kernel_conv, program_name, "WinogradConvolution", build_options);
+    ocl_runtime->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
     program_name = "Winograd36To4x4" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
-    ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options);
+    ocl_runtime->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
   } else {
     std::string program_name = "convolution" + std::to_string(init_count);
-    std::string source =
-      op_format_ == schema::Format::Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
+    std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
     ocl_runtime->LoadSource(program_name, source);
-    ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options);
+    ocl_runtime->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
   }
   // allocate winograd memory
   if (use_winograd_) {
     size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-    size_t sizeof_FLT = use_fp16_ ? 2 : 4;
-    size_t size = TILES_XY * CI_SLICES * 36 * sizeof_FLT;
-    size_t width = TILES_XY;
-    size_t height = CI_SLICES * 36;
+    size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT();
+    size_t width = TILES_XY_;
+    size_t height = CI_SLICES_ * 36;
     winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});
-    size = TILES_XY * CO_SLICES * 36 * sizeof_FLT;
-    width = TILES_XY;
-    height = CO_SLICES * 36;
+    size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT();
+    width = TILES_XY_;
+    height = CO_SLICES_ * 36;
     winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
   }
@@ -106,142 +113,177 @@ int ConvolutionOpenCLKernel::Init() {
   return RET_OK;
 }
-int ConvolutionOpenCLKernel::InitBuffer() {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  auto allocator = ocl_runtime->GetAllocator();
-  size_t sizeof_FLT = use_fp16_ ? 2 : 4;
+int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
+  constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
+                          0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
+                          0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
+  constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
+                         1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
+                         1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  size_t KH = param->kernel_h_;
-  size_t KW = param->kernel_w_;
-  constexpr size_t CI_TILE = C4NUM;
-  constexpr size_t CO_TILE = C4NUM;
-  size_t packed_weight_size;
-  if (use_winograd_) {
-    packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof_FLT;
+  auto weight_tensor = in_tensors_[1];
+  auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
+  auto origin_weight_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
+  std::function<float(int)> access_func;
+  if (weight_tensor->data_type() == kNumberTypeFloat32) {
+    access_func = [=](int idx) { return origin_weight_fp32[idx]; };
   } else {
-    packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof_FLT;
+    access_func = [=](int idx) { return static_cast<float>(origin_weight_fp16[idx]); };
   }
-  packed_weight_ = allocator->Malloc(packed_weight_size);
-  auto packed_weight_fp32 = reinterpret_cast<float *>(packed_weight_);
-  auto packed_weight_fp16 = reinterpret_cast<uint16_t *>(packed_weight_);
-  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
-  memset(packed_weight_, 0x00, packed_weight_size);
-  auto weight_tensor = in_tensors_[1];
-  auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->MutableData());
-  auto origin_weight_fp16 = reinterpret_cast<uint16_t *>(weight_tensor->MutableData());
-
-  if (use_winograd_) {
-    // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4
-    std::vector<float> encoded_weight(CO * 6 * 6 * CI);
-    std::vector<float> Gt = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
-                             0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
-                             0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
-
-    std::vector<float> G(Gt.size());
-    for (int y = 0; y < 3; ++y) {
-      for (int x = 0; x < 6; ++x) {
-        G[x * 3 + y] = Gt[y * 6 + x];
-      }
-    }
-    for (int co = 0; co < CO; ++co) {
-      for (int ci = 0; ci < CI; ++ci) {
-        std::vector<float> in_vals(9);
-        for (int kh = 0; kh < 3; ++kh) {
-          for (int kw = 0; kw < 3; ++kw) {
-            const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci;
-            if (use_fp16_) {
-              in_vals[kh * 3 + kw] = ShortToFloat32(origin_weight_fp16[f_index]);
-            } else {
-              in_vals[kh * 3 + kw] = origin_weight_fp32[f_index];
-            }
-          }
-        }
-
-        auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
-        auto out_vals = MatrixMultiply(temp_vals, Gt, 6, 3, 6);
-        for (int kh = 0; kh < 6; ++kh) {
-          for (int kw = 0; kw < 6; ++kw) {
-            const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci;
-            encoded_weight[f_index] = out_vals[kh * 6 + kw];
-          }
+  // OHWI -> O66I
+  std::vector<float> encoded_weight(CO_ * 6 * 6 * CI_);
+  for (int co = 0; co < CO_; ++co) {
+    for (int ci = 0; ci < CI_; ++ci) {
+      float in_vals[9];
+      for (int kh = 0; kh < 3; ++kh) {
+        for (int kw = 0; kw < 3; ++kw) {
+          const int f_index = ((co * 3 + kh) * 3 + kw) * CI_ + ci;
+          in_vals[kh * 3 + kw] = access_func(f_index);
        }
      }
-    }
-    for (int co = 0, src_idx = 0; co < CO; ++co) {
+      auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
+      auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6);
      for (int kh = 0; kh < 6; ++kh) {
        for (int kw = 0; kw < 6; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            int co_outer = co / 8;
-            int co_inner_group = co % 8 / 4;
-            int co_inner = co % 8 % 4;
-            int ci_outer = ci / 4;
-            int ci_inner = ci % 4;
-            size_t dst_idx =
-              (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) *
-                CO_TILE +
-              co_inner;
-            if (use_fp16_) {
-              packed_weight_fp16[dst_idx] = Float32ToShort(encoded_weight[src_idx++]);
-            } else {
-              packed_weight_fp32[dst_idx] = encoded_weight[src_idx++];
-            }
-          }
+          const int f_index = ((co * 6 + kh) * 6 + kw) * CI_ + ci;
+          encoded_weight[f_index] = out_vals[kh * 6 + kw];
        }
      }
    }
+  }
+
+  if (use_fp16_) {
+    OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
   } else {
-    // weight: OHWI -> O/4 H W I/4 I4 O4
-    for (int co = 0, src_idx = 0; co < CO; ++co) {
-      for (int kh = 0; kh < KH; ++kh) {
-        for (int kw = 0; kw < KW; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            auto co_outer = co / CO_TILE;
-            auto co_inner = co % CO_TILE;
-            auto ci_outer = ci / CI_TILE;
-            auto ci_inner = ci % CI_TILE;
-            size_t dst_idx =
-              ((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + co_inner;
-            if (use_fp16_) {
-              packed_weight_fp16[dst_idx] = origin_weight_fp16[src_idx++];
-            } else {
-              packed_weight_fp32[dst_idx] = origin_weight_fp32[src_idx++];
-            }
-          }
+    OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
+  }
+
+  return RET_OK;
+}
+
+template <typename SRC_T, typename DST_T>
+int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
+  auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
+  auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
+
+  // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
+  for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
+    for (size_t kh = 0; kh < KH; ++kh) {
+      for (size_t kw = 0; kw < KW; ++kw) {
+        for (size_t ci = 0; ci < CI_; ++ci) {
+          size_t co_outer = co / (CO_TILE * OGroup);
+          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
+          size_t co_inner = co % CO_TILE;
+          size_t ci_outer = ci / CI_TILE;
+          size_t ci_inner = ci % CI_TILE;
+          size_t dst_idx =
+            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
+              CO_TILE +
+            co_inner;
+          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
        }
      }
    }
  }
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::InitWeight() {
+  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
+
+  // allocate memory
+  size_t packed_weight_size;
+  if (use_winograd_) {
+    packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT();
+  } else {
+    packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT();
+  }
+  packed_weight_ = allocator->Malloc(packed_weight_size);
+  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+  memset(packed_weight_, 0x00, packed_weight_size);
+
+  // rearrange weight
+  if (use_winograd_) {
+    RearrangeWinogradWeight();
+  } else {
+    auto weight_tensor = in_tensors_[1];
+    if (weight_tensor->data_type() == kNumberTypeFloat16) {
+      if (use_fp16_) {
+        OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+      } else {
+        OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
+      }
+    } else {
+      if (use_fp16_) {
+        OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+      } else {
+        OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
+      }
+    }
+  }
+  allocator->UnmapBuffer(packed_weight_);
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::InitBias() {
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
   // align bias from C to C4
   auto bias_tensor = in_tensors_[2];
-  size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof_FLT;
+  size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT();
   packed_bias_ = allocator->Malloc(packed_bias_size);
+  allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
   memset(packed_bias_, 0x00, packed_bias_size);
-  memcpy(packed_bias_, bias_tensor->MutableData(), CO * sizeof_FLT);
+  if (bias_tensor->data_type() == kNumberTypeFloat16) {
+    if (use_fp16_) {
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    } else {
+      auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
+      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+      for (int i = 0; i < CO_; ++i) {
+        packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
+      }
+    }
+  } else {
+    if (use_fp16_) {
+      auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
+      auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
+      for (int i = 0; i < CO_; ++i) {
+        packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
+      }
+    } else {
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    }
+  }
   allocator->UnmapBuffer(packed_bias_);
+  return RET_OK;
+}
+int ConvolutionOpenCLKernel::InitBuffer() {
+  InitWeight();
+  InitBias();
   return RET_OK;
 }
 int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  if (in_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
-    if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
+  if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
+    if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
       {
-        im_dst_x = out_tensors_[0]->Width() * CO_SLICES;
+        im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
         im_dst_y = out_tensors_[0]->Height();
       }
     } else {
-      im_dst_x = out_tensors_[0]->Height() * CO_SLICES;
+      im_dst_x = out_tensors_[0]->Height() * CO_SLICES_;
       im_dst_y = out_tensors_[0]->Width();
     }
   } else {
-    im_dst_y = out_tensors_[0]->Height() * CO_SLICES;
+    im_dst_y = out_tensors_[0]->Height() * CO_SLICES_;
     im_dst_x = out_tensors_[0]->Width();
   }
   size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
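Reviewer note, not part of the patch: the branch in GetImageSize above packs an NHWC4 tensor into a 2D image and falls back to a transposed layout when the row would exceed the assumed maximum image2d width (65536). A minimal standalone C++ sketch of that selection follows; the helper name Nhwc4ImageSize is made up purely for illustration.

#include <cstddef>
#include <utility>

// Hypothetical helper mirroring GetImageSize's NHWC4 branch: returns the {x, y}
// extents of the image. 65536 stands in for the assumed maximum image2d width.
std::pair<size_t, size_t> Nhwc4ImageSize(size_t H, size_t W, size_t co_slices) {
  if (W * co_slices < 65536) {
    return {W * co_slices, H};  // x = W * CO_SLICES, y = H
  }
  return {H * co_slices, W};  // otherwise transpose: x = H * CO_SLICES, y = W
}

The same 65536 threshold is what CodeGenConvolutionNHWC4 and SetGlobalLocalConv key off when choosing the WRITE_IMAGE coordinate order and the local work-group width.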
@@ -259,52 +301,52 @@ int ConvolutionOpenCLKernel::Run() {
   int arg_cn = 0;
   if (use_winograd_) {
     arg_cn = 0;
-    cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES};
-    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape);
+    cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
+    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
     arg_cn = 0;
-    cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES};
-    cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES};
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape);
+    cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
+    cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);
     arg_cn = 0;
-    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
-    cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES};
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
+    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
+    cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
   } else {
     arg_cn = 0;
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    if (op_format_ == schema::Format::Format_NC4HW4) {
-      cl_int4 input_shape = {1, IH, IW, CI_SLICES};
-      cl_int4 output_shape = {1, OH, OW, CO_SLICES};
-      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, input_shape);
-      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, output_shape);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (op_format_ == Format_NC4HW4) {
+      cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
+      cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
+      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
+      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
     }
   }
   if (use_winograd_) {
-    ocl_runtime->RunKernel(kernel_4x4to36, {size_t(TILES_XY), 6, size_t(CI_SLICES)}, {8, 6, 4}, nullptr);
-    ocl_runtime->RunKernel(kernel_conv, {size_t(TILES_XY / 2), 36, size_t(CO_SLICES / 2)}, {8, 6, 2}, nullptr);
-    ocl_runtime->RunKernel(kernel_36to4x4, {size_t(TILES_XY), 4, size_t(CO_SLICES)}, {32, 4, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
+    ocl_runtime->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
   } else {
     std::vector<size_t> global, local;
     SetGlobalLocalConv(&global, &local);
-    ocl_runtime->RunKernel(kernel_conv, global, local, nullptr);
+    ocl_runtime->RunKernel(kernel_conv_, global, local, nullptr);
   }
   return RET_OK;
@@ -312,10 +354,8 @@ int ConvolutionOpenCLKernel::Run() {
 std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  const size_t CI_ALIGN = CI_SLICES * C4NUM;
-  const size_t CO_ALIGN = CO_SLICES * C4NUM;
-  const size_t KH = param->kernel_h_;
-  const size_t KW = param->kernel_w_;
+  const size_t CI_ALIGN = CI_SLICES_ * C4NUM;
+  const size_t CO_ALIGN = CO_SLICES_ * C4NUM;
   const size_t strideH = param->stride_h_;
   const size_t strideW = param->stride_w_;
   const size_t padTop = param->pad_u_;
@@ -327,21 +367,21 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   code += "#define CI_TILE 4\n";
   code += "#define CO_TILE 4\n\n";
   code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
-  code += "#define IH " + std::to_string(IH) + "\n";
-  code += "#define IW " + std::to_string(IW) + "\n";
+  code += "#define IH " + std::to_string(IH_) + "\n";
+  code += "#define IW " + std::to_string(IW_) + "\n";
   code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
-  code += "#define OH " + std::to_string(OH) + "\n";
-  code += "#define OW " + std::to_string(OW) + "\n";
-  code += "#define KH " + std::to_string(KH) + "\n";
-  code += "#define KW " + std::to_string(KW) + "\n";
+  code += "#define OH " + std::to_string(OH_) + "\n";
+  code += "#define OW " + std::to_string(OW_) + "\n";
+  code += "#define KH " + std::to_string(KH_) + "\n";
+  code += "#define KW " + std::to_string(KW_) + "\n";
code += "#define strideH " + std::to_string(strideH) + "\n"; code += "#define strideW " + std::to_string(strideW) + "\n"; code += "#define padTop " + std::to_string(padTop) + "\n"; code += "#define padBottom " + std::to_string(padBottom) + "\n"; code += "#define padLeft " + std::to_string(padLeft) + "\n"; code += "#define padRight " + std::to_string(padRight) + "\n"; - code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n"; - code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n"; + code += "#define CI_SLICES " + std::to_string(CI_SLICES_) + "\n"; + code += "#define CO_SLICES " + std::to_string(CO_SLICES_) + "\n\n"; if (use_fp16_) { code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; @@ -401,7 +441,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; } - if (OW * CO_SLICES < 65536) { + if (OW_ * CO_SLICES_ < 65536) { code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; } else { code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; @@ -411,8 +451,6 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { auto param = reinterpret_cast(op_parameter_); - const size_t KH = param->kernel_h_; - const size_t KW = param->kernel_w_; const size_t strideH = param->stride_h_; const size_t strideW = param->stride_w_; const size_t padTop = param->pad_u_; @@ -442,12 +480,12 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { " int CI_SLICES = input_shape.w;\n" " int CO_SLICES = output_shape.w;\n\n"; - code += " #define IH " + std::to_string(IH) + "\n"; - code += " #define IW " + std::to_string(IW) + "\n"; - code += " #define OH " + std::to_string(OH) + "\n"; - code += " #define OW " + std::to_string(OW) + "\n"; - code += " #define KH " + std::to_string(KH) + "\n"; - code += " #define KW " + std::to_string(KW) + "\n"; + code += " #define IH " + std::to_string(IH_) + "\n"; + code += " #define IW " + std::to_string(IW_) + "\n"; + code += " #define OH " + std::to_string(OH_) + "\n"; + code += " #define OW " + std::to_string(OW_) + "\n"; + code += " #define KH " + std::to_string(KH_) + "\n"; + code += " #define KW " + std::to_string(KW_) + "\n"; code += " #define strideH " + std::to_string(strideH) + "\n"; code += " #define strideW " + std::to_string(strideW) + "\n"; code += " #define padTop " + std::to_string(padTop) + "\n"; @@ -457,7 +495,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { " if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n" " return;\n\n"; - bool check_ow = (OW % 2) == 1; + bool check_ow = (OW_ % 2) == 1; if (check_ow) { code += " int last_is_double = 1;\n" @@ -607,12 +645,12 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { " {\n" " int y_idx = tile_y * 4 - PAD + y;\n"; - if (op_format_ == schema::Format::Format_NHWC4) { + if (op_format_ == Format_NHWC4) { code += " for (int x = 0; x < 6; x++)\n" " {\n" " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n"; - } else if (op_format_ == schema::Format::Format_NC4HW4) { + } else if (op_format_ == Format_NC4HW4) { code += " if(y_idx < 0 || y_idx >= IH)\n" " {\n" @@ -788,9 +826,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { " int tile_x = tile_xy % TILE_X * 4;\n" " int tile_y = tile_xy / TILE_X * 4;\n"; - if (op_format_ == schema::Format::Format_NHWC4) { + if (op_format_ 
     code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
-  } else if (op_format_ == schema::Format::Format_NC4HW4) {
+  } else if (op_format_ == Format_NC4HW4) {
     code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
   }
@@ -804,12 +842,12 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   constexpr size_t work_group_size[] = {4, 4, 1};
   auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
-  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())());
+  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime->Device())());
   const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
-  size_t global_h = UP_DIV(OH, work_group_size[0]) * work_group_size[0];
-  size_t global_w = UP_DIV(OW, work_group_size[1]) * work_group_size[1];
-  size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2];
+  size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0];
+  size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
+  size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
   size_t local_c = GetBiggestDivider(global_c, max_z_size);
   if (local_c == 0) {
@@ -823,22 +861,22 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
     local_h = global_h / 2;
   }
-  if (op_format_ == schema::Format::Format_NHWC4) {
-    if (OW * CO_SLICES > 65536) {
+  if (op_format_ == Format_NHWC4) {
+    if (OW_ * CO_SLICES_ > 65536) {
       local_w = 4;
     }
   }
   global->clear();
-  global->push_back(UP_DIV(OW, local_w) * local_w);
-  global->push_back(UP_DIV(OH, local_h) * local_h);
-  global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
+  global->push_back(UP_DIV(OW_, local_w) * local_w);
+  global->push_back(UP_DIV(OH_, local_h) * local_h);
+  global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c);
   local->clear();
   local->push_back(local_w);
   local->push_back(local_h);
   local->push_back(local_c);
-  if (op_format_ == schema::Format::Format_NC4HW4) {
+  if (op_format_ == Format_NC4HW4) {
     // calculate 2 FLT4 along width per work-item
     global->at(0) = UP_DIV(global->at(0), 2);
     if (local->at(0) > global->at(0)) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index 458fc8f16b..785e144336 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -42,27 +42,35 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  private:
   bool use_fp16_ = false;
-  int CI;
-  int IH;
-  int IW;
-  int CO;
-  int OH;
-  int OW;
-  int CI_SLICES;
-  int CO_SLICES;
+  int CI_{};
+  int IH_{};
+  int IW_{};
+  int CO_{};
+  int OH_{};
+  int OW_{};
+  int CI_SLICES_{};
+  int CO_SLICES_{};
+  int KH_{};
+  int KW_{};
   void *packed_weight_ = nullptr;
   void *packed_bias_ = nullptr;
   bool use_winograd_ = false;
-  int TILES_X;
-  int TILES_Y;
-  int TILES_XY;
+  int TILES_X_{};
+  int TILES_Y_{};
+  int TILES_XY_{};
   void *winograd_mem0_ = nullptr;
   void *winograd_mem1_ = nullptr;
-  cl::Kernel kernel_4x4to36;
-  cl::Kernel kernel_conv;
-  cl::Kernel kernel_36to4x4;
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+
+  int InitWeight();
+  int InitBias();
+  int RearrangeWinogradWeight();
+  template <typename SRC_T, typename DST_T>
+  int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
   std::string CodeGenConvolutionNHWC4();
   std::string CodeGenConvolutionNC4HW4();
@@ -72,16 +80,18 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   std::string CodeGenWinograd36To4x4();
   int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
+  size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
+
   bool UseWinograd4x4To6x6() {
     auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
     const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
                             param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
-    const bool channel_good = CI_SLICES >= 12 && CO_SLICES >= 12;
-    const bool hw_good = TILES_X * TILES_Y >= 16;
+    const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
+    const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
   }
-  std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
+  static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
     std::vector<float> C(M * K);
     for (int i = 0; i < M; ++i) {
       for (int j = 0; j < K; ++j) {
diff --git a/mindspore/lite/test/run_test.sh b/mindspore/lite/test/run_test.sh
index f048dbb364..7dd8e10497 100755
--- a/mindspore/lite/test/run_test.sh
+++ b/mindspore/lite/test/run_test.sh
@@ -28,3 +28,6 @@ cp -fr $TEST_DATA_DIR/testPK ./data
 ./lite-test --gtest_filter=TestDeconvInt8.*
 ./lite-test --gtest_filter="TestTfliteParser*"
+
+# for GPU OpenCL
+./lite-test --gtest_filter="TestConvolutionOpenCL.simple_test*"
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
index d15798dd16..e6f56178c7 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -21,19 +21,18 @@
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
 #include "nnacl/pack.h"
-#include "nnacl/fp32/common_func.h"
 using mindspore::kernel::ConvolutionOpenCLKernel;
 using mindspore::kernel::LiteKernel;
 using mindspore::kernel::SubGraphOpenCLKernel;
 using mindspore::lite::Tensor;
 using mindspore::schema::Format;
-using mindspore::schema::Format_KHWC;
-using mindspore::schema::Format_NC4HW4;
-using mindspore::schema::Format_NCHW;
-using mindspore::schema::Format_NHWC;
-using mindspore::schema::Format_NHWC4;
 using mindspore::schema::NodeType_ValueNode;
+using mindspore::schema::Format::Format_KHWC;
+using mindspore::schema::Format::Format_NC4HW4;
+using mindspore::schema::Format::Format_NCHW;
+using mindspore::schema::Format::Format_NHWC;
+using mindspore::schema::Format::Format_NHWC4;
 namespace mindspore {
@@ -41,26 +40,25 @@ class TestConvolutionOpenCL : public mindspore::CommonTest {};
 void LoadData(Tensor *tensor, const float *src) {
   if (tensor->data_type() == kNumberTypeFloat16) {
-    auto num = tensor->Size() / 2;
-    auto tensor_data = reinterpret_cast<uint16_t *>(tensor->MutableData());
+    auto num = tensor->Size() / sizeof(float16_t);
+    auto tensor_data = reinterpret_cast<float16_t *>(tensor->data_c());
     for (int i = 0; i < num; ++i) {
-      tensor_data[i] = Float32ToShort(src[i]);
+      tensor_data[i] = static_cast<float16_t>(src[i]);
     }
   } else {
-    memcpy(tensor->MutableData(), src, tensor->Size());
+    memcpy(tensor->data_c(), src, tensor->Size());
   }
 }
 void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
-  auto num = (output->data_type() == kNumberTypeFloat16) ? output->Size() / 2 : output->Size() / 4;
+  auto num = output->Size() / (output->data_type() == kNumberTypeFloat16 ? 2 : 4);
   std::vector<float> output_data(num);
   if (output->data_type() == kNumberTypeFloat16) {
-    auto output_data_fp16 = reinterpret_cast<uint16_t *>(output->MutableData());
     for (int i = 0; i < output_data.size(); ++i) {
-      output_data[i] = ShortToFloat32((output_data_fp16[i]));
+      output_data[i] = static_cast<float>(reinterpret_cast<float16_t *>(output->data_c())[i]);
     }
   } else {
-    memcpy(output_data.data(), output->MutableData(), output->Size());
+    memcpy(output_data.data(), output->data_c(), output->Size());
   }
   printf("output:");
@@ -69,9 +67,9 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
   }
   printf("\n");
-  float max_err = 0.0f;
+  float max_err = -1.0f;
   std::array<int, 5> idx_5d{};
-  int idx = -1;
+  int max_err_idx = -1, first_err_idx = -1;
   auto SLICES = UP_DIV(output->Channel(), 4);
   int I = 1, J = 1, K = 1, L = 1, M = 1;
   switch (output->GetFormat()) {
@@ -98,10 +96,13 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
       for (int l = 0; l < L; ++l) {
         for (int m = 0; m < M; ++m) {
           auto err = std::fabs(output_data[cn] - expect_data[cn]);
+          if (first_err_idx == -1 && max_err > atol) {
+            first_err_idx = cn;
+          }
           if (err > max_err) {
             max_err = err;
             idx_5d = {i, j, k, l, m};
-            idx = cn;
+            max_err_idx = cn;
           }
           cn++;
         }
       }
     }
   }
  }
-  float relative_err = max_err / std::fabs(std::max(expect_data[idx], output_data[idx]));
-  if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
-    printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
-  } else {
-    printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
-  }
-  printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[idx], output_data[idx],
-         max_err, relative_err * 100);
-
   if (max_err > atol) {
+    printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx],
+           output_data[first_err_idx]);
     FAIL();
   } else {
+    float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx]));
+    if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
+      printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
+    } else {
+      printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
+    }
+    printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx],
+           output_data[max_err_idx], max_err, relative_err * 100);
     printf("COMPARE SUCCESS!\n\n");
   }
 }
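Reviewer note, not part of the patch: the heart of InitWeight is the OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4 repack performed by OHWI2OHWIOGroupI4O4 (OGroup is 2 on the Winograd path and 1 otherwise, with CI_TILE = CO_TILE = 4). Below is a self-contained CPU-only sketch of the same index arithmetic, with the types fixed to float and a made-up function name PackOHWI used only for illustration.

#include <cstddef>
#include <vector>

// Illustration only: repacks an OHWI weight buffer into the layout consumed by the
// OpenCL kernels above. Mirrors the dst_idx computation in OHWI2OHWIOGroupI4O4.
std::vector<float> PackOHWI(const std::vector<float> &src, size_t CO, size_t KH, size_t KW, size_t CI,
                            size_t OGroup) {
  const size_t CI_TILE = 4, CO_TILE = 4;
  const size_t CI_SLICES = (CI + CI_TILE - 1) / CI_TILE;
  const size_t CO_GROUPS = (CO + CO_TILE * OGroup - 1) / (CO_TILE * OGroup);
  std::vector<float> dst(CO_GROUPS * KH * KW * CI_SLICES * OGroup * CI_TILE * CO_TILE, 0.0f);
  size_t src_idx = 0;
  for (size_t co = 0; co < CO; ++co) {
    for (size_t kh = 0; kh < KH; ++kh) {
      for (size_t kw = 0; kw < KW; ++kw) {
        for (size_t ci = 0; ci < CI; ++ci) {
          size_t co_outer = co / (CO_TILE * OGroup);              // which O/OGroup/4 block
          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;   // which group inside the block
          size_t co_inner = co % CO_TILE;                         // O4
          size_t ci_outer = ci / CI_TILE;                         // I/4
          size_t ci_inner = ci % CI_TILE;                         // I4
          size_t dst_idx =
            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
              CO_TILE +
            co_inner;
          dst[dst_idx] = src[src_idx++];
        }
      }
    }
  }
  return dst;
}

With OGroup = 2 and KH = KW = 6 the destination size equals the UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE allocation made in InitWeight for the Winograd-transformed weights; with OGroup = 1 it equals the plain CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE size.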