@@ -21,24 +21,31 @@
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
-#include "nnacl/fp32/common_func.h"

 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
+using mindspore::schema::Format::Format_NC4HW4;
+using mindspore::schema::Format::Format_NCHW;
+using mindspore::schema::Format::Format_NHWC;
+using mindspore::schema::Format::Format_NHWC4;

 namespace mindspore::kernel {

+constexpr size_t CI_TILE = C4NUM;
+constexpr size_t CO_TILE = C4NUM;
+
 int ConvolutionOpenCLKernel::Init() {
   static int init_count = 0;
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   std::set<std::string> build_options;
   init_count++;
   use_fp16_ = ocl_runtime->GetFp16Enable();

-  if (op_format_ != schema::Format::Format_NHWC4 && op_format_ != schema::Format::Format_NC4HW4) {
+  if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
     MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
   }
   in_ori_format_ = in_tensors_[0]->GetFormat();
@@ -46,19 +53,21 @@ int ConvolutionOpenCLKernel::Init() {
   in_tensors_[0]->SetFormat(op_format_);
   out_tensors_[0]->SetFormat(op_format_);

-  CI = in_tensors_[0]->Channel();
-  IH = in_tensors_[0]->Height();
-  IW = in_tensors_[0]->Width();
-  CO = out_tensors_[0]->Channel();
-  OH = out_tensors_[0]->Height();
-  OW = out_tensors_[0]->Width();
-  CI_SLICES = UP_DIV(CI, C4NUM);
-  CO_SLICES = UP_DIV(CO, C4NUM);
+  CI_ = in_tensors_[0]->Channel();
+  IH_ = in_tensors_[0]->Height();
+  IW_ = in_tensors_[0]->Width();
+  CO_ = out_tensors_[0]->Channel();
+  OH_ = out_tensors_[0]->Height();
+  OW_ = out_tensors_[0]->Width();
+  CI_SLICES_ = UP_DIV(CI_, C4NUM);
+  CO_SLICES_ = UP_DIV(CO_, C4NUM);
+  KH_ = param->kernel_h_;
+  KW_ = param->kernel_w_;

   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
-  TILES_X = UP_DIV(OW, 4);
-  TILES_Y = UP_DIV(OH, 4);
-  TILES_XY = TILES_X * TILES_Y;
+  TILES_X_ = UP_DIV(OW_, 4);
+  TILES_Y_ = UP_DIV(OH_, 4);
+  TILES_XY_ = TILES_X_ * TILES_Y_;
   use_winograd_ = UseWinograd4x4To6x6();

   // build kernel
@@ -67,36 +76,34 @@ int ConvolutionOpenCLKernel::Init() {
     std::string program_name;
     program_name = "Winograd4x4To36" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
-    ocl_runtime->BuildKernel(kernel_4x4to36, program_name, "Winograd4x4To36", build_options);
+    ocl_runtime->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);

     program_name = "WinogradConvolution" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
-    ocl_runtime->BuildKernel(kernel_conv, program_name, "WinogradConvolution", build_options);
+    ocl_runtime->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);

     program_name = "Winograd36To4x4" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
-    ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options);
+    ocl_runtime->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
   } else {
     std::string program_name = "convolution" + std::to_string(init_count);
-    std::string source =
-      op_format_ == schema::Format::Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
+    std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
     ocl_runtime->LoadSource(program_name, source);
-    ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options);
+    ocl_runtime->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
   }

   // allocate winograd memory
   if (use_winograd_) {
     size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-    size_t sizeof_FLT = use_fp16_ ? 2 : 4;

-    size_t size = TILES_XY * CI_SLICES * 36 * sizeof_FLT;
-    size_t width = TILES_XY;
-    size_t height = CI_SLICES * 36;
+    size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT();
+    size_t width = TILES_XY_;
+    size_t height = CI_SLICES_ * 36;
     winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});

-    size = TILES_XY * CO_SLICES * 36 * sizeof_FLT;
-    width = TILES_XY;
-    height = CO_SLICES * 36;
+    size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT();
+    width = TILES_XY_;
+    height = CO_SLICES_ * 36;
     winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
   }

@@ -106,142 +113,177 @@ int ConvolutionOpenCLKernel::Init() {
   return RET_OK;
 }

-int ConvolutionOpenCLKernel::InitBuffer() {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  auto allocator = ocl_runtime->GetAllocator();
-  size_t sizeof_FLT = use_fp16_ ? 2 : 4;
+int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
+  constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
+                          0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
+                          0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
+  constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
+                         1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
+                         1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};

-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  size_t KH = param->kernel_h_;
-  size_t KW = param->kernel_w_;
-  constexpr size_t CI_TILE = C4NUM;
-  constexpr size_t CO_TILE = C4NUM;
-  size_t packed_weight_size;
-  if (use_winograd_) {
-    packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof_FLT;
+  auto weight_tensor = in_tensors_[1];
+  auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
+  auto origin_weight_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
+  std::function<float(int)> access_func;
+  if (weight_tensor->data_type() == kNumberTypeFloat32) {
+    access_func = [=](int idx) { return origin_weight_fp32[idx]; };
   } else {
-    packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof_FLT;
+    access_func = [=](int idx) { return static_cast<float>(origin_weight_fp16[idx]); };
   }
-  packed_weight_ = allocator->Malloc(packed_weight_size);
-  auto packed_weight_fp32 = reinterpret_cast<float *>(packed_weight_);
-  auto packed_weight_fp16 = reinterpret_cast<uint16_t *>(packed_weight_);
-  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
-  memset(packed_weight_, 0x00, packed_weight_size);
-  auto weight_tensor = in_tensors_[1];
-  auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->MutableData());
-  auto origin_weight_fp16 = reinterpret_cast<uint16_t *>(weight_tensor->MutableData());

-  if (use_winograd_) {
-    // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4
-    std::vector<float> encoded_weight(CO * 6 * 6 * CI);
-    std::vector<float> Gt = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
-                             0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
-                             0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
-
-    std::vector<float> G(Gt.size());
-    for (int y = 0; y < 3; ++y) {
-      for (int x = 0; x < 6; ++x) {
-        G[x * 3 + y] = Gt[y * 6 + x];
-      }
-    }
-
-    for (int co = 0; co < CO; ++co) {
-      for (int ci = 0; ci < CI; ++ci) {
-        std::vector<float> in_vals(9);
-        for (int kh = 0; kh < 3; ++kh) {
-          for (int kw = 0; kw < 3; ++kw) {
-            const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci;
-            if (use_fp16_) {
-              in_vals[kh * 3 + kw] = ShortToFloat32(origin_weight_fp16[f_index]);
-            } else {
-              in_vals[kh * 3 + kw] = origin_weight_fp32[f_index];
-            }
-          }
-        }
-
-        auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
-        auto out_vals = MatrixMultiply(temp_vals, Gt, 6, 3, 6);
-        for (int kh = 0; kh < 6; ++kh) {
-          for (int kw = 0; kw < 6; ++kw) {
-            const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci;
-            encoded_weight[f_index] = out_vals[kh * 6 + kw];
-          }
+  // OHWI -> O66I
+  std::vector<float> encoded_weight(CO_ * 6 * 6 * CI_);
+  for (int co = 0; co < CO_; ++co) {
+    for (int ci = 0; ci < CI_; ++ci) {
+      float in_vals[9];
+      for (int kh = 0; kh < 3; ++kh) {
+        for (int kw = 0; kw < 3; ++kw) {
+          const int f_index = ((co * 3 + kh) * 3 + kw) * CI_ + ci;
+          in_vals[kh * 3 + kw] = access_func(f_index);
         }
       }
-    }

-    for (int co = 0, src_idx = 0; co < CO; ++co) {
+      auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
+      auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6);
       for (int kh = 0; kh < 6; ++kh) {
         for (int kw = 0; kw < 6; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            int co_outer = co / 8;
-            int co_inner_group = co % 8 / 4;
-            int co_inner = co % 8 % 4;
-            int ci_outer = ci / 4;
-            int ci_inner = ci % 4;
-            size_t dst_idx =
-              (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) *
-                CO_TILE +
-              co_inner;
-            if (use_fp16_) {
-              packed_weight_fp16[dst_idx] = Float32ToShort(encoded_weight[src_idx++]);
-            } else {
-              packed_weight_fp32[dst_idx] = encoded_weight[src_idx++];
-            }
-          }
-        }
+          const int f_index = ((co * 6 + kh) * 6 + kw) * CI_ + ci;
+          encoded_weight[f_index] = out_vals[kh * 6 + kw];
         }
       }
     }
+  }
+
+  if (use_fp16_) {
+    OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
   } else {
-    // weight: OHWI -> O/4 H W I/4 I4 O4
-    for (int co = 0, src_idx = 0; co < CO; ++co) {
-      for (int kh = 0; kh < KH; ++kh) {
-        for (int kw = 0; kw < KW; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            auto co_outer = co / CO_TILE;
-            auto co_inner = co % CO_TILE;
-            auto ci_outer = ci / CI_TILE;
-            auto ci_inner = ci % CI_TILE;
-            size_t dst_idx =
-              ((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + co_inner;
-            if (use_fp16_) {
-              packed_weight_fp16[dst_idx] = origin_weight_fp16[src_idx++];
-            } else {
-              packed_weight_fp32[dst_idx] = origin_weight_fp32[src_idx++];
-            }
-          }
-        }
+    OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
+  }
+
+  return RET_OK;
+}
+
+template <typename SRC_T, typename DST_T>
+int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
+  auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
+  auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
+
+  // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
+  for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
+    for (size_t kh = 0; kh < KH; ++kh) {
+      for (size_t kw = 0; kw < KW; ++kw) {
+        for (size_t ci = 0; ci < CI_; ++ci) {
+          size_t co_outer = co / (CO_TILE * OGroup);
+          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
+          size_t co_inner = co % CO_TILE;
+          size_t ci_outer = ci / CI_TILE;
+          size_t ci_inner = ci % CI_TILE;
+          size_t dst_idx =
+            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
+              CO_TILE +
+            co_inner;
+          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
         }
       }
     }
   }
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::InitWeight() {
+  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
+
+  // allocate memory
+  size_t packed_weight_size;
+  if (use_winograd_) {
+    packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT();
+  } else {
+    packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT();
+  }
+  packed_weight_ = allocator->Malloc(packed_weight_size);
+  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+  memset(packed_weight_, 0x00, packed_weight_size);
+
+  // rearrange weight
+  if (use_winograd_) {
+    RearrangeWinogradWeight();
+  } else {
+    auto weight_tensor = in_tensors_[1];
+    if (weight_tensor->data_type() == kNumberTypeFloat16) {
+      if (use_fp16_) {
+        OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+      } else {
+        OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
+      }
+    } else {
+      if (use_fp16_) {
+        OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+      } else {
+        OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
+      }
+    }
+  }
+
   allocator->UnmapBuffer(packed_weight_);
+  return RET_OK;
+}

+int ConvolutionOpenCLKernel::InitBias() {
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
+
   // align bias from C to C4
   auto bias_tensor = in_tensors_[2];
-  size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof_FLT;
+  size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT();
   packed_bias_ = allocator->Malloc(packed_bias_size);

   allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
   memset(packed_bias_, 0x00, packed_bias_size);
-  memcpy(packed_bias_, bias_tensor->MutableData(), CO * sizeof_FLT);
+  if (bias_tensor->data_type() == kNumberTypeFloat16) {
+    if (use_fp16_) {
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    } else {
+      auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
+      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+      for (int i = 0; i < CO_; ++i) {
+        packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
+      }
+    }
+  } else {
+    if (use_fp16_) {
+      auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
+      auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
+      for (int i = 0; i < CO_; ++i) {
+        packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
+      }
+    } else {
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    }
+  }
   allocator->UnmapBuffer(packed_bias_);
   return RET_OK;
 }

+int ConvolutionOpenCLKernel::InitBuffer() {
+  InitWeight();
+  InitBias();
+  return RET_OK;
+}
+
 int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  if (in_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
-    if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
+  if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
+    if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
       {
-        im_dst_x = out_tensors_[0]->Width() * CO_SLICES;
+        im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
         im_dst_y = out_tensors_[0]->Height();
       }
     } else {
-      im_dst_x = out_tensors_[0]->Height() * CO_SLICES;
+      im_dst_x = out_tensors_[0]->Height() * CO_SLICES_;
       im_dst_y = out_tensors_[0]->Width();
     }
   } else {
-    im_dst_y = out_tensors_[0]->Height() * CO_SLICES;
+    im_dst_y = out_tensors_[0]->Height() * CO_SLICES_;
     im_dst_x = out_tensors_[0]->Width();
   }
   size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
@@ -259,52 +301,52 @@ int ConvolutionOpenCLKernel::Run() {
   int arg_cn = 0;
   if (use_winograd_) {
     arg_cn = 0;
-    cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES};
-    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape);
+    cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
+    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);

     arg_cn = 0;
-    cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES};
-    cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES};
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape);
+    cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
+    cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);

     arg_cn = 0;
-    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
-    cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES};
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
+    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
+    cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
   } else {
     arg_cn = 0;
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    if (op_format_ == schema::Format::Format_NC4HW4) {
-      cl_int4 input_shape = {1, IH, IW, CI_SLICES};
-      cl_int4 output_shape = {1, OH, OW, CO_SLICES};
-      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, input_shape);
-      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, output_shape);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (op_format_ == Format_NC4HW4) {
+      cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
+      cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
+      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
+      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
     }
   }

   if (use_winograd_) {
-    ocl_runtime->RunKernel(kernel_4x4to36, {size_t(TILES_XY), 6, size_t(CI_SLICES)}, {8, 6, 4}, nullptr);
-    ocl_runtime->RunKernel(kernel_conv, {size_t(TILES_XY / 2), 36, size_t(CO_SLICES / 2)}, {8, 6, 2}, nullptr);
-    ocl_runtime->RunKernel(kernel_36to4x4, {size_t(TILES_XY), 4, size_t(CO_SLICES)}, {32, 4, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
+    ocl_runtime->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
   } else {
     std::vector<size_t> global, local;
     SetGlobalLocalConv(&global, &local);
-    ocl_runtime->RunKernel(kernel_conv, global, local, nullptr);
+    ocl_runtime->RunKernel(kernel_conv_, global, local, nullptr);
   }

   return RET_OK;
@@ -312,10 +354,8 @@ int ConvolutionOpenCLKernel::Run() {

 std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  const size_t CI_ALIGN = CI_SLICES * C4NUM;
-  const size_t CO_ALIGN = CO_SLICES * C4NUM;
-  const size_t KH = param->kernel_h_;
-  const size_t KW = param->kernel_w_;
+  const size_t CI_ALIGN = CI_SLICES_ * C4NUM;
+  const size_t CO_ALIGN = CO_SLICES_ * C4NUM;
   const size_t strideH = param->stride_h_;
   const size_t strideW = param->stride_w_;
   const size_t padTop = param->pad_u_;
@@ -327,21 +367,21 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   code += "#define CI_TILE 4\n";
   code += "#define CO_TILE 4\n\n";
   code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
-  code += "#define IH " + std::to_string(IH) + "\n";
-  code += "#define IW " + std::to_string(IW) + "\n";
+  code += "#define IH " + std::to_string(IH_) + "\n";
+  code += "#define IW " + std::to_string(IW_) + "\n";
   code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
-  code += "#define OH " + std::to_string(OH) + "\n";
-  code += "#define OW " + std::to_string(OW) + "\n";
-  code += "#define KH " + std::to_string(KH) + "\n";
-  code += "#define KW " + std::to_string(KW) + "\n";
+  code += "#define OH " + std::to_string(OH_) + "\n";
+  code += "#define OW " + std::to_string(OW_) + "\n";
+  code += "#define KH " + std::to_string(KH_) + "\n";
+  code += "#define KW " + std::to_string(KW_) + "\n";
   code += "#define strideH " + std::to_string(strideH) + "\n";
   code += "#define strideW " + std::to_string(strideW) + "\n";
   code += "#define padTop " + std::to_string(padTop) + "\n";
   code += "#define padBottom " + std::to_string(padBottom) + "\n";
   code += "#define padLeft " + std::to_string(padLeft) + "\n";
   code += "#define padRight " + std::to_string(padRight) + "\n";
-  code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n";
-  code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n";
+  code += "#define CI_SLICES " + std::to_string(CI_SLICES_) + "\n";
+  code += "#define CO_SLICES " + std::to_string(CO_SLICES_) + "\n\n";

   if (use_fp16_) {
     code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
@@ -401,7 +441,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
     code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }

-  if (OW * CO_SLICES < 65536) {
+  if (OW_ * CO_SLICES_ < 65536) {
     code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
   } else {
     code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
@@ -411,8 +451,6 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {

 std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  const size_t KH = param->kernel_h_;
-  const size_t KW = param->kernel_w_;
   const size_t strideH = param->stride_h_;
   const size_t strideW = param->stride_w_;
   const size_t padTop = param->pad_u_;
@@ -442,12 +480,12 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     " int CI_SLICES = input_shape.w;\n"
     " int CO_SLICES = output_shape.w;\n\n";

-  code += " #define IH " + std::to_string(IH) + "\n";
-  code += " #define IW " + std::to_string(IW) + "\n";
-  code += " #define OH " + std::to_string(OH) + "\n";
-  code += " #define OW " + std::to_string(OW) + "\n";
-  code += " #define KH " + std::to_string(KH) + "\n";
-  code += " #define KW " + std::to_string(KW) + "\n";
+  code += " #define IH " + std::to_string(IH_) + "\n";
+  code += " #define IW " + std::to_string(IW_) + "\n";
+  code += " #define OH " + std::to_string(OH_) + "\n";
+  code += " #define OW " + std::to_string(OW_) + "\n";
+  code += " #define KH " + std::to_string(KH_) + "\n";
+  code += " #define KW " + std::to_string(KW_) + "\n";
   code += " #define strideH " + std::to_string(strideH) + "\n";
   code += " #define strideW " + std::to_string(strideW) + "\n";
   code += " #define padTop " + std::to_string(padTop) + "\n";
@@ -457,7 +495,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
     " if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
     " return;\n\n";

-  bool check_ow = (OW % 2) == 1;
+  bool check_ow = (OW_ % 2) == 1;
   if (check_ow) {
     code +=
       " int last_is_double = 1;\n"
@@ -607,12 +645,12 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
     " {\n"
     " int y_idx = tile_y * 4 - PAD + y;\n";

-  if (op_format_ == schema::Format::Format_NHWC4) {
+  if (op_format_ == Format_NHWC4) {
     code +=
       " for (int x = 0; x < 6; x++)\n"
       " {\n"
       " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n";
-  } else if (op_format_ == schema::Format::Format_NC4HW4) {
+  } else if (op_format_ == Format_NC4HW4) {
     code +=
       " if(y_idx < 0 || y_idx >= IH)\n"
       " {\n"
@@ -788,9 +826,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
     " int tile_x = tile_xy % TILE_X * 4;\n"
     " int tile_y = tile_xy / TILE_X * 4;\n";

-  if (op_format_ == schema::Format::Format_NHWC4) {
+  if (op_format_ == Format_NHWC4) {
     code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
-  } else if (op_format_ == schema::Format::Format_NC4HW4) {
+  } else if (op_format_ == Format_NC4HW4) {
     code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
   }

@@ -804,12 +842,12 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   constexpr size_t work_group_size[] = {4, 4, 1};
   auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
-  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())());
+  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime->Device())());
   const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);

-  size_t global_h = UP_DIV(OH, work_group_size[0]) * work_group_size[0];
-  size_t global_w = UP_DIV(OW, work_group_size[1]) * work_group_size[1];
-  size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2];
+  size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0];
+  size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
+  size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];

   size_t local_c = GetBiggestDivider(global_c, max_z_size);
   if (local_c == 0) {
@@ -823,22 +861,22 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
     local_h = global_h / 2;
   }

-  if (op_format_ == schema::Format::Format_NHWC4) {
-    if (OW * CO_SLICES > 65536) {
+  if (op_format_ == Format_NHWC4) {
+    if (OW_ * CO_SLICES_ > 65536) {
       local_w = 4;
     }
   }

   global->clear();
-  global->push_back(UP_DIV(OW, local_w) * local_w);
-  global->push_back(UP_DIV(OH, local_h) * local_h);
-  global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
+  global->push_back(UP_DIV(OW_, local_w) * local_w);
+  global->push_back(UP_DIV(OH_, local_h) * local_h);
+  global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c);
   local->clear();
   local->push_back(local_w);
   local->push_back(local_h);
   local->push_back(local_c);

-  if (op_format_ == schema::Format::Format_NC4HW4) {
+  if (op_format_ == Format_NC4HW4) {
     // calculate 2 FLT4 along width per work-item
     global->at(0) = UP_DIV(global->at(0), 2);
     if (local->at(0) > global->at(0)) {