update lite gpu opencl convolution kernel

5 years ago · 04e4cba6f0
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
@@ -1,87 +1,191 @@
 #define CI_TILE 4
 #define CO_TILE 4
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
 //#define __global
 //#pragma OPENCL EXTENSION cl_arm_printf : enable
 // Direct 2D convolution over unpacked NHWC activations with OHWI weights.
 // One work-item accumulates CO_TILE (4) consecutive output channels for a
 // single output pixel, then adds the bias while storing the result.
 // Addressing uses only the H, W and C components of the shapes; the N
 // component is never read.
 //
 // NOTE(review): this listing carries two parallel sets of declarations for the
 // same quantities (a uint-based set reading weight_shape/stride and an
 // int-based set reading kernel_stride), including two parameter lists and
 // duplicated loop headers. They look like the before/after sides of a single
 // change -- confirm which side is meant to be kept before compiling.
 __kernel void convolution_NHWC_OHWI(__global float *input,
                                    __global float *weight,
                                    __global float *bias,
                                    __global float *output,
                                    const uint4 input_shape,  // NHWC
                                    const uint4 weight_shape, // OHWI
                                    const uint4 output_shape, // NHWC
                                    const uint2 stride,       // HW
                                    const uint4 pad)          // top bottom left right
                                    const int4 input_shape,   // NHWC
                                    const int4 output_shape,  // NHWC
                                    const int4 kernel_stride, // kernelHW_strideHW
                                    const int4 pad)           // top bottom left right
 {
    // Work-item coordinates: dim 0 -> output column, dim 1 -> output row,
    // dim 2 -> index of the 4-channel output slice.
    uint ow = get_global_id(0);
    uint oh = get_global_id(1);
    uint co_outer = get_global_id(2);
    int ow = get_global_id(0);
    int oh = get_global_id(1);
    int co_slice = get_global_id(2);
    int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
    int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
    int KH = kernel_stride.x, KW = kernel_stride.y;
    int strideH = kernel_stride.z, strideW = kernel_stride.w;
    int padTop = pad.x, padLeft = pad.z;
    int CI_SLICES = UP_DIV(CI, CI_TILE);
    int CO_SLICES = UP_DIV(CO, CO_TILE);
    uint CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
    uint CO = output_shape.w, OW = output_shape.z;
    uint KH = weight_shape.y, KW = weight_shape.z;
    uint stride_h = stride.x, stride_w = stride.y;
    uint pad_top = pad.x, pad_left = pad.z;
    uint CI_TILE_NUM = UP_DIV(CI, CI_TILE);
    uint CO_TILE_NUM = UP_DIV(CO, CO_TILE);
    // Work-items that fall outside the output volume have nothing to do.
    if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)
        return;
    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
    for (uint kh = 0; kh < KH; ++kh)
    for (int kh = 0; kh < KH; ++kh)
    {
        // Input row addressed by this kernel row; may lie in the padding.
        uint ih = kh + oh * stride_h - pad_top;
        for (uint kw = 0; kw < KW; ++kw)
        int ih = kh + oh * strideH - padTop;
        for (int kw = 0; kw < KW; ++kw)
        {
            // Input column addressed by this kernel column; may lie in the padding.
            uint iw = kw + ow * stride_w - pad_left;
            for (uint ci_outer = 0; ci_outer < CI_TILE_NUM; ++ci_outer)
            int iw = kw + ow * strideW - padLeft;
            for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice)
            {
                for (uint ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
                for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
                {
                    uint ci = ci_outer * CI_TILE + ci_inner;
                    int ci = ci_slice * CI_TILE + ci_inner;
                    // Stop at the real channel count when CI is not a multiple of CI_TILE.
                    if (ci >= CI)
                        break;
                    uint input_idx = ih * IW * CI + iw * CI + ci;
                    int input_idx = ih * IW * CI + iw * CI + ci;
                    // Taps that land outside the input image contribute zero.
                    float value = 0;
                    if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
                        value = 0;
                    else
                        value = input[input_idx];
                    // Element distance between the filters of two adjacent output channels.
                    uint CO_TILE_OFFSET = KH * KW * CI;
                    uint weight_idx = (co_outer * CO_TILE) * CO_TILE_OFFSET +
                                      kh * KW * CI +
                                      kw * CI +
                                      ci;
                    acc.x += weight[weight_idx + 0 * CO_TILE_OFFSET] * value;
                    acc.y += weight[weight_idx + 1 * CO_TILE_OFFSET] * value;
                    acc.z += weight[weight_idx + 2 * CO_TILE_OFFSET] * value;
                    acc.w += weight[weight_idx + 3 * CO_TILE_OFFSET] * value;
                    // Element distance between the filters of two adjacent output channels.
                    int CO_OFFSET = KH * KW * CI;
                    int weight_idx = (co_slice * CO_TILE) * CO_OFFSET +
                                     kh * KW * CI +
                                     kw * CI +
                                     ci;
                    // NOTE(review): all four lanes are read unconditionally. In the last
                    // slice of a CO that is not a multiple of CO_TILE, the upper lanes
                    // index filters at or beyond channel CO -- confirm the weight buffer
                    // is large enough for these reads.
                    acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
                    acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
                    acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
                    acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
                }
            }
        }
    }
    // Store only the lanes that map to real output channels: a full tile unless
    // this is the last slice and CO is not a multiple of CO_TILE.
    uint output_idx = oh * OW * CO + ow * CO + (co_outer * CO_TILE);
    if (co_outer < CO_TILE_NUM - 1 || CO % CO_TILE == 0)
    int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
    if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0)
    {
        output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
        output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1];
        output[output_idx + 2] = acc.z + bias[co_outer * CO_TILE + 2];
        output[output_idx + 3] = acc.w + bias[co_outer * CO_TILE + 3];
        output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
        output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
        output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
        output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
    }
    else if (CO % CO_TILE == 1)
    {
        output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
        output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
    }
    else if (CO % CO_TILE == 2)
    {
        output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
        output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1];
        output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
        output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
    }
    else if (CO % CO_TILE == 3)
    {
        output[output_idx + 0] = acc.x + bias[co_outer * CO_TILE + 0];
        output[output_idx + 1] = acc.y + bias[co_outer * CO_TILE + 1];
        output[output_idx + 2] = acc.z + bias[co_outer * CO_TILE + 2];
        output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
        output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
        output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
    }
 }
 //#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 //#define FLT4 half4
 #define FLT4 float4
 // Convolution over channel-packed buffers: activations, bias and output are
 // addressed as FLT4 (one vector per 4-channel slice) and the weights are laid
 // out so that each (kh, kw, ci_slice) tap of an output slice is four
 // consecutive FLT4 values, one per input-channel lane.
 //
 // Each work-item produces up to two adjacent output slices (8 channels) for one
 // output pixel. When the slice count is odd, the last work-item along
 // dimension 2 produces only one slice.
 //
 // input_shape / output_shape : NHWC (only H, W and C are read)
 // kernel_stride              : (KH, KW, strideH, strideW)
 // pad                        : (top, bottom, left, right); only top/left are read
 __kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input,
                                               __global FLT4 *weight,
                                               __global FLT4 *bias,
                                               __global FLT4 *output,
                                               const int4 input_shape,   // NHWC
                                               const int4 output_shape,  // NHWC
                                               const int4 kernel_stride, // kernelHW_strideHW
                                               const int4 pad)           // top bottom left right
 {
    // dim 0 -> output row, dim 1 -> output column, dim 2 -> pair of output slices.
    const int out_row = get_global_id(0);
    const int out_col = get_global_id(1);
    const int slice_pair = get_global_id(2);

    const int IH = input_shape.y, IW = input_shape.z, CI = input_shape.w;
    const int OH = output_shape.y, OW = output_shape.z, CO = output_shape.w;
    const int KH = kernel_stride.x, KW = kernel_stride.y;
    const int strideH = kernel_stride.z, strideW = kernel_stride.w;
    const int padTop = pad.x, padLeft = pad.z;
    const int CI_SLICES = UP_DIV(CI, CI_TILE);
    const int CO_SLICES = UP_DIV(CO, CO_TILE);

    // The two output slices owned by this work-item.
    const int slice0 = 2 * slice_pair;
    const int slice1 = slice0 + 1;
    if (out_row >= OH || out_col >= OW || slice0 >= CO_SLICES)
        return;

    // FLT4 index of this pixel's first slice in the output buffer.
    const int out_base = out_row * OW * CO_SLICES + out_col * CO_SLICES;

    if (slice1 >= CO_SLICES)
    {
        // Odd slice count: only slice0 exists for this work-item.
        FLT4 acc0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        __global FLT4 *w0_base = weight + slice0 * KH * KW * CI_SLICES * CI_TILE;
        for (int kh = 0; kh < KH; ++kh)
        {
            const int ih = kh + out_row * strideH - padTop;
            if (ih < 0 || ih >= IH)
                continue;  // whole kernel row lies in the padding
            for (int kw = 0; kw < KW; ++kw)
            {
                const int iw = kw + out_col * strideW - padLeft;
                if (iw < 0 || iw >= IW)
                    continue;  // tap lies in the padding
                __global FLT4 *in_px = input + (ih * IW * CI_SLICES + iw * CI_SLICES);
                // Every tap owns 4 * CI_SLICES consecutive FLT4 weights.
                __global FLT4 *w0 = w0_base + (kh * KW + kw) * CI_SLICES * 4;
                for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice)
                {
                    const FLT4 in_c4 = in_px[ci_slice];
                    acc0 += w0[0] * in_c4.x;
                    acc0 += w0[1] * in_c4.y;
                    acc0 += w0[2] * in_c4.z;
                    acc0 += w0[3] * in_c4.w;
                    w0 += 4;
                }
            }
        }
        output[out_base + slice0] = acc0 + bias[slice0];
    }
    else
    {
        // Both slices exist: share each input load between the two accumulators.
        FLT4 acc0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        FLT4 acc1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        __global FLT4 *w0_base = weight + slice0 * KH * KW * CI_SLICES * CI_TILE;
        __global FLT4 *w1_base = weight + slice1 * KH * KW * CI_SLICES * CI_TILE;
        for (int kh = 0; kh < KH; ++kh)
        {
            const int ih = kh + out_row * strideH - padTop;
            if (ih < 0 || ih >= IH)
                continue;  // whole kernel row lies in the padding
            for (int kw = 0; kw < KW; ++kw)
            {
                const int iw = kw + out_col * strideW - padLeft;
                if (iw < 0 || iw >= IW)
                    continue;  // tap lies in the padding
                __global FLT4 *in_px = input + (ih * IW * CI_SLICES + iw * CI_SLICES);
                // Every tap owns 4 * CI_SLICES consecutive FLT4 weights per slice.
                const int tap_offset = (kh * KW + kw) * CI_SLICES * 4;
                __global FLT4 *w0 = w0_base + tap_offset;
                __global FLT4 *w1 = w1_base + tap_offset;
                for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice)
                {
                    const FLT4 in_c4 = in_px[ci_slice];
                    acc0 += w0[0] * in_c4.x;
                    acc0 += w0[1] * in_c4.y;
                    acc0 += w0[2] * in_c4.z;
                    acc0 += w0[3] * in_c4.w;
                    w0 += 4;
                    acc1 += w1[0] * in_c4.x;
                    acc1 += w1[1] * in_c4.y;
                    acc1 += w1[2] * in_c4.z;
                    acc1 += w1[3] * in_c4.w;
                    w1 += 4;
                }
            }
        }
        output[out_base + slice0] = acc0 + bias[slice0];
        output[out_base + slice1] = acc1 + bias[slice1];
    }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -14,16 +14,12 @@
 * limitations under the License.
 */
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
 #include <vector>
 #include <string>
 #include <set>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #ifndef PROGRAM_WITH_IL
 #include <algorithm>
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
 #include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
 #endif
 #include "src/kernel_registry.h"
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -38,27 +34,27 @@ int ConvolutionOpenCLKernel::Init() {
    MS_LOG(ERROR) << "ConvolutionOpenCLKernel only support Batch=1!";
  }
  outputs_[0]->SetFormat(schema::Format_NHWC4);
  io_dataformat_ = outputs_[0]->GetFormat();
  auto io_NHWC = inputs_[0]->GetFormat() == schema::Format_NHWC && outputs_[0]->GetFormat() == schema::Format_NHWC;
  auto io_NHWC4 = inputs_[0]->GetFormat() == schema::Format_NHWC4 && outputs_[0]->GetFormat() == schema::Format_NHWC4;
  if (!io_NHWC && !io_NHWC4) {
    MS_LOG(ERROR) << "input and output data_format is invalid!";
  }
  io_dataformat_ = inputs_[0]->GetFormat();
  if (inputs_[1]->GetFormat() != schema::Format_KHWC) {
    MS_LOG(ERROR) << "weight data_format is invalid!";
  }
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  std::string kernel_name = "convolution_NHWC_OHWI";
 #ifdef PROGRAM_WITH_IL
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  std::set<std::string> build_options;
  std::string source = convolution_source_fp32;
  std::string program_name = "convolution";
  std::string kernel_name = io_NHWC4 ? "convolution_NHWC4_OHWIIO_float8" : "convolution_NHWC_OHWI";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  this->InitBuffer();
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return 0;
 }
 int ConvolutionOpenCLKernel::InitBuffer() {
@@ -78,35 +74,41 @@ int ConvolutionOpenCLKernel::InitBuffer() {
    memcpy_s(packed_bias_, bias_tensor->Size(), bias_tensor->Data(), bias_tensor->Size());
    allocator->UnmapBuffer(packed_bias_);
  } else if (io_dataformat_ == schema::Format_NHWC4) {
    // OHWI -> OHWIIO
    auto weight_shape = weight_tensor->shape();
    size_t CO = weight_shape[0];
    size_t KH = weight_shape[1];
    size_t KW = weight_shape[2];
    size_t CI = weight_shape[3];
    size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
    size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
    size_t weight_size_tiled = CO_ALIGN * KH * KW * CI_ALIGN * sizeof(float);
    size_t CI_SLICES = UP_DIV(CI, C4NUM);
    size_t CO_SLICES = UP_DIV(CO, C4NUM);
    constexpr size_t CI_TILE = C4NUM;
    constexpr size_t CO_TILE = C4NUM;
    size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float);
    packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(weight_size_tiled));
    packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
    packed_weight_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true));
    memset_s(packed_weight_, weight_size_tiled, 0x00, weight_size_tiled);
    memset_s(packed_weight_, packed_weight_size, 0x00, packed_weight_size);
    auto weight_data = reinterpret_cast<float *>(weight_tensor->Data());
    for (int co = 0; co < CO; ++co) {
      for (int kh = 0; kh < KH; ++kh) {
        for (int kw = 0; kw < KW; ++kw) {
          for (int ci = 0; ci < CI; ++ci) {
            packed_weight_[co * KH * KW * CI_ALIGN + kh * KW * CI_ALIGN + kw * CI_ALIGN + ci] =
              weight_data[co * KH * KW * CI + kh * KW * CI + kw * CI + ci];
            auto co_outer = co / CO_TILE;
            auto co_inner = co % CO_TILE;
            auto ci_outer = ci / CI_TILE;
            auto ci_inner = ci % CI_TILE;
            packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE +
                           co_inner] = *(weight_data++);
          }
        }
      }
    }
    allocator->UnmapBuffer(packed_weight_);
    size_t bias_size_tiled = CO_ALIGN * sizeof(float);
    packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(bias_size_tiled));
    size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float);
    packed_bias_ = reinterpret_cast<float *>(allocator->Malloc(packed_bias_size));
    packed_bias_ = reinterpret_cast<float *>(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true));
    memset_s(packed_bias_, bias_size_tiled, 0x00, bias_size_tiled);
    memset_s(packed_bias_, packed_bias_size, 0x00, packed_bias_size);
    auto bias_data = reinterpret_cast<float *>(bias_tensor->Data());
    for (int co = 0; co < CO; ++co) {
      packed_bias_[co] = bias_data[co];
@@ -115,47 +117,80 @@ int ConvolutionOpenCLKernel::InitBuffer() {
  }
  return 0;
 }
 }  // namespace mindspore::kernel
 // Resize hook: performs no work and always returns 0.
 int ConvolutionOpenCLKernel::ReSize() {
   return 0;
 }
 // Returns the largest integer in [1, y] that divides x without remainder.
 //
 // x - value to be divided.
 // y - upper bound for the candidate divisor.
 //
 // Falls back to 1 when y is not positive. The search only walks positive
 // candidates, so a negative y can no longer produce a negative result or count
 // downward until the signed counter overflows.
 static int GetBiggestDivider(int x, int y) {
  for (int i = y; i > 0; i--) {
    if (x % i == 0) {
      return i;
    }
  }
  return 1;
 }
 // Derives the global and local work sizes used to launch the convolution kernel.
 //
 // param  - convolution parameters; output_h_, output_w_ and output_channel_ are read.
 // global - out: replaced with the {H, W, C} global sizes, each rounded up to a
 //          multiple of the matching local size.
 // local  - out: replaced with the {H, W, C} work-group sizes.
 static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
  // NOTE(review): these limits are hard-coded; presumably stand-ins for values a
  // device query would report -- confirm.
  constexpr size_t kAlign[] = {4, 4, 1};
  constexpr size_t kMaxItemSizes[] = {512, 512, 512};
  constexpr size_t kMaxGroupSize = 512;
  const size_t z_cap = std::min<size_t>(16, kMaxItemSizes[2]);

  // Seed the extents from OH, OW and CO_SLICES, rounded up to the 4x4x1 alignment.
  const size_t aligned_h = UP_DIV(param->output_h_, kAlign[0]) * kAlign[0];
  const size_t aligned_w = UP_DIV(param->output_w_, kAlign[1]) * kAlign[1];
  const size_t aligned_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), kAlign[2]) * kAlign[2];

  // Pick the work-group shape: channels first, then width, then height with
  // whatever is left of the per-group budget.
  const size_t group_c = GetBiggestDivider(aligned_c, z_cap);
  const size_t hw_budget = std::min<size_t>(256, kMaxGroupSize) / group_c;
  const size_t group_w = std::min(aligned_w, hw_budget);
  size_t group_h = std::min(hw_budget / group_w, aligned_h);
  // If a single group would span the whole (even) height, split it in two.
  const bool spans_full_height = (group_h == aligned_h);
  if (spans_full_height && aligned_h % 2 == 0) {
    group_h = aligned_h / 2;
  }

  // Global sizes are the real output extents rounded up to whole work-groups.
  const size_t padded_h = UP_DIV(param->output_h_, group_h) * group_h;
  const size_t padded_w = UP_DIV(param->output_w_, group_w) * group_w;
  const size_t padded_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), group_c) * group_c;
  global->assign({padded_h, padded_w, padded_c});
  local->assign({group_h, group_w, group_c});
 }
 int ConvolutionOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->Name() << " Running!";
  MS_LOG(INFO) << "ConvolutionOpenCLKernel::Run()";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto param = reinterpret_cast<ConvParameter *>(opParameter);
  auto input0_shape = inputs_[0]->shape();   // NHWC
  auto input1_shape = inputs_[1]->shape();   // OHWI
  auto outpu0_shape = outputs_[0]->shape();  // NHWC
  cl_uint N = input0_shape[0];
  cl_uint CI = input0_shape[3];
  cl_uint IH = input0_shape[1];
  cl_uint IW = input0_shape[2];
  cl_uint CO = outpu0_shape[3];
  cl_uint OH = outpu0_shape[1];
  cl_uint OW = outpu0_shape[2];
  cl_uint KH = input1_shape[1];
  cl_uint KW = input1_shape[2];
  cl_uint CI_TILE_NUM = UP_DIV(CI, C4NUM);
  cl_uint CO_TILE_NUM = UP_DIV(CO, C4NUM);
  cl_uint CI_ALIGN = CI_TILE_NUM * C4NUM;
  cl_uint CO_ALIGN = CO_TILE_NUM * C4NUM;
  cl_uint4 input_shape;
  cl_uint4 weight_shape;
  cl_uint4 output_shape;
  cl_int N = input0_shape[0];
  cl_int CI = input0_shape[3];
  cl_int IH = input0_shape[1];
  cl_int IW = input0_shape[2];
  cl_int CO = outpu0_shape[3];
  cl_int OH = outpu0_shape[1];
  cl_int OW = outpu0_shape[2];
  cl_int KH = input1_shape[1];
  cl_int KW = input1_shape[2];
  cl_int CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM;
  cl_int CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM;
  cl_int4 input_shape;
  cl_int4 output_shape;
  if (io_dataformat_ == schema::Format_NHWC) {
    input_shape = {N, IH, IW, CI};
    weight_shape = {CO, KH, KW, CI};
    output_shape = {N, OH, OW, CO};
  } else if (io_dataformat_ == schema::Format_NHWC4) {
    input_shape = {N, IH, IW, CI_ALIGN};
    weight_shape = {CO_ALIGN, KH, KW, CI_ALIGN};
    output_shape = {N, OH, OW, CO_ALIGN};
  }
  cl_uint2 stride = {static_cast<cl_uint>(param->stride_h_), static_cast<cl_uint>(param->stride_w_)};
  cl_uint4 pad = {static_cast<cl_uint>(param->pad_u_), static_cast<cl_uint>(param->pad_d_),
                  static_cast<cl_uint>(param->pad_l_), static_cast<cl_uint>(param->pad_r_)};
  cl_int4 kernel_stride = {KH, KW, param->stride_h_, param->stride_w_};
  cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
  int arg_cn = 0;
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, inputs_[0]->Data());
@@ -163,14 +198,19 @@ int ConvolutionOpenCLKernel::Run() {
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, outputs_[0]->Data());
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, input_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, weight_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, output_shape);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, stride);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, kernel_stride);
  ocl_runtime->SetKernelArg(kernel_, arg_cn++, pad);
  std::vector<size_t> global = {OW, OH, CO_TILE_NUM};
  std::vector<size_t> local = {1, 1, CO_TILE_NUM};
  std::vector<size_t> global;
  std::vector<size_t> local;
  GetLocalSize(reinterpret_cast<ConvParameter *>(this->opParameter), &global, &local);
  // float8 per thread
  if (io_dataformat_ == schema::Format_NHWC4) {
    local[2] = UP_DIV(local[2], 2);
    global[2] = UP_DIV(global[2], 2);
    global[2] = UP_DIV(global[2], global[2]) * global[2];
  }
  ocl_runtime->RunKernel(kernel_, global, local, nullptr);
  return 0;
@@ -196,4 +236,3 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso
 REG_KERNEL(kGPU, PrimitiveType_Conv2D, OpenCLConvolutionKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -14,11 +14,13 @@
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
 #define MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
 #include <vector>
 #include "src/runtime/kernel/arm/fp32/convolution.h"
 #include "src/ir/tensor.h"
 #include "src/lite_kernel.h"
 #include "schema/model_generated.h"
 #include "src/runtime/opencl/opencl_runtime.h"
 #include "src/runtime/kernel/arm/opclib/conv_parameter.h"
@@ -44,5 +46,4 @@ class ConvolutionOpenCLKernel : public LiteKernel {
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_