| @@ -54,6 +54,70 @@ __kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write | |||||
| } | } | ||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | ||||
| } | } | ||||
| __kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| FLT4 data = (FLT4)(0.f); | |||||
| __global float *src_addr = (__global float *)src_data; | |||||
| __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global float *src_addr_3 = src_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| data.x = src_addr_0[0]; | |||||
| data.y = src_addr_1[0]; | |||||
| data.z = src_addr_2[0]; | |||||
| data.w = src_addr_3[0]; | |||||
| } else { | |||||
| if ((shape.w - Z * 4) >= 1) { | |||||
| data.x = src_addr_0[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 2) { | |||||
| data.y = src_addr_1[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 3) { | |||||
| data.z = src_addr_2[0]; | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | |||||
| } | |||||
| __kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| FLT4 data = (FLT4)(0.f); | |||||
| __global half *src_addr = (__global half *)src_data; | |||||
| __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global half *src_addr_3 = src_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| data.x = src_addr_0[0]; | |||||
| data.y = src_addr_1[0]; | |||||
| data.z = src_addr_2[0]; | |||||
| data.w = src_addr_3[0]; | |||||
| } else { | |||||
| if ((shape.w - Z * 4) >= 1) { | |||||
| data.x = src_addr_0[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 2) { | |||||
| data.y = src_addr_1[0]; | |||||
| } | |||||
| if ((shape.w - Z * 4) >= 3) { | |||||
| data.z = src_addr_2[0]; | |||||
| } | |||||
| } | |||||
| WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); | |||||
| } | |||||
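
The two NCHW → NHWC4 kernels above gather up to four channel values per output texel; in NCHW those values sit a full `shape.y * shape.z` (H·W) plane apart, which is why each lane is loaded through its own strided pointer rather than one contiguous vector reinterpret. Below is a minimal host-side sketch of the same index math (names and the `main` data are illustrative, not part of the patch), assuming `shape` carries (N, H, W, C) with N = 1:

```cpp
#include <cstdio>
#include <vector>

// Reference NCHW -> NHWC4 repack mirroring the kernel's addressing.
std::vector<float> NchwToNhwc4(const std::vector<float> &src, int H, int W, int C) {
  const int C4 = (C + 3) / 4;                    // UP_DIV(C, 4): one texel per 4 channels
  std::vector<float> dst(H * W * C4 * 4, 0.0f);  // tail channels stay zero-padded
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int c = 0; c < C; ++c)
        // NCHW source: (c * H + h) * W + w  -- the kernel's src_addr_N offsets.
        // NHWC4 destination: ((h * W + w) * C4 + c / 4) * 4 + c % 4.
        dst[((h * W + w) * C4 + c / 4) * 4 + c % 4] = src[(c * H + h) * W + w];
  return dst;
}

int main() {
  // The 1x2x2x2 NCHW tensor {0,2,4,6,1,3,5,7} equals NHWC {0,1,...,7}.
  std::vector<float> nchw = {0, 2, 4, 6, 1, 3, 5, 7};
  for (float v : NchwToNhwc4(nchw, 2, 2, 2)) printf("%g ", v);
  printf("\n");  // 0 1 0 0 2 3 0 0 4 5 0 0 6 7 0 0
}
```

The printed buffer matches the padded input used by `simple_test0_NHWC4_and_NC4HW4` further down.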
| __kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, | __kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, | ||||
| int4 shape) { | int4 shape) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| @@ -198,6 +262,68 @@ __kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| __kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| __global float *dst_addr = (__global float *)dst_data; | |||||
| __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global float *dst_addr_3 = dst_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| dst_addr_0[0] = data.x; | |||||
| dst_addr_1[0] = data.y; | |||||
| dst_addr_2[0] = data.z; | |||||
| dst_addr_3[0] = data.w; | |||||
| } else { | |||||
| if (shape.w - Z * 4 >= 1) { | |||||
| dst_addr_0[0] = data.x; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 2) { | |||||
| dst_addr_1[0] = data.y; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 3) { | |||||
| dst_addr_2[0] = data.z; | |||||
| } | |||||
| } | |||||
| } | |||||
| __kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, | |||||
| int4 shape) { | |||||
| int X = get_global_id(0); | |||||
| int Y = get_global_id(1); | |||||
| int Z = get_global_id(2); | |||||
| if (X >= size.x || Y >= size.y || Z >= size.z) { | |||||
| return; | |||||
| } | |||||
| half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); | |||||
| __global half *dst_addr = (__global half *)dst_data; | |||||
| __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; | |||||
| __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; | |||||
| __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; | |||||
| __global half *dst_addr_3 = dst_addr + ((Z * 4 + 3) * shape.y + X) * shape.z + Y; | |||||
| if ((Z + 1) * 4 <= shape.w) { | |||||
| dst_addr_0[0] = data.x; | |||||
| dst_addr_1[0] = data.y; | |||||
| dst_addr_2[0] = data.z; | |||||
| dst_addr_3[0] = data.w; | |||||
| } else { | |||||
| if (shape.w - Z * 4 >= 1) { | |||||
| dst_addr_0[0] = data.x; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 2) { | |||||
| dst_addr_1[0] = data.y; | |||||
| } | |||||
| if (shape.w - Z * 4 >= 3) { | |||||
| dst_addr_2[0] = data.z; | |||||
| } | |||||
| } | |||||
| } | |||||
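
The buffer kernels run the same addressing in reverse: each texel read from the image is scattered to four NCHW channel planes, and the `shape.w - Z * 4` remainder guard keeps the zero-padded lanes of the last slice from spilling past the real channel count. A hypothetical inverse of the sketch above:

```cpp
#include <vector>

// Reference NHWC4 -> NCHW scatter; remain plays the role of shape.w - Z * 4.
std::vector<float> Nhwc4ToNchw(const std::vector<float> &src, int H, int W, int C) {
  const int C4 = (C + 3) / 4;
  std::vector<float> dst(C * H * W);
  for (int h = 0; h < H; ++h)
    for (int w = 0; w < W; ++w)
      for (int s = 0; s < C4; ++s) {       // one iteration per texel, like Z in the kernel
        const int remain = C - s * 4;      // how many lanes are real channels
        for (int i = 0; i < 4 && i < remain; ++i)
          dst[((s * 4 + i) * H + h) * W + w] = src[((h * W + w) * C4 + s) * 4 + i];
      }
  return dst;
}
```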
| __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, | __kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, | ||||
| int4 shape) { | int4 shape) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| @@ -40,8 +40,6 @@ using mindspore::schema::PrimitiveType_Activation; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| void ActivationOpenClKernel::InitBuffer() {} | |||||
| int ActivationOpenClKernel::Init() { | int ActivationOpenClKernel::Init() { | ||||
| in_size_ = in_tensors_[0]->shape().size(); | in_size_ = in_tensors_[0]->shape().size(); | ||||
| out_size_ = out_tensors_[0]->shape().size(); | out_size_ = out_tensors_[0]->shape().size(); | ||||
| @@ -39,7 +39,7 @@ class ActivationOpenClKernel : public OpenCLKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; | ||||
| cl_int4 GetImg2dShape(); | cl_int4 GetImg2dShape(); | ||||
| void InitBuffer(); | |||||
| void InitBuffer() {} | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | cl::Kernel kernel_; | ||||
| @@ -16,10 +16,10 @@ | |||||
| #include <cstring> | #include <cstring> | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <set> | #include <set> | ||||
| #include<string> | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" | #include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc" | #include "src/runtime/kernel/opencl/cl/arithmeticself.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -145,31 +145,12 @@ int ArithmeticSelfOpenCLKernel::Init() { | |||||
| int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; } | int ArithmeticSelfOpenCLKernel::ReSize() { return RET_OK; } | ||||
| int ArithmeticSelfGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && max_divider >= 8) { | |||||
| return number / 8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(ArithmeticSelfGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/batchnorm.h" | #include "src/runtime/kernel/opencl/kernel/batchnorm.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc" | #include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -49,6 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz | |||||
| *img_size = vec; | *img_size = vec; | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int BatchNormOpenCLKernel::Init() { | int BatchNormOpenCLKernel::Init() { | ||||
| auto in_format = op_format_; | auto in_format = op_format_; | ||||
| if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { | if (in_format != schema::Format_NHWC4 && in_format != schema::Format_NC4HW4) { | ||||
| @@ -79,31 +81,12 @@ int BatchNormOpenCLKernel::Init() { | |||||
| int BatchNormOpenCLKernel::ReSize() { return RET_OK; } | int BatchNormOpenCLKernel::ReSize() { return RET_OK; } | ||||
| int BatchnormGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && 8 <= max_divider) { | |||||
| return number / 8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(BatchnormGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(BatchnormGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -111,6 +94,7 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t | |||||
| local->push_back(y); | local->push_back(y); | ||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int BatchNormOpenCLKernel::Run() { | int BatchNormOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_); | auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_); | ||||
| @@ -14,12 +14,12 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include <cstring> | #include <cstring> | ||||
| #include <string> | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <set> | #include <set> | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/concat.h" | #include "src/runtime/kernel/opencl/kernel/concat.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/concat.cl.inc" | #include "src/runtime/kernel/opencl/cl/concat.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -131,31 +131,12 @@ int ConcatOpenCLKernel::GetSumShape(std::vector<int> *sum_shape, std::vector<int | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConcatGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && max_divider >= 8) { | |||||
| return number / 8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(ConcatGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(ConcatGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -163,6 +144,7 @@ void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> * | |||||
| local->push_back(y); | local->push_back(y); | ||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int ConcatOpenCLKernel::Run() { | int ConcatOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | ||||
| @@ -19,6 +19,7 @@ | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include "src/common/utils.h" | #include "src/common/utils.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/convolution.h" | #include "src/runtime/kernel/opencl/kernel/convolution.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| @@ -113,7 +114,7 @@ int ConvolutionOpenCLKernel::Init() { | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int ConvolutionOpenCLKernel::RearrangeWinogradWeight() { | |||||
| int ConvolutionOpenCLKernel::GenerateWinogradWeight() { | |||||
| constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, | constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, | ||||
| 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, | 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, | ||||
| 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; | 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; | ||||
| @@ -155,41 +156,16 @@ int ConvolutionOpenCLKernel::RearrangeWinogradWeight() { | |||||
| } | } | ||||
| if (use_fp16_) { | if (use_fp16_) { | ||||
| OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2); | |||||
| ConvertConvWeight4DTo7D<float, float16_t>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, | |||||
| 6, CI_, 2); | |||||
| } else { | } else { | ||||
| OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2); | |||||
| ConvertConvWeight4DTo7D<float, float>(reinterpret_cast<void *>(encoded_weight.data()), packed_weight_, CO_, 6, 6, | |||||
| CI_, 2); | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| template <typename SRC_T, typename DST_T> | |||||
| int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) { | |||||
| auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI); | |||||
| auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_); | |||||
| // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4 | |||||
| for (size_t co = 0, src_idx = 0; co < CO_; ++co) { | |||||
| for (size_t kh = 0; kh < KH; ++kh) { | |||||
| for (size_t kw = 0; kw < KW; ++kw) { | |||||
| for (size_t ci = 0; ci < CI_; ++ci) { | |||||
| size_t co_outer = co / (CO_TILE * OGroup); | |||||
| size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE; | |||||
| size_t co_inner = co % CO_TILE; | |||||
| size_t ci_outer = ci / CI_TILE; | |||||
| size_t ci_inner = ci % CI_TILE; | |||||
| size_t dst_idx = | |||||
| (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) * | |||||
| CO_TILE + | |||||
| co_inner; | |||||
| packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| int ConvolutionOpenCLKernel::InitWeight() { | int ConvolutionOpenCLKernel::InitWeight() { | ||||
| auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator(); | ||||
| @@ -206,20 +182,20 @@ int ConvolutionOpenCLKernel::InitWeight() { | |||||
| // rearrange weight | // rearrange weight | ||||
| if (use_winograd_) { | if (use_winograd_) { | ||||
| RearrangeWinogradWeight(); | |||||
| GenerateWinogradWeight(); | |||||
| } else { | } else { | ||||
| auto weight_tensor = in_tensors_[1]; | auto weight_tensor = in_tensors_[1]; | ||||
| if (weight_tensor->data_type() == kNumberTypeFloat16) { | if (weight_tensor->data_type() == kNumberTypeFloat16) { | ||||
| if (use_fp16_) { | if (use_fp16_) { | ||||
| OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } else { | } else { | ||||
| OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } | } | ||||
| } else { | } else { | ||||
| if (use_fp16_) { | if (use_fp16_) { | ||||
| OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } else { | } else { | ||||
| OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1); | |||||
| ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -635,7 +611,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { | |||||
| " }\n" | " }\n" | ||||
| "\n" | "\n" | ||||
| " int IH = input_shape.y, IW = input_shape.z;\n" | " int IH = input_shape.y, IW = input_shape.z;\n" | ||||
| " int TILE_X = IW / 4;\n" | |||||
| " int TILE_X = UP_DIV(IW, 4);\n" | |||||
| " int tile_x = tile_xy % TILE_X;\n" | " int tile_x = tile_xy % TILE_X;\n" | ||||
| " int tile_y = tile_xy / TILE_X;\n" | " int tile_y = tile_xy / TILE_X;\n" | ||||
| "\n" | "\n" | ||||
| @@ -764,6 +740,8 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { | |||||
| std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | ||||
| std::string code = | std::string code = | ||||
| "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" | "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" | ||||
| "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" | |||||
| "\n" | |||||
| "__constant sampler_t\n" | "__constant sampler_t\n" | ||||
| "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" | "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" | ||||
| "\n" | "\n" | ||||
| @@ -804,6 +782,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||||
| " }\n" | " }\n" | ||||
| " }\n" | " }\n" | ||||
| "\n" | "\n" | ||||
| " int TILE_X = UP_DIV(OW, 4);\n" | |||||
| " for (int x = 0; x < 4; x++)\n" | " for (int x = 0; x < 4; x++)\n" | ||||
| " {\n" | " {\n" | ||||
| " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" | " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" | ||||
| @@ -822,14 +801,15 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { | |||||
| } | } | ||||
| code += | code += | ||||
| " int TILE_X = OW / 4;\n" | |||||
| " int tile_x = tile_xy % TILE_X * 4;\n" | |||||
| " int tile_y = tile_xy / TILE_X * 4;\n"; | |||||
| " int tile_x = tile_xy % TILE_X;\n" | |||||
| " int tile_y = tile_xy / TILE_X;\n" | |||||
| " int ow = tile_x * 4 + x;\n" | |||||
| " int oh = tile_y * 4 + row;\n"; | |||||
| if (op_format_ == Format_NHWC4) { | if (op_format_ == Format_NHWC4) { | ||||
| code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n"; | |||||
| code += " if(ow < OW) { WRITE_IMAGE(output, (int2)(ow * SLICES + slice, oh), acc);}\n"; | |||||
| } else if (op_format_ == Format_NC4HW4) { | } else if (op_format_ == Format_NC4HW4) { | ||||
| code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n"; | |||||
| code += " if(oh < OH) { WRITE_IMAGE(output, (int2)(ow, slice * OH + oh), acc);}\n"; | |||||
| } | } | ||||
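
Switching the tile count to `UP_DIV(OW, 4)` makes the final partial tile exist at all, and the `ow < OW && oh < OH` guards keep that tile from writing outside the real output. A small check of the column arithmetic, with OW assumed purely for illustration:

```cpp
#include <cstdio>
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

int main() {
  const int OW = 10;                   // assumed output width, not a multiple of 4
  const int TILE_X = UP_DIV(OW, 4);    // 3 tiles spanning columns 0..11
  for (int tile_x = 0; tile_x < TILE_X; ++tile_x)
    for (int x = 0; x < 4; ++x) {
      const int ow = tile_x * 4 + x;
      if (ow < OW) printf("%d ", ow);  // the guard skips ow = 10 and 11
    }
  printf("\n");                        // prints 0..9; the old OW / 4 tiling lost columns 8..9
}
```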
| code += | code += | ||||
| @@ -849,7 +829,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std | |||||
| size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; | size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1]; | ||||
| size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; | size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2]; | ||||
| size_t local_c = GetBiggestDivider(global_c, max_z_size); | |||||
| size_t local_c = GetMaxDivisor(global_c, max_z_size); | |||||
| if (local_c == 0) { | if (local_c == 0) { | ||||
| MS_LOG(ERROR) << "Divide by zero"; | MS_LOG(ERROR) << "Divide by zero"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -68,9 +68,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||||
| int InitWeight(); | int InitWeight(); | ||||
| int InitBias(); | int InitBias(); | ||||
| int RearrangeWinogradWeight(); | |||||
| template <typename SRC_T, typename DST_T> | |||||
| int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup); | |||||
| int GenerateWinogradWeight(); | |||||
| std::string CodeGenConvolutionNHWC4(); | std::string CodeGenConvolutionNHWC4(); | ||||
| std::string CodeGenConvolutionNC4HW4(); | std::string CodeGenConvolutionNC4HW4(); | ||||
| @@ -90,29 +88,6 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||||
| const bool hw_good = TILES_X_ * TILES_Y_ >= 16; | const bool hw_good = TILES_X_ * TILES_Y_ >= 16; | ||||
| return attr_valid && channel_good && hw_good; | return attr_valid && channel_good && hw_good; | ||||
| } | } | ||||
| static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) { | |||||
| std::vector<float> C(M * K); | |||||
| for (int i = 0; i < M; ++i) { | |||||
| for (int j = 0; j < K; ++j) { | |||||
| float s = 0.0f; | |||||
| for (int k = 0; k < N; ++k) { | |||||
| s += A[i * N + k] * B[k * K + j]; | |||||
| } | |||||
| C[i * K + j] = s; | |||||
| } | |||||
| } | |||||
| return C; | |||||
| } | |||||
| static int GetBiggestDivider(int x, int y) { | |||||
| for (int i = y; i != 0; i--) { | |||||
| if (x % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return 1; | |||||
| } | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| #include "src/runtime/kernel/opencl/kernel/slice.h" | #include "src/runtime/kernel/opencl/kernel/slice.h" | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| #include "src/runtime/kernel/opencl/cl/slice.cl.inc" | #include "src/runtime/kernel/opencl/cl/slice.cl.inc" | ||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| @@ -49,6 +50,7 @@ int SliceOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||||
| *img_size = vec; | *img_size = vec; | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int SliceOpenCLKernel::Init() { | int SliceOpenCLKernel::Init() { | ||||
| std::string kernel_name = "slice"; | std::string kernel_name = "slice"; | ||||
| auto in_format = op_format_; | auto in_format = op_format_; | ||||
| @@ -77,28 +79,12 @@ int SliceOpenCLKernel::Init() { | |||||
| int SliceOpenCLKernel::ReSize() { return RET_OK; } | int SliceOpenCLKernel::ReSize() { return RET_OK; } | ||||
| int SliceGetBiggestDividerWithPriority(int number, int max_divider) { | |||||
| if (number % 8 == 0 && 8 <= max_divider) { | |||||
| return number / 8; | |||||
| } else if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return number / 4; | |||||
| } else if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return number / 2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return i; | |||||
| } | |||||
| } | |||||
| return 1; | |||||
| } | |||||
| void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | ||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| const int max_x = 4, max_y = 8; | const int max_x = 4, max_y = 8; | ||||
| int x = std::min(SliceGetBiggestDividerWithPriority(global[0], max_divider), max_x); | |||||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||||
| int yz = max_size / x; | int yz = max_size / x; | ||||
| int y = std::min(std::min(SliceGetBiggestDividerWithPriority(global[1], max_divider), yz), max_y); | |||||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | ||||
| local->clear(); | local->clear(); | ||||
| @@ -106,6 +92,7 @@ void SlcieGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *l | |||||
| local->push_back(y); | local->push_back(y); | ||||
| local->push_back(z); | local->push_back(z); | ||||
| } | } | ||||
| int SliceOpenCLKernel::Run() { | int SliceOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_); | auto param = reinterpret_cast<SliceParameter *>(this->op_parameter_); | ||||
| @@ -154,5 +141,4 @@ kernel::LiteKernel *OpenCLSliceKernelCreator(const std::vector<lite::Tensor *> & | |||||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator); | REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Slice, OpenCLSliceKernelCreator); | ||||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator); | REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Slice, OpenCLSliceKernelCreator); | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -16,7 +16,6 @@ | |||||
| #include "src/runtime/kernel/opencl/utils.h" | #include "src/runtime/kernel/opencl/utils.h" | ||||
| #include <algorithm> | #include <algorithm> | ||||
| #include <string> | |||||
| #include <vector> | #include <vector> | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| @@ -34,26 +33,61 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con | |||||
| } | } | ||||
| } // namespace mindspore::lite | } // namespace mindspore::lite | ||||
| namespace mindspore { | |||||
| namespace kernel { | |||||
| namespace mindspore::kernel { | |||||
| int GetMaxDivisor(int x, int divisor) { | |||||
| int i = divisor; | |||||
| while (i > 0) { | |||||
| if (x % i == 0) { | |||||
| return i; | |||||
| } | |||||
| i--; | |||||
| } | |||||
| return 1; | |||||
| } | |||||
| int GetMaxDivisorStrategy0(int x, int divisor) { | |||||
| if (divisor >= 8 && x % 8 == 0) { | |||||
| return 8; | |||||
| } else if (divisor >= 4 && x % 4 == 0) { | |||||
| return 4; | |||||
| } else if (divisor >= 2 && x % 2 == 0) { | |||||
| return 2; | |||||
| } else { | |||||
| return GetMaxDivisor(x, divisor); | |||||
| } | |||||
| } | |||||
| int GetMaxDivisorStrategy1(int x, int divisor) { | |||||
| if (divisor >= 8 && x % 8 == 0) { | |||||
| return x / 8; | |||||
| } else if (divisor >= 4 && x % 4 == 0) { | |||||
| return x / 4; | |||||
| } else if (divisor >= 2 && x % 2 == 0) { | |||||
| return x / 2; | |||||
| } else { | |||||
| return GetMaxDivisor(x, divisor); | |||||
| } | |||||
| } | |||||
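
The two strategies differ only in what they return for the priority divisors 8, 4 and 2: strategy 0 returns the divisor itself, while strategy 1 returns the co-factor `x / divisor`, which the kernels' work-group helpers use directly as a local-size dimension. A quick sanity check, assuming the declarations above are visible through `opencl/utils.h`:

```cpp
#include <cassert>
#include "src/runtime/kernel/opencl/utils.h"

using mindspore::kernel::GetMaxDivisorStrategy0;
using mindspore::kernel::GetMaxDivisorStrategy1;

int main() {
  assert(GetMaxDivisorStrategy0(24, 8) == 8);  // largest of {8, 4, 2} dividing 24
  assert(GetMaxDivisorStrategy1(24, 8) == 3);  // the co-factor 24 / 8
  assert(GetMaxDivisorStrategy1(20, 8) == 5);  // 20 % 8 != 0 but 20 % 4 == 0 -> 20 / 4
  assert(GetMaxDivisorStrategy0(7, 8) == 7);   // odd x falls back to GetMaxDivisor
  return 0;
}
```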
| std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) { | std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) { | ||||
| std::vector<size_t> result(3, 1); | |||||
| std::vector<size_t> result(3); | |||||
| for (int i = 0; i < 3; ++i) { | for (int i = 0; i < 3; ++i) { | ||||
| result[i] = AlignByN(global[i], local[i]); | |||||
| result[i] = UP_ROUND(global[i], local[i]); | |||||
| } | } | ||||
| return result; | return result; | ||||
| } | } | ||||
| std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) { | std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) { | ||||
| size_t wg_z = GetBiggestDividerWithPriority(global[2], 8); | |||||
| if (wg_z == 0) { | |||||
| size_t local_z = GetMaxDivisorStrategy0(global[2], 8); | |||||
| if (local_z == 0) { | |||||
| MS_LOG(ERROR) << "Divide by zero"; | MS_LOG(ERROR) << "Divide by zero"; | ||||
| return {}; | return {}; | ||||
| } | } | ||||
| size_t wg_xy_size = max_size / wg_z; | |||||
| size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size); | |||||
| size_t wg_y = std::min(wg_xy_size / wg_x, global[1]); | |||||
| std::vector<size_t> local = {wg_x, wg_y, wg_z}; | |||||
| size_t local_xy = max_size / local_z; | |||||
| size_t local_x = std::min(UP_DIV(global[0], 2), local_xy); | |||||
| size_t local_y = std::min(local_xy / local_x, global[1]); | |||||
| std::vector<size_t> local = {local_x, local_y, local_z}; | |||||
| return local; | return local; | ||||
| } | } | ||||
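
Worked through with assumed numbers: for global = {16, 16, 24} and max_size = 256, local_z = GetMaxDivisorStrategy0(24, 8) = 8, local_xy = 256 / 8 = 32, local_x = min(UP_DIV(16, 2), 32) = 8 and local_y = min(32 / 8, 16) = 4, so the returned local size {8, 4, 8} fills the 256-invocation budget exactly.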
| @@ -187,5 +221,4 @@ std::string CLErrorCode(cl_int error_code) { | |||||
| return "Unknown OpenCL error code"; | return "Unknown OpenCL error code"; | ||||
| } | } | ||||
| } | } | ||||
| } // namespace kernel | |||||
| } // namespace mindspore | |||||
| } // namespace mindspore::kernel | |||||
| @@ -23,7 +23,7 @@ | |||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| #include "nnacl/op_base.h" | #include "nnacl/op_base.h" | ||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| #include "src/common//utils.h" | |||||
| #include "src/common/utils.h" | |||||
| namespace mindspore::lite { | namespace mindspore::lite { | ||||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | ||||
| @@ -32,59 +32,14 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| /** | |||||
| * GetLocalSize | |||||
| * @param number | |||||
| * @param max_divider | |||||
| * @return | |||||
| */ | |||||
| template <typename T, typename N> | |||||
| T GetBiggestDividerWithPriority(T number, N max_divider) { | |||||
| if (number % 8 == 0 && 8 <= max_divider) { | |||||
| return (T)8; | |||||
| } | |||||
| if (number % 4 == 0 && 4 <= max_divider) { | |||||
| return (T)4; | |||||
| } | |||||
| if (number % 2 == 0 && 2 <= max_divider) { | |||||
| return (T)2; | |||||
| } | |||||
| for (int i = max_divider; i != 0; i--) { | |||||
| if (number % i == 0) { | |||||
| return (T)i; | |||||
| } | |||||
| } | |||||
| return (T)1; | |||||
| } | |||||
| int GetMaxDivisor(int x, int divisor); | |||||
| /** | |||||
| * GetLocalSize | |||||
| * @param n must be non negative | |||||
| * @param divisor must be greater than zero | |||||
| * @return | |||||
| */ | |||||
| template <typename T, typename N> | |||||
| T DivideRoundUp(T n, N divisor) { | |||||
| const T div = static_cast<T>(divisor); | |||||
| const T q = n / div; | |||||
| return n % div == 0 ? q : q + 1; | |||||
| } | |||||
| int GetMaxDivisorStrategy0(int x, int divisor); | |||||
| /** | |||||
| * GetLocalSize | |||||
| * @param number | |||||
| * @param n | |||||
| * @return | |||||
| */ | |||||
| template <typename T, typename N> | |||||
| T AlignByN(T number, N n) { | |||||
| return DivideRoundUp(number, n) * n; | |||||
| } | |||||
| int GetMaxDivisorStrategy1(int x, int divisor); | |||||
| // GetGlobalSize | |||||
| std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global); | std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global); | ||||
| // GetLocalSize | |||||
| std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size); | std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size); | ||||
| std::string CLErrorCode(cl_int error_code); | std::string CLErrorCode(cl_int error_code); | ||||
| @@ -108,6 +63,7 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| template <class T1, class T2> | template <class T1, class T2> | ||||
| void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | ||||
| int c4 = UP_DIV(channel, C4NUM); | int c4 = UP_DIV(channel, C4NUM); | ||||
| @@ -132,6 +88,7 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| template <class T1, class T2> | template <class T1, class T2> | ||||
| void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) { | ||||
| int c4 = UP_DIV(channel, C4NUM); | int c4 = UP_DIV(channel, C4NUM); | ||||
| @@ -152,6 +109,47 @@ void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c | |||||
| } | } | ||||
| } | } | ||||
| template <class T> | |||||
| std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) { | |||||
| std::vector<T> C(M * K); | |||||
| for (int i = 0; i < M; ++i) { | |||||
| for (int j = 0; j < K; ++j) { | |||||
| float s = 0.0f; | |||||
| for (int k = 0; k < N; ++k) { | |||||
| s += A[i * N + k] * B[k * K + j]; | |||||
| } | |||||
| C[i * K + j] = s; | |||||
| } | |||||
| } | |||||
| return C; | |||||
| } | |||||
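
`MatrixMultiply` generalizes the private helper that convolution.h used to carry; GenerateWinogradWeight presumably uses it with the Gt constants shown earlier to form the Winograd filter transform. A hypothetical standalone check of the M/N/K convention (A is M×N, B is N×K, C is M×K):

```cpp
#include <cstdio>
#include <vector>
#include "src/runtime/kernel/opencl/utils.h"  // MatrixMultiply template above

int main() {
  const float A[] = {1, 2, 3,
                     4, 5, 6};        // 2x3
  const float B[] = {1, 0,
                     0, 1,
                     1, 1};           // 3x2
  auto C = mindspore::kernel::MatrixMultiply(A, B, 2, 3, 2);
  for (float v : C) printf("%g ", v);  // 4 5 10 11
  printf("\n");
}
```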
| template <typename SRC_T, typename DST_T> | |||||
| void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1, | |||||
| size_t CI_TILE = 4, size_t CO_TILE = 4) { | |||||
| auto origin_weight = reinterpret_cast<SRC_T *>(src); | |||||
| auto packed_weight = reinterpret_cast<DST_T *>(dst); | |||||
| auto CI_SLICES = UP_DIV(CI, CI_TILE); | |||||
| for (size_t co = 0, src_idx = 0; co < CO; ++co) { | |||||
| for (size_t kh = 0; kh < KH; ++kh) { | |||||
| for (size_t kw = 0; kw < KW; ++kw) { | |||||
| for (size_t ci = 0; ci < CI; ++ci) { | |||||
| size_t co_outer = co / (CO_TILE * OGroup); | |||||
| size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE; | |||||
| size_t co_inner = co % CO_TILE; | |||||
| size_t ci_outer = ci / CI_TILE; | |||||
| size_t ci_inner = ci % CI_TILE; | |||||
| size_t dst_idx = | |||||
| (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) * | |||||
| CO_TILE + | |||||
| co_inner; | |||||
| packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
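
`ConvertConvWeight4DTo7D` subsumes the per-kernel `OHWI2OHWIOGroupI4O4` routine: OHWI weights are repacked into O/(OGroup·4) × KH × KW × CI/4 × OGroup × I4 × O4 order, with OGroup = 2 on the Winograd path and the default 1 elsewhere. A worked example with sizes assumed purely for illustration:

```cpp
#include <cstdio>
#include <vector>
#include "src/runtime/kernel/opencl/utils.h"  // template defined above

int main() {
  // CO = 8, KH = KW = 1, CI = 8, OGroup = 2 -> CI_SLICES = 2, and the packed
  // destination holds (8 / (2 * 4)) * 1 * 1 * 2 * 2 * 4 * 4 = 64 elements.
  std::vector<float> src(8 * 1 * 1 * 8), dst(64, 0.0f);
  for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<float>(i);
  mindspore::kernel::ConvertConvWeight4DTo7D<float, float>(src.data(), dst.data(), 8, 1, 1, 8, 2);
  // The element at co = 5, ci = 6 (src_idx = 5 * 8 + 6 = 46) lands at
  // dst_idx = (((0 * 2 + 1) * 2 + 1) * 4 + 2) * 4 + 1 = 57, since co_outer = 0,
  // group_idx = 1, co_inner = 1, ci_outer = 1, ci_inner = 2.
  printf("%g\n", dst[57]);  // prints 46
}
```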
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_ | #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_ | ||||
| @@ -133,6 +133,8 @@ Format get_op_format(Format input_format) { | |||||
| case Format_NHWC: | case Format_NHWC: | ||||
| case Format_NHWC4: | case Format_NHWC4: | ||||
| return Format_NHWC4; | return Format_NHWC4; | ||||
| case Format_NCHW: | |||||
| return Format_NHWC4; | |||||
| default: | default: | ||||
| return Format_NC4HW4; | return Format_NC4HW4; | ||||
| } | } | ||||
| @@ -249,7 +251,7 @@ TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256 | |||||
| TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); | TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 0.6f, "testcases/test_fp32/"); | ||||
| } | } | ||||
| TEST_F(TestConvolutionOpenCL, simple_test0) { | |||||
| TEST_F(TestConvolutionOpenCL, simple_test0_NHWC) { | |||||
| std::string attr = | std::string attr = | ||||
| "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | ||||
| float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; | float input_data[] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; | ||||
| @@ -259,6 +261,34 @@ TEST_F(TestConvolutionOpenCL, simple_test0) { | |||||
| TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); | TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); | ||||
| TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); | TEST_MAIN(attr, Format_NHWC, Format_NHWC, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); | ||||
| } | } | ||||
| TEST_F(TestConvolutionOpenCL, simple_test0_NCHW) { | |||||
| std::string attr = | |||||
| "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | |||||
| float input_data[] = {0.0f, 2.0f, 4.0f, 6.0f, 1.0f, 3.0f, 5.0f, 7.0f}; | |||||
| float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| float bias_data[] = {0.0f, 0.0f}; | |||||
| float expect_data[] = {1.0f, 5.0f, 9.0f, 13.0f, 1.0f, 5.0f, 9.0f, 13.0f}; | |||||
| TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, expect_data); | |||||
| TEST_MAIN(attr, Format_NCHW, Format_NCHW, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, expect_data); | |||||
| } | |||||
| TEST_F(TestConvolutionOpenCL, simple_test0_NHWC4_and_NC4HW4) { | |||||
| std::string attr = | |||||
| "inputNHWC_1x2x2x2_outputNHWC_1x2x2x2_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_1x1"; | |||||
| float input_data[] = {0.0f, 1.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 6.0f, 7.0f, 0.0f, 0.0f}; | |||||
| float weight_data[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; | |||||
| float bias_data[] = {0.0f, 0.0f}; | |||||
| float expect_data[] = {1.0f, 1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 0.0f, 0.0f, | |||||
| 9.0f, 9.0f, 0.0f, 0.0f, 13.0f, 13.0f, 0.0f, 0.0f}; | |||||
| TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| TEST_MAIN(attr, Format_NHWC4, Format_NHWC4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat32, 1e-3f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| TEST_MAIN(attr, Format_NC4HW4, Format_NC4HW4, kNumberTypeFloat16, 1e-6f, input_data, weight_data, bias_data, | |||||
| expect_data); | |||||
| } | |||||
| TEST_F(TestConvolutionOpenCL, simple_test1) { | TEST_F(TestConvolutionOpenCL, simple_test1) { | ||||
| std::string attr = | std::string attr = | ||||