From d431fc717617a2bf26a0b75aced8cdab642ade99 Mon Sep 17 00:00:00 2001
From: wandongdong <wandongdong1@huawei.com>
Date: Tue, 27 Oct 2020 20:19:23 -0700
Subject: [PATCH] optimize opencl depthwise

---
 .../kernel/opencl/cl/depthwise_conv2d.cl      | 149 +++++++++++++++---
 .../kernel/opencl/kernel/depthwise_conv2d.cc  |  18 ++-
 .../kernel/opencl/kernel/depthwise_conv2d.h   |   1 +
 3 files changed, 143 insertions(+), 25 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
index 070b842fb8..44f3d0a051 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
@@ -32,44 +32,155 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa
   WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res);
 }
 
-__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
-                                        __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
-                                        int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min,
-                                        float relu_clip_max) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2) * 2;
+__kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                             __write_only image2d_t dst_data, int2 kernel_size, int2 stride,
+                                             int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+                                             float relu_clip_min, float relu_clip_max) {
+  int X = get_global_id(1) * 2;
+  int Y = get_global_id(2) * 2;
+  int Z = get_global_id(0) * 2;
   if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
-  FLT4 r[2] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
+  FLT4 r[8] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
+               (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
+               (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
   int x_offset = X * stride.x + padding.x;
   int y_offset = Y * stride.y + padding.y;
   int f_len = kernel_size.x * kernel_size.y;
   int fx_c = Z * f_len;
+  bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
+  bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
+  bool last_c = (get_global_id(0) == (dst_size.z + 1) / 2) && ((dst_size.z & 0x1) == 1);
   for (int ky = 0; ky < kernel_size.y; ++ky) {
     int y_c = y_offset + ky * dilation.y;
-    bool outside_y = y_c < 0 || y_c >= src_size.y;
+    int y_c_a1 = y_c + stride.y;
     for (int kx = 0; kx < kernel_size.x; ++kx) {
       int x_c = x_offset + kx * dilation.x;
-      bool outside_x = x_c < 0 || x_c >= src_size.x;
-      if (!outside_x && !outside_y) {
-        FLT4 flt_p0 = filter[fx_c];
-        FLT4 flt_p1 = filter[fx_c + f_len];
-        FLT4 src_p0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
-        FLT4 src_p1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c));
-        r[0] += TO_FLT4(src_p0 * flt_p0);
-        r[1] += TO_FLT4(src_p1 * flt_p1);
+      int x_c_a1 = x_c + stride.x;
+      int x_sign = x_c < 0 ? -1 : 1;
+      int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
+      FLT4 flt_p0 = filter[fx_c];
+      FLT4 flt_p1 = filter[fx_c + f_len];
+      {
+        FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
+        FLT4 src_p00_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_sign + x_c * src_size.z, y_c));
+        r[0] += TO_FLT4(src_p00_c0 * flt_p0);
+        r[1] += TO_FLT4(src_p00_c1 * flt_p1);
+      }
+      {
+        FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
+        FLT4 src_p01_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c));
+        r[2] += TO_FLT4(src_p01_c0 * flt_p0);
+        r[3] += TO_FLT4(src_p01_c1 * flt_p1);
+      }
+      {
+        FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
+        FLT4 src_p10_c1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c_a1));
+        r[4] += TO_FLT4(src_p10_c0 * flt_p0);
+        r[5] += TO_FLT4(src_p10_c1 * flt_p1);
+      }
+      {
+        FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
+        FLT4 src_p11_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
+        r[6] += TO_FLT4(src_p11_c0 * flt_p0);
+        r[7] += TO_FLT4(src_p11_c1 * flt_p1);
       }
       fx_c++;
     }
   }
   r[0] += bias[Z];
-  r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
   r[1] += bias[Z + 1];
+  r[2] += bias[Z];
+  r[3] += bias[Z + 1];
+  r[4] += bias[Z];
+  r[5] += bias[Z + 1];
+  r[6] += bias[Z];
+  r[7] += bias[Z + 1];
+  r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
   r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[4] = clamp(r[4], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[5] = clamp(r[5], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[6] = clamp(r[6], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[7] = clamp(r[7], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
   WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]);
-  if ((dst_size.z & 0x1) == 0) {
+  if (!last_c) {
     WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]);
   }
+  if (!last_x) {
+    WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[2]);
+    if (!last_c) {
+      WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z + 1, Y), r[3]);
+    }
+  }
+  if (!last_y) {
+    WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[4]);
+    if (!last_c) {
+      WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y + 1), r[5]);
+    }
+  }
+  if (!last_y && !last_x) {
+    WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[6]);
+    if (!last_c) {
+      WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z + 1, Y + 1), r[7]);
+    }
+  }
+}
+__kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
+                                             __write_only image2d_t dst_data, int2 kernel_size, int2 stride,
+                                             int2 padding, int2 dilation, int4 src_size, int4 dst_size,
+                                             float relu_clip_min, float relu_clip_max) {
+  int X = get_global_id(1) * 2;
+  int Y = get_global_id(2) * 2;
+  int Z = get_global_id(0);
+  if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
+  FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f),
+               (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)};
+  int x_offset = X * stride.x + padding.x;
+  int y_offset = Y * stride.y + padding.y;
+  int f_len = kernel_size.x * kernel_size.y;
+  int fx_c = Z * f_len;
+  bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1);
+  bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1);
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = y_offset + ky * dilation.y;
+    int y_c_a1 = y_c + stride.y;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = x_offset + kx * dilation.x;
+      int x_c_a1 = x_c + stride.x;
+      int x_sign = x_c < 0 ? -1 : 1;
+      int x_a1_sign = x_c_a1 < 0 ? -1 : 1;
+      FLT4 flt_p0 = filter[fx_c];
+      FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c));
+      r[0] += TO_FLT4(src_p00_c0 * flt_p0);
+      FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c));
+      r[1] += TO_FLT4(src_p01_c0 * flt_p0);
+      FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1));
+      r[2] += TO_FLT4(src_p10_c0 * flt_p0);
+      FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1));
+      r[3] += TO_FLT4(src_p11_c0 * flt_p0);
+
+      fx_c++;
+    }
+  }
+  r[0] += bias[Z];
+  r[1] += bias[Z];
+  r[2] += bias[Z];
+  r[3] += bias[Z];
+  r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max));
+  WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]);
+  if (!last_x) {
+    WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[1]);
+  }
+  if (!last_y) {
+    WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[2]);
+  }
+  if (!last_y && !last_x) {
+    WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]);
+  }
 }
 __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
                                             __write_only image2d_t dst_data, int2 kernel_size, int2 stride,
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
index 6a33695a37..9cc826f945 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -19,6 +19,7 @@
 #include <string>
 #include <set>
 #include <map>
+#include <algorithm>
 #include <utility>
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/opencl/utils.h"
@@ -52,6 +53,10 @@ int DepthwiseConv2dOpenCLKernel::Init() {
   if (parameter->kernel_h_ == 1) {
     kernel_name += "_1x1";
   }
+  kernel_name += "_b";
+  for (auto iv : block_size_) {
+    kernel_name += std::to_string(iv);
+  }
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@@ -135,8 +140,9 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
 }
 
 int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * 2);
-  std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(), (size_t)out_tensors_[0]->Height(), CO4};
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]);
+  std::vector<size_t> global = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]),
+                                (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])};
   *global_size = std::move(global);
   return mindspore::lite::RET_OK;
 }
@@ -144,11 +150,11 @@ int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *
 int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
                                               std::vector<size_t> *local_size) {
   const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
-  int z = global_size[2];
-  int y = static_cast<size_t>(std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[1], 8)));
-  int x = std::max(1, std::min(static_cast<int>(global_size[0]), max_group_size / (y * z)));
+  int z = global_size[0];
+  int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8));
+  int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z)));
   local_size->clear();
-  *local_size = std::vector<size_t>({static_cast<size_t>(x), static_cast<size_t>(y), static_cast<size_t>(z)});
+  *local_size = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)});
   return mindspore::lite::RET_OK;
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
index 5655579ebe..58bcbb5edb 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -45,6 +45,7 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
   void *packed_weight_{nullptr};
   void *bias_data_{nullptr};
   cl::Kernel kernel_;
+  std::vector<int> block_size_{2, 2, 1};
 };
 }  // namespace mindspore::kernel