From d431fc717617a2bf26a0b75aced8cdab642ade99 Mon Sep 17 00:00:00 2001 From: wandongdong Date: Tue, 27 Oct 2020 20:19:23 -0700 Subject: [PATCH] optimize opencl depthwise --- .../kernel/opencl/cl/depthwise_conv2d.cl | 149 +++++++++++++++--- .../kernel/opencl/kernel/depthwise_conv2d.cc | 18 ++- .../kernel/opencl/kernel/depthwise_conv2d.h | 1 + 3 files changed, 143 insertions(+), 25 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl index 070b842fb8..44f3d0a051 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl @@ -32,44 +32,155 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res); } -__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, - __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, - int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, - float relu_clip_max) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2) * 2; +__kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, + __write_only image2d_t dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { + int X = get_global_id(1) * 2; + int Y = get_global_id(2) * 2; + int Z = get_global_id(0) * 2; if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; - FLT4 r[2] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)}; + FLT4 r[8] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), + (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), + (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)}; int x_offset = X * stride.x + padding.x; int y_offset = Y * stride.y + padding.y; int f_len = kernel_size.x * kernel_size.y; int fx_c = Z * f_len; + bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1); + bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1); + bool last_c = (get_global_id(0) == (dst_size.z + 1) / 2) && ((dst_size.z & 0x1) == 1); for (int ky = 0; ky < kernel_size.y; ++ky) { int y_c = y_offset + ky * dilation.y; - bool outside_y = y_c < 0 || y_c >= src_size.y; + int y_c_a1 = y_c + stride.y; for (int kx = 0; kx < kernel_size.x; ++kx) { int x_c = x_offset + kx * dilation.x; - bool outside_x = x_c < 0 || x_c >= src_size.x; - if (!outside_x && !outside_y) { - FLT4 flt_p0 = filter[fx_c]; - FLT4 flt_p1 = filter[fx_c + f_len]; - FLT4 src_p0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c)); - FLT4 src_p1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c)); - r[0] += TO_FLT4(src_p0 * flt_p0); - r[1] += TO_FLT4(src_p1 * flt_p1); + int x_c_a1 = x_c + stride.x; + int x_sign = x_c < 0 ? -1 : 1; + int x_a1_sign = x_c_a1 < 0 ? -1 : 1; + FLT4 flt_p0 = filter[fx_c]; + FLT4 flt_p1 = filter[fx_c + f_len]; + { + FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c)); + FLT4 src_p00_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_sign + x_c * src_size.z, y_c)); + r[0] += TO_FLT4(src_p00_c0 * flt_p0); + r[1] += TO_FLT4(src_p00_c1 * flt_p1); + } + { + FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c)); + FLT4 src_p01_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c)); + r[2] += TO_FLT4(src_p01_c0 * flt_p0); + r[3] += TO_FLT4(src_p01_c1 * flt_p1); + } + { + FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1)); + FLT4 src_p10_c1 = READ_IMAGE(src_data, smp_zero, (int2)(Z + 1 + x_c * src_size.z, y_c_a1)); + r[4] += TO_FLT4(src_p10_c0 * flt_p0); + r[5] += TO_FLT4(src_p10_c1 * flt_p1); + } + { + FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1)); + FLT4 src_p11_c1 = READ_IMAGE(src_data, smp_zero, (int2)((Z + 1) * x_a1_sign + x_c_a1 * src_size.z, y_c_a1)); + r[6] += TO_FLT4(src_p11_c0 * flt_p0); + r[7] += TO_FLT4(src_p11_c1 * flt_p1); } fx_c++; } } r[0] += bias[Z]; - r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); r[1] += bias[Z + 1]; + r[2] += bias[Z]; + r[3] += bias[Z + 1]; + r[4] += bias[Z]; + r[5] += bias[Z + 1]; + r[6] += bias[Z]; + r[7] += bias[Z + 1]; + r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[4] = clamp(r[4], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[5] = clamp(r[5], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[6] = clamp(r[6], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[7] = clamp(r[7], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]); - if ((dst_size.z & 0x1) == 0) { + if (!last_c) { WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]); } + if (!last_x) { + WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[2]); + if (!last_c) { + WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z + 1, Y), r[3]); + } + } + if (!last_y) { + WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[4]); + if (!last_c) { + WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y + 1), r[5]); + } + } + if (!last_y && !last_x) { + WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[6]); + if (!last_c) { + WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z + 1, Y + 1), r[7]); + } + } +} +__kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, + __write_only image2d_t dst_data, int2 kernel_size, int2 stride, + int2 padding, int2 dilation, int4 src_size, int4 dst_size, + float relu_clip_min, float relu_clip_max) { + int X = get_global_id(1) * 2; + int Y = get_global_id(2) * 2; + int Z = get_global_id(0); + if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return; + FLT4 r[4] = {(FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), (FLT4)(0.0f, 0.0f, 0.0f, 0.0f), + (FLT4)(0.0f, 0.0f, 0.0f, 0.0f)}; + int x_offset = X * stride.x + padding.x; + int y_offset = Y * stride.y + padding.y; + int f_len = kernel_size.x * kernel_size.y; + int fx_c = Z * f_len; + bool last_x = (get_global_id(1) == (dst_size.x + 1) / 2) && ((dst_size.x & 0x1) == 1); + bool last_y = (get_global_id(2) == (dst_size.y + 1) / 2) && ((dst_size.y & 0x1) == 1); + for (int ky = 0; ky < kernel_size.y; ++ky) { + int y_c = y_offset + ky * dilation.y; + int y_c_a1 = y_c + stride.y; + for (int kx = 0; kx < kernel_size.x; ++kx) { + int x_c = x_offset + kx * dilation.x; + int x_c_a1 = x_c + stride.x; + int x_sign = x_c < 0 ? -1 : 1; + int x_a1_sign = x_c_a1 < 0 ? -1 : 1; + FLT4 flt_p0 = filter[fx_c]; + FLT4 src_p00_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_sign + x_c * src_size.z, y_c)); + r[0] += TO_FLT4(src_p00_c0 * flt_p0); + FLT4 src_p01_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c)); + r[1] += TO_FLT4(src_p01_c0 * flt_p0); + FLT4 src_p10_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c_a1)); + r[2] += TO_FLT4(src_p10_c0 * flt_p0); + FLT4 src_p11_c0 = READ_IMAGE(src_data, smp_zero, (int2)(Z * x_a1_sign + x_c_a1 * src_size.z, y_c_a1)); + r[3] += TO_FLT4(src_p11_c0 * flt_p0); + + fx_c++; + } + } + r[0] += bias[Z]; + r[1] += bias[Z]; + r[2] += bias[Z]; + r[3] += bias[Z]; + r[0] = clamp(r[0], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[1] = clamp(r[1], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[2] = clamp(r[2], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + r[3] = clamp(r[3], (FLT)(relu_clip_min), (FLT)(relu_clip_max)); + WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), r[0]); + if (!last_x) { + WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y), r[1]); + } + if (!last_y) { + WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y + 1), r[2]); + } + if (!last_y && !last_x) { + WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]); + } } __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, __write_only image2d_t dst_data, int2 kernel_size, int2 stride, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 6a33695a37..9cc826f945 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "src/kernel_registry.h" #include "src/runtime/kernel/opencl/utils.h" @@ -52,6 +53,10 @@ int DepthwiseConv2dOpenCLKernel::Init() { if (parameter->kernel_h_ == 1) { kernel_name += "_1x1"; } + kernel_name += "_b"; + for (auto iv : block_size_) { + kernel_name += std::to_string(iv); + } #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else @@ -135,8 +140,9 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { } int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector *global_size) { - size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * 2); - std::vector global = {(size_t)out_tensors_[0]->Width(), (size_t)out_tensors_[0]->Height(), CO4}; + size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); + std::vector global = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), + (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; *global_size = std::move(global); return mindspore::lite::RET_OK; } @@ -144,11 +150,11 @@ int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector * int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector &global_size, std::vector *local_size) { const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); - int z = global_size[2]; - int y = static_cast(std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[1], 8))); - int x = std::max(1, std::min(static_cast(global_size[0]), max_group_size / (y * z))); + int z = global_size[0]; + int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); + int x = std::max(1, std::min(static_cast(global_size[1]), max_group_size / (y * z))); local_size->clear(); - *local_size = std::vector({static_cast(x), static_cast(y), static_cast(z)}); + *local_size = std::vector({static_cast(z), static_cast(x), static_cast(y)}); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h index 5655579ebe..58bcbb5edb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h @@ -45,6 +45,7 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { void *packed_weight_{nullptr}; void *bias_data_{nullptr}; cl::Kernel kernel_; + std::vector block_size_{2, 2, 1}; }; } // namespace mindspore::kernel