!7720 [MS][LITE][GPU]optimize reduce

Merge pull request !7720 from chenzupeng/master-lite
5 years ago · 8f95f8022c
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
@@ -1,6 +1,7 @@
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 #define LOCAL_CACHE_THREAD 16
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
@@ -17,19 +18,31 @@ __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }

 __kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
 __kernel void mean_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
  if (X >= size.z) {
    return;
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.z) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (float4)0.f;
  for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
    for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
    }
  }
  FLT4 result = (FLT4)0.f;
  for (int h = 0; h < size.x; h++) {
    for (int w = 0; w < size.y; w++) {
      result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h));
  barrier(CLK_LOCAL_MEM_FENCE);
  if (localz == 0) {
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
      temp[localy][0] += temp[localy][i];
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  float4 result = temp[0][0];
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  result /= size.x * size.y;
  WRITE_IMAGE(dst_data, (int2)(0, X), result);
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }

 __kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
@@ -46,16 +59,28 @@ __kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t d
  WRITE_IMAGE(dst_data, (int2)(X, 0), result);
 }

 __kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
 __kernel void sum_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
  if (X >= size.z) {
    return;
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.z) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (float4)0.f;
  for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
    for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
    }
  }
  FLT4 result = (FLT4)0.f;
  for (int h = 0; h < size.x; h++) {
    for (int w = 0; w < size.y; w++) {
      result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h));
  barrier(CLK_LOCAL_MEM_FENCE);
  if (localz == 0) {
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
      temp[localy][0] += temp[localy][i];
    }
  }
  WRITE_IMAGE(dst_data, (int2)(0, X), result);
  barrier(CLK_LOCAL_MEM_FENCE);
  float4 result = temp[0][0];
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
@@ -57,6 +57,10 @@ int ReduceOpenCLKernel::Init() {
    return RET_PARAM_INVALID;
  }
  std::string kernel_name = reduce_type2str.at(reduce_param->mode_);
  if (in_tensors_[0]->shape()[1] >= LOCAL_CACHE_THREAD || in_tensors_[0]->shape()[2] >= LOCAL_CACHE_THREAD) {
    use_local_ = true;
    kernel_name += "_local";
  }
  kernel_name += "_NHWC4";
  enable_fp16_ = ocl_runtime_->GetFp16Enable();

@@ -101,7 +105,10 @@ int ReduceOpenCLKernel::Run() {
  int c = shapex[3];
  int c4 = UP_DIV(c, C4NUM);
  std::vector<size_t> local = {};
  std::vector<size_t> global = {static_cast<size_t>(c4)};
  if (use_local_) {
    local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
  }
  std::vector<size_t> global = {static_cast<size_t>(c4), 1, 1};
  cl_int4 size = {h, w, c4, 1};
  int arg_idx = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
@@ -39,6 +39,8 @@ class ReduceOpenCLKernel : public OpenCLKernel {
  cl::Kernel kernel_;
  bool enable_fp16_{false};
  std::vector<size_t> nhwc_shape_;
  bool use_local_{false};
  static const size_t LOCAL_CACHE_THREAD{16};
 };
 }  // namespace mindspore::kernel