diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl index b47e6a2292..2bfb66fe7b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl @@ -1,6 +1,7 @@ #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif +#define LOCAL_CACHE_THREAD 16 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { int X = get_global_id(0); // C4 @@ -17,19 +18,31 @@ __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result)); } -__kernel void mean_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { +__kernel void mean_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { int X = get_global_id(0); // C4 - if (X >= size.z) { - return; + int localy = get_local_id(1); + int localz = get_local_id(2); + if (X >= size.z) return; + __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD]; + temp[localy][localz] = (float4)0.f; + for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) { + for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) { + temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h))); + } } - FLT4 result = (FLT4)0.f; - for (int h = 0; h < size.x; h++) { - for (int w = 0; w < size.y; w++) { - result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); + barrier(CLK_LOCAL_MEM_FENCE); + if (localz == 0) { + for (int i = 1; i < LOCAL_CACHE_THREAD; i++) { + temp[localy][0] += temp[localy][i]; } } + barrier(CLK_LOCAL_MEM_FENCE); + float4 result = temp[0][0]; + for (int i = 1; i < LOCAL_CACHE_THREAD; i++) { + result += temp[i][0]; + } result /= size.x * size.y; - WRITE_IMAGE(dst_data, (int2)(0, X), result); + WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result)); } __kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { @@ -46,16 +59,28 @@ __kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t d WRITE_IMAGE(dst_data, (int2)(X, 0), result); } -__kernel void sum_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { +__kernel void sum_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { int X = get_global_id(0); // C4 - if (X >= size.z) { - return; + int localy = get_local_id(1); + int localz = get_local_id(2); + if (X >= size.z) return; + __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD]; + temp[localy][localz] = (float4)0.f; + for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) { + for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) { + temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h))); + } } - FLT4 result = (FLT4)0.f; - for (int h = 0; h < size.x; h++) { - for (int w = 0; w < size.y; w++) { - result += READ_IMAGE(src_data, smp_zero, (int2)(w, X * size.x + h)); + barrier(CLK_LOCAL_MEM_FENCE); + if (localz == 0) { + for (int i = 1; i < LOCAL_CACHE_THREAD; i++) { + temp[localy][0] += temp[localy][i]; } } - WRITE_IMAGE(dst_data, (int2)(0, X), result); + barrier(CLK_LOCAL_MEM_FENCE); + float4 result = temp[0][0]; + for (int i = 1; i < LOCAL_CACHE_THREAD; i++) { + result += temp[i][0]; + } + WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result)); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc index b22e4e269d..214f5b5691 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -57,6 +57,10 @@ int ReduceOpenCLKernel::Init() { return RET_PARAM_INVALID; } std::string kernel_name = reduce_type2str.at(reduce_param->mode_); + if (in_tensors_[0]->shape()[1] >= LOCAL_CACHE_THREAD || in_tensors_[0]->shape()[2] >= LOCAL_CACHE_THREAD) { + use_local_ = true; + kernel_name += "_local"; + } kernel_name += "_NHWC4"; enable_fp16_ = ocl_runtime_->GetFp16Enable(); @@ -101,7 +105,10 @@ int ReduceOpenCLKernel::Run() { int c = shapex[3]; int c4 = UP_DIV(c, C4NUM); std::vector local = {}; - std::vector global = {static_cast(c4)}; + if (use_local_) { + local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; + } + std::vector global = {static_cast(c4), 1, 1}; cl_int4 size = {h, w, c4, 1}; int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h index 7e4780f57a..83ffdc453d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h @@ -39,6 +39,8 @@ class ReduceOpenCLKernel : public OpenCLKernel { cl::Kernel kernel_; bool enable_fp16_{false}; std::vector nhwc_shape_; + bool use_local_{false}; + static const size_t LOCAL_CACHE_THREAD{16}; }; } // namespace mindspore::kernel