diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl index c013d80930..b6006eee29 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl @@ -183,6 +183,23 @@ __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only imag WRITE_IMAGE(dst_data, (int2)(0, X), result2); } +__kernel void GlobalCMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, float4 mask) { + int X = get_global_id(0); // H + int Y = get_global_id(1); // W + if (X >= size.x || Y >= size.y) { + return; + } + float4 result = (float4)0.f; + for (int c = 0; c < size.z; c++) { + result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + c, X))); + } + + result /= size.w; + FLT4 result2 = (FLT4)(0.f); + result2.x = dot(TO_FLT4(result), (FLT4)(1.f)); + WRITE_IMAGE(dst_data, (int2)(Y, X), result2); +} + #define GlobalHW(Method) \ __kernel void GlobalHW##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { \ int X = get_global_id(0); \ @@ -297,7 +314,6 @@ __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only imag #define DoSum(A, B) A += B #define InitSum 0.f GlobalHW(Sum) GlobalWC(Sum) LocalHW(Sum) LocalWC(Sum) - #define DoMin(A, B) A = min(A, B) #define InitMin 10000.f GlobalHW(Min) GlobalWC(Min) LocalHW(Min) LocalWC(Min) diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc index 23e1b7bee4..57acd81e0e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc @@ -38,7 +38,8 @@ int ArgMinMaxOpenCLKernel::CheckSpecs() { MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); return RET_ERROR; } - if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { + if ((in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) || + (out_tensors_[0]->data_type() != kNumberTypeFloat32 && out_tensors_[0]->data_type() != kNumberTypeFloat16)) { MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type(); return RET_ERROR; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc index 5c9433d4e7..4953a22727 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -54,8 +54,8 @@ std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) { cl_float4 ReduceOpenCLKernel::GenC4Mask() { auto reduce_param = reinterpret_cast(op_parameter_); - int last_c4 = in_tensors_[0]->shape()[3] % C4NUM; - last_c4 = (C4NUM - last_c4) % 4; + int last_c4 = inShape.C % C4NUM; + if (last_c4 == 0) last_c4 = C4NUM; static const std::map reduce_type2init{ {ReduceMode_ReduceMean, 0.f}, {ReduceMode_ReduceSum, 0.f}, {ReduceMode_ReduceMin, 10000.f}, {ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}}; @@ -67,23 +67,19 @@ cl_float4 ReduceOpenCLKernel::GenC4Mask() { return mask; } -bool hw_reduce(const int *axes_) { return (axes_[0] == 1 && axes_[1] == 2) || (axes_[0] == 2 && axes_[1] == 1); } +bool IsHWReduce(bool *reduce_axes_) { + return !reduce_axes_[0] && reduce_axes_[1] && reduce_axes_[2] && !reduce_axes_[3]; +} -int ReduceOpenCLKernel::CheckSpecs() { - if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { - MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); - return RET_ERROR; - } - if (in_tensors_[0]->shape()[0] > 1) { - MS_LOG(ERROR) << "reduce op only support n = 1"; - return RET_PARAM_INVALID; - } - auto reduce_param = reinterpret_cast(op_parameter_); - if (GetReduceTypeStr(reduce_param->mode_).empty()) { - MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; - return RET_PARAM_INVALID; - } +bool IsWCReduce(bool *reduce_axes_) { + return !reduce_axes_[0] && !reduce_axes_[1] && reduce_axes_[2] && reduce_axes_[3]; +} + +bool IsCReduce(bool *reduce_axes_) { + return !reduce_axes_[0] && !reduce_axes_[1] && !reduce_axes_[2] && reduce_axes_[3]; +} +int ReduceOpenCLKernel::SetAxes() { // axes is input tensor // get num_axes int num_axes = 0; @@ -102,21 +98,57 @@ int ReduceOpenCLKernel::CheckSpecs() { for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) { axes_[i] = reinterpret_cast(axes_tensor->data_c())[i]; } - if (num_axes == 1 && axes_[0] == 3 && in_tensors_[0]->shape()[2] == 1) { - num_axes = 2; - axes_[1] = 2; - } - if (num_axes != 2) { - MS_LOG(ERROR) << "reduce op only support num_axes=2"; + if (num_axes > 2 || num_axes < 1) { + MS_LOG(ERROR) << "Unsupported reduce num axes " << num_axes; return RET_PARAM_INVALID; } - wc_reduce_ = (axes_[0] == 2 && axes_[1] == 3) || (axes_[0] == 3 && axes_[1] == 2); - if (!hw_reduce(axes_) && !wc_reduce_) { - MS_LOG(ERROR) << "reduce op only support axis (1,2) or (2,3)"; + for (int i = 0; i < num_axes; i++) { + int axis = axes_[i]; + axis = inShape.AlignAxis(axis); + reduce_axes_[axis] = true; + } + if (num_axes == 1) { + if (reduce_axes_[1] && inShape.W == 1) { + reduce_axes_[2] = true; + } else if (reduce_axes_[2]) { + if (inShape.H == 1) { + reduce_axes_[1] = true; + } else if (inShape.C == 1) { + reduce_axes_[3] = true; + } + } else if (reduce_axes_[3] && inShape.W == 1) { + reduce_axes_[3] = true; + } + } + return RET_OK; +} + +int ReduceOpenCLKernel::CheckSpecs() { + if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { + MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); + return RET_ERROR; + } + if (in_tensors_[0]->shape()[0] > 1) { + MS_LOG(ERROR) << "reduce op only support n = 1"; + return RET_PARAM_INVALID; + } + inShape = GpuTensorInfo(in_tensors_[0]); + auto reduce_param = reinterpret_cast(op_parameter_); + if (GetReduceTypeStr(reduce_param->mode_).empty()) { + MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; + return RET_PARAM_INVALID; + } + auto ret = SetAxes(); + if (ret != RET_OK) return ret; + hw_reduce_ = IsHWReduce(reduce_axes_); + wc_reduce_ = IsWCReduce(reduce_axes_); + c_reduce_ = IsCReduce(reduce_axes_); + if (!hw_reduce_ && !wc_reduce_ && !c_reduce_) { + MS_LOG(ERROR) << "Unsupported reduce axes"; return RET_PARAM_INVALID; } - if (wc_reduce_ && reduce_param->keep_dims_ == false) { + if ((c_reduce_ || wc_reduce_) && reduce_param->keep_dims_ == false) { MS_LOG(ERROR) << "reduce axis (2,3) should keep dims"; return RET_PARAM_INVALID; } @@ -130,18 +162,22 @@ int ReduceOpenCLKernel::Prepare() { } std::string kernel_name; - if (in_tensors_[0]->shape()[axes_[0]] >= LOCAL_CACHE_THREAD || - in_tensors_[0]->shape()[axes_[1]] >= LOCAL_CACHE_THREAD) { + use_local_ = false; + kernel_name = "Global"; + if (wc_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) { use_local_ = true; - kernel_name += "Local"; - } else { - use_local_ = false; - kernel_name += "Global"; + kernel_name = "Local"; + } + if (hw_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) { + use_local_ = true; + kernel_name = "Local"; } if (wc_reduce_) { kernel_name += "WC"; - } else { + } else if (hw_reduce_) { kernel_name += "HW"; + } else if (c_reduce_) { + kernel_name += "C"; } kernel_name += GetReduceTypeStr(reduce_param->mode_); #ifdef PROGRAM_WITH_IL @@ -150,7 +186,10 @@ int ReduceOpenCLKernel::Prepare() { std::string source = reduce_source; std::string program_name = "Reduce"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + if (ret != RET_OK) { + return ret; + } #endif SetConstArgs(); SetGlobalLocal(); @@ -158,30 +197,31 @@ int ReduceOpenCLKernel::Prepare() { return mindspore::lite::RET_OK; } void ReduceOpenCLKernel::SetConstArgs() { - std::vector shapex = in_tensors_[0]->shape(); - int h = shapex[1]; - int w = shapex[2]; - int c = shapex[3]; + int h = inShape.H; + int w = inShape.W; + int c = inShape.C; int c4 = UP_DIV(c, C4NUM); cl_int4 size = {h, w, c4, c}; int arg_idx = 2; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); - if (wc_reduce_) { + if (wc_reduce_ || c_reduce_) { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()); } } void ReduceOpenCLKernel::SetGlobalLocal() { - std::vector shapex = in_tensors_[0]->shape(); - int h = shapex[1]; - int c = shapex[3]; - int c4 = UP_DIV(c, C4NUM); + int h = inShape.H; + int w = inShape.W; + int c4 = inShape.Slice; local_size_ = {}; if (use_local_) { local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; } - global_size_ = {static_cast(c4), 1, 1}; - if (wc_reduce_) { + if (hw_reduce_) { + global_size_ = {static_cast(c4), 1, 1}; + } else if (wc_reduce_) { global_size_ = {static_cast(h), 1, 1}; + } else if (c_reduce_ && !use_local_) { + global_size_ = {static_cast(h), static_cast(w)}; } AlignGlobalLocal(global_size_, local_size_); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h index 3a1e97d1e7..359fd4d4f0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h @@ -37,11 +37,15 @@ class ReduceOpenCLKernel : public OpenCLKernel { int Tune() override; private: + int SetAxes(); cl_float4 GenC4Mask(); static std::string GetReduceTypeStr(int type); - GpuTensorInfo outShape; + GpuTensorInfo inShape; bool use_local_{false}; bool wc_reduce_{false}; + bool hw_reduce_{false}; + bool c_reduce_{false}; + bool reduce_axes_[4]{false}; static const size_t LOCAL_CACHE_THREAD{16}; int axes_[MAX_SHAPE_SIZE]; }; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc index 0831f160b1..f0b7d49e11 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc @@ -46,7 +46,9 @@ TEST_F(TestOpenCL_Reduce, Mean) { for (auto fp16_enable : {false, true}) { auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, false); - TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable); + TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}, + {{static_cast(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}}, + {output_shape, output_data}, param, fp16_enable); } } @@ -59,7 +61,9 @@ TEST_F(TestOpenCL_Reduce, Sum) { for (auto fp16_enable : {false, true}) { auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, false); - TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable); + TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}, + {{static_cast(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}}, + {output_shape, output_data}, param, fp16_enable); } } @@ -72,7 +76,9 @@ TEST_F(TestOpenCL_Reduce, MeanWC) { for (auto fp16_enable : {false, true}) { auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true); - TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable); + TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}, + {{static_cast(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}}, + {output_shape, output_data}, param, fp16_enable); } } @@ -85,8 +91,24 @@ TEST_F(TestOpenCL_Reduce, SumWC) { for (auto fp16_enable : {false, true}) { auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, true); - TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable); + TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}, + {{static_cast(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}}, + {output_shape, output_data}, param, fp16_enable); } } +TEST_F(TestOpenCL_Reduce, MeanC) { + std::vector axis = {3}; + std::vector input_shape = {1, 3, 2, 2}; + std::vector output_shape = {1, 3, 2, 1}; + float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + float output_data[] = {0.5, 2.5, 4.5, 6.5, 8.5, 10.5}; + + for (auto fp16_enable : {false, true}) { + auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true); + TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}, + {{static_cast(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}}, + {output_shape, output_data}, param, fp16_enable); + } +} } // namespace mindspore::lite::opencl::test