|
|
|
@@ -54,8 +54,8 @@ std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) { |
|
|
|
|
|
|
|
cl_float4 ReduceOpenCLKernel::GenC4Mask() { |
|
|
|
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); |
|
|
|
int last_c4 = in_tensors_[0]->shape()[3] % C4NUM; |
|
|
|
last_c4 = (C4NUM - last_c4) % 4; |
|
|
|
int last_c4 = inShape.C % C4NUM; |
|
|
|
if (last_c4 == 0) last_c4 = C4NUM; |
|
|
|
static const std::map<int, float> reduce_type2init{ |
|
|
|
{ReduceMode_ReduceMean, 0.f}, {ReduceMode_ReduceSum, 0.f}, {ReduceMode_ReduceMin, 10000.f}, |
|
|
|
{ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}}; |
|
|
|
@@ -67,23 +67,19 @@ cl_float4 ReduceOpenCLKernel::GenC4Mask() { |
|
|
|
return mask; |
|
|
|
} |
|
|
|
|
|
|
|
bool hw_reduce(const int *axes_) { return (axes_[0] == 1 && axes_[1] == 2) || (axes_[0] == 2 && axes_[1] == 1); } |
|
|
|
// Returns true iff exactly the H and W axes are being reduced.
// reduce_axes_ is a 4-element flag array, presumably indexed [N, H, W, C]
// (axis 0 is batch, per the n==1 check in CheckSpecs) — TODO confirm layout.
// The array is read-only here, so take it as const.
bool IsHWReduce(const bool *reduce_axes_) {
  return !reduce_axes_[0] && reduce_axes_[1] && reduce_axes_[2] && !reduce_axes_[3];
}
|
|
|
|
|
|
|
int ReduceOpenCLKernel::CheckSpecs() { |
|
|
|
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { |
|
|
|
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
if (in_tensors_[0]->shape()[0] > 1) { |
|
|
|
MS_LOG(ERROR) << "reduce op only support n = 1"; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); |
|
|
|
if (GetReduceTypeStr(reduce_param->mode_).empty()) { |
|
|
|
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
// Returns true iff exactly the W and C axes are being reduced.
// reduce_axes_ is a 4-element flag array, presumably indexed [N, H, W, C]
// (axis 0 is batch, per the n==1 check in CheckSpecs) — TODO confirm layout.
// The array is read-only here, so take it as const.
bool IsWCReduce(const bool *reduce_axes_) {
  return !reduce_axes_[0] && !reduce_axes_[1] && reduce_axes_[2] && reduce_axes_[3];
}
|
|
|
|
|
|
|
// Returns true iff only the C axis is being reduced.
// reduce_axes_ is a 4-element flag array, presumably indexed [N, H, W, C]
// (axis 0 is batch, per the n==1 check in CheckSpecs) — TODO confirm layout.
// The array is read-only here, so take it as const.
bool IsCReduce(const bool *reduce_axes_) {
  return !reduce_axes_[0] && !reduce_axes_[1] && !reduce_axes_[2] && reduce_axes_[3];
}
|
|
|
|
|
|
|
int ReduceOpenCLKernel::SetAxes() { |
|
|
|
// axes is input tensor |
|
|
|
// get num_axes |
|
|
|
int num_axes = 0; |
|
|
|
@@ -102,21 +98,57 @@ int ReduceOpenCLKernel::CheckSpecs() { |
|
|
|
for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) { |
|
|
|
axes_[i] = reinterpret_cast<int *>(axes_tensor->data_c())[i]; |
|
|
|
} |
|
|
|
if (num_axes == 1 && axes_[0] == 3 && in_tensors_[0]->shape()[2] == 1) { |
|
|
|
num_axes = 2; |
|
|
|
axes_[1] = 2; |
|
|
|
} |
|
|
|
if (num_axes != 2) { |
|
|
|
MS_LOG(ERROR) << "reduce op only support num_axes=2"; |
|
|
|
if (num_axes > 2 || num_axes < 1) { |
|
|
|
MS_LOG(ERROR) << "Unsupported reduce num axes " << num_axes; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
|
|
|
|
wc_reduce_ = (axes_[0] == 2 && axes_[1] == 3) || (axes_[0] == 3 && axes_[1] == 2); |
|
|
|
if (!hw_reduce(axes_) && !wc_reduce_) { |
|
|
|
MS_LOG(ERROR) << "reduce op only support axis (1,2) or (2,3)"; |
|
|
|
for (int i = 0; i < num_axes; i++) { |
|
|
|
int axis = axes_[i]; |
|
|
|
axis = inShape.AlignAxis(axis); |
|
|
|
reduce_axes_[axis] = true; |
|
|
|
} |
|
|
|
if (num_axes == 1) { |
|
|
|
if (reduce_axes_[1] && inShape.W == 1) { |
|
|
|
reduce_axes_[2] = true; |
|
|
|
} else if (reduce_axes_[2]) { |
|
|
|
if (inShape.H == 1) { |
|
|
|
reduce_axes_[1] = true; |
|
|
|
} else if (inShape.C == 1) { |
|
|
|
reduce_axes_[3] = true; |
|
|
|
} |
|
|
|
} else if (reduce_axes_[3] && inShape.W == 1) { |
|
|
|
reduce_axes_[3] = true; |
|
|
|
} |
|
|
|
} |
|
|
|
return RET_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ReduceOpenCLKernel::CheckSpecs() { |
|
|
|
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { |
|
|
|
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); |
|
|
|
return RET_ERROR; |
|
|
|
} |
|
|
|
if (in_tensors_[0]->shape()[0] > 1) { |
|
|
|
MS_LOG(ERROR) << "reduce op only support n = 1"; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
inShape = GpuTensorInfo(in_tensors_[0]); |
|
|
|
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); |
|
|
|
if (GetReduceTypeStr(reduce_param->mode_).empty()) { |
|
|
|
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
auto ret = SetAxes(); |
|
|
|
if (ret != RET_OK) return ret; |
|
|
|
hw_reduce_ = IsHWReduce(reduce_axes_); |
|
|
|
wc_reduce_ = IsWCReduce(reduce_axes_); |
|
|
|
c_reduce_ = IsCReduce(reduce_axes_); |
|
|
|
if (!hw_reduce_ && !wc_reduce_ && !c_reduce_) { |
|
|
|
MS_LOG(ERROR) << "Unsupported reduce axes"; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
if (wc_reduce_ && reduce_param->keep_dims_ == false) { |
|
|
|
if ((c_reduce_ || wc_reduce_) && reduce_param->keep_dims_ == false) { |
|
|
|
MS_LOG(ERROR) << "reduce axis (2,3) should keep dims"; |
|
|
|
return RET_PARAM_INVALID; |
|
|
|
} |
|
|
|
@@ -130,18 +162,22 @@ int ReduceOpenCLKernel::Prepare() { |
|
|
|
} |
|
|
|
|
|
|
|
std::string kernel_name; |
|
|
|
if (in_tensors_[0]->shape()[axes_[0]] >= LOCAL_CACHE_THREAD || |
|
|
|
in_tensors_[0]->shape()[axes_[1]] >= LOCAL_CACHE_THREAD) { |
|
|
|
use_local_ = false; |
|
|
|
kernel_name = "Global"; |
|
|
|
if (wc_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) { |
|
|
|
use_local_ = true; |
|
|
|
kernel_name += "Local"; |
|
|
|
} else { |
|
|
|
use_local_ = false; |
|
|
|
kernel_name += "Global"; |
|
|
|
kernel_name = "Local"; |
|
|
|
} |
|
|
|
if (hw_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) { |
|
|
|
use_local_ = true; |
|
|
|
kernel_name = "Local"; |
|
|
|
} |
|
|
|
if (wc_reduce_) { |
|
|
|
kernel_name += "WC"; |
|
|
|
} else { |
|
|
|
} else if (hw_reduce_) { |
|
|
|
kernel_name += "HW"; |
|
|
|
} else if (c_reduce_) { |
|
|
|
kernel_name += "C"; |
|
|
|
} |
|
|
|
kernel_name += GetReduceTypeStr(reduce_param->mode_); |
|
|
|
#ifdef PROGRAM_WITH_IL |
|
|
|
@@ -150,7 +186,10 @@ int ReduceOpenCLKernel::Prepare() { |
|
|
|
std::string source = reduce_source; |
|
|
|
std::string program_name = "Reduce"; |
|
|
|
ocl_runtime_->LoadSource(program_name, source); |
|
|
|
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); |
|
|
|
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); |
|
|
|
if (ret != RET_OK) { |
|
|
|
return ret; |
|
|
|
} |
|
|
|
#endif |
|
|
|
SetConstArgs(); |
|
|
|
SetGlobalLocal(); |
|
|
|
@@ -158,30 +197,31 @@ int ReduceOpenCLKernel::Prepare() { |
|
|
|
return mindspore::lite::RET_OK; |
|
|
|
} |
|
|
|
void ReduceOpenCLKernel::SetConstArgs() { |
|
|
|
std::vector<int> shapex = in_tensors_[0]->shape(); |
|
|
|
int h = shapex[1]; |
|
|
|
int w = shapex[2]; |
|
|
|
int c = shapex[3]; |
|
|
|
int h = inShape.H; |
|
|
|
int w = inShape.W; |
|
|
|
int c = inShape.C; |
|
|
|
int c4 = UP_DIV(c, C4NUM); |
|
|
|
cl_int4 size = {h, w, c4, c}; |
|
|
|
int arg_idx = 2; |
|
|
|
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); |
|
|
|
if (wc_reduce_) { |
|
|
|
if (wc_reduce_ || c_reduce_) { |
|
|
|
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask()); |
|
|
|
} |
|
|
|
} |
|
|
|
void ReduceOpenCLKernel::SetGlobalLocal() { |
|
|
|
std::vector<int> shapex = in_tensors_[0]->shape(); |
|
|
|
int h = shapex[1]; |
|
|
|
int c = shapex[3]; |
|
|
|
int c4 = UP_DIV(c, C4NUM); |
|
|
|
int h = inShape.H; |
|
|
|
int w = inShape.W; |
|
|
|
int c4 = inShape.Slice; |
|
|
|
local_size_ = {}; |
|
|
|
if (use_local_) { |
|
|
|
local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; |
|
|
|
} |
|
|
|
global_size_ = {static_cast<size_t>(c4), 1, 1}; |
|
|
|
if (wc_reduce_) { |
|
|
|
if (hw_reduce_) { |
|
|
|
global_size_ = {static_cast<size_t>(c4), 1, 1}; |
|
|
|
} else if (wc_reduce_) { |
|
|
|
global_size_ = {static_cast<size_t>(h), 1, 1}; |
|
|
|
} else if (c_reduce_ && !use_local_) { |
|
|
|
global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)}; |
|
|
|
} |
|
|
|
AlignGlobalLocal(global_size_, local_size_); |
|
|
|
} |
|
|
|
|