Browse Source

!13407 [MS][LITE][GPU]reduce support axis=3

From: @chenzupeng
Reviewed-by: @ddwsky,@zhanghaibo5
Signed-off-by: @ddwsky
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 5 years ago
parent
commit
01203b3d96
5 changed files with 136 additions and 53 deletions
  1. +17
    -1
      mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
  2. +2
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
  3. +86
    -46
      mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
  4. +5
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
  5. +26
    -4
      mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc

+ 17
- 1
mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl View File

@@ -183,6 +183,23 @@ __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only imag
WRITE_IMAGE(dst_data, (int2)(0, X), result2);
}

// Mean-reduction over the channel (C) axis: one output pixel per (H, W)
// position. Each work-item sums every C4 slice at its location, divides by
// the real channel count, then collapses the four float4 lanes with a dot
// product so the scalar mean lands in component .x of the output pixel.
//
// size.x = H, size.y = W, size.z = number of C4 slices, size.w = real C.
__kernel void GlobalCMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, float4 mask) {
  int X = get_global_id(0);  // H
  int Y = get_global_id(1);  // W
  if (X >= size.x || Y >= size.y) {
    return;
  }
  float4 result = (float4)0.f;
  // Accumulate all C4 slices for this (H, W) position; slices are laid out
  // along the image x-axis at column Y * size.z + c.
  for (int c = 0; c < size.z; c++) {
    result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + c, X)));
  }

  // Divide by the true channel count (size.w), not slices * 4, so padding
  // lanes do not distort the denominator.
  result /= size.w;
  FLT4 result2 = (FLT4)(0.f);
  // Fold the four lanes into a single scalar via dot with (1,1,1,1).
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  // NOTE(review): `mask` is accepted (the host passes GenC4Mask for channel
  // reduction) but never applied here — presumably the padded lanes of the
  // last C4 slice are assumed to hold zeros. Confirm producer kernels
  // zero-fill that padding, otherwise the sum is polluted.
  WRITE_IMAGE(dst_data, (int2)(Y, X), result2);
}

#define GlobalHW(Method) \
__kernel void GlobalHW##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { \
int X = get_global_id(0); \
@@ -297,7 +314,6 @@ __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only imag
#define DoSum(A, B) A += B
#define InitSum 0.f
GlobalHW(Sum) GlobalWC(Sum) LocalHW(Sum) LocalWC(Sum)

#define DoMin(A, B) A = min(A, B)
#define InitMin 10000.f
GlobalHW(Min) GlobalWC(Min) LocalHW(Min) LocalWC(Min)


+ 2
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc View File

@@ -38,7 +38,8 @@ int ArgMinMaxOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
if ((in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) ||
(out_tensors_[0]->data_type() != kNumberTypeFloat32 && out_tensors_[0]->data_type() != kNumberTypeFloat16)) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}


+ 86
- 46
mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc View File

@@ -54,8 +54,8 @@ std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {

cl_float4 ReduceOpenCLKernel::GenC4Mask() {
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
int last_c4 = in_tensors_[0]->shape()[3] % C4NUM;
last_c4 = (C4NUM - last_c4) % 4;
int last_c4 = inShape.C % C4NUM;
if (last_c4 == 0) last_c4 = C4NUM;
static const std::map<int, float> reduce_type2init{
{ReduceMode_ReduceMean, 0.f}, {ReduceMode_ReduceSum, 0.f}, {ReduceMode_ReduceMin, 10000.f},
{ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
@@ -67,23 +67,19 @@ cl_float4 ReduceOpenCLKernel::GenC4Mask() {
return mask;
}

bool hw_reduce(const int *axes_) { return (axes_[0] == 1 && axes_[1] == 2) || (axes_[0] == 2 && axes_[1] == 1); }
// True when exactly the H (index 1) and W (index 2) axes are reduced,
// while N (index 0) and C (index 3) are kept.
bool IsHWReduce(bool *reduce_axes_) {
  const bool n_kept = !reduce_axes_[0];
  const bool c_kept = !reduce_axes_[3];
  return n_kept && reduce_axes_[1] && reduce_axes_[2] && c_kept;
}

int ReduceOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "reduce op only support n = 1";
return RET_PARAM_INVALID;
}
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (GetReduceTypeStr(reduce_param->mode_).empty()) {
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
return RET_PARAM_INVALID;
}
// True when exactly the W (index 2) and C (index 3) axes are reduced,
// while N (index 0) and H (index 1) are kept.
bool IsWCReduce(bool *reduce_axes_) {
  if (reduce_axes_[0] || reduce_axes_[1]) {
    return false;
  }
  return reduce_axes_[2] && reduce_axes_[3];
}

// True when only the C (index 3) axis is reduced and N/H/W are all kept.
bool IsCReduce(bool *reduce_axes_) {
  bool only_c = reduce_axes_[3];
  for (int i = 0; i < 3; ++i) {
    only_c = only_c && !reduce_axes_[i];
  }
  return only_c;
}

int ReduceOpenCLKernel::SetAxes() {
// axes is input tensor
// get num_axes
int num_axes = 0;
@@ -102,21 +98,57 @@ int ReduceOpenCLKernel::CheckSpecs() {
for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) {
axes_[i] = reinterpret_cast<int *>(axes_tensor->data_c())[i];
}
if (num_axes == 1 && axes_[0] == 3 && in_tensors_[0]->shape()[2] == 1) {
num_axes = 2;
axes_[1] = 2;
}
if (num_axes != 2) {
MS_LOG(ERROR) << "reduce op only support num_axes=2";
if (num_axes > 2 || num_axes < 1) {
MS_LOG(ERROR) << "Unsupported reduce num axes " << num_axes;
return RET_PARAM_INVALID;
}

wc_reduce_ = (axes_[0] == 2 && axes_[1] == 3) || (axes_[0] == 3 && axes_[1] == 2);
if (!hw_reduce(axes_) && !wc_reduce_) {
MS_LOG(ERROR) << "reduce op only support axis (1,2) or (2,3)";
for (int i = 0; i < num_axes; i++) {
int axis = axes_[i];
axis = inShape.AlignAxis(axis);
reduce_axes_[axis] = true;
}
if (num_axes == 1) {
if (reduce_axes_[1] && inShape.W == 1) {
reduce_axes_[2] = true;
} else if (reduce_axes_[2]) {
if (inShape.H == 1) {
reduce_axes_[1] = true;
} else if (inShape.C == 1) {
reduce_axes_[3] = true;
}
} else if (reduce_axes_[3] && inShape.W == 1) {
reduce_axes_[3] = true;
}
}
return RET_OK;
}

int ReduceOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "reduce op only support n = 1";
return RET_PARAM_INVALID;
}
inShape = GpuTensorInfo(in_tensors_[0]);
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (GetReduceTypeStr(reduce_param->mode_).empty()) {
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
return RET_PARAM_INVALID;
}
auto ret = SetAxes();
if (ret != RET_OK) return ret;
hw_reduce_ = IsHWReduce(reduce_axes_);
wc_reduce_ = IsWCReduce(reduce_axes_);
c_reduce_ = IsCReduce(reduce_axes_);
if (!hw_reduce_ && !wc_reduce_ && !c_reduce_) {
MS_LOG(ERROR) << "Unsupported reduce axes";
return RET_PARAM_INVALID;
}
if (wc_reduce_ && reduce_param->keep_dims_ == false) {
if ((c_reduce_ || wc_reduce_) && reduce_param->keep_dims_ == false) {
MS_LOG(ERROR) << "reduce axis (2,3) should keep dims";
return RET_PARAM_INVALID;
}
@@ -130,18 +162,22 @@ int ReduceOpenCLKernel::Prepare() {
}

std::string kernel_name;
if (in_tensors_[0]->shape()[axes_[0]] >= LOCAL_CACHE_THREAD ||
in_tensors_[0]->shape()[axes_[1]] >= LOCAL_CACHE_THREAD) {
use_local_ = false;
kernel_name = "Global";
if (wc_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) {
use_local_ = true;
kernel_name += "Local";
} else {
use_local_ = false;
kernel_name += "Global";
kernel_name = "Local";
}
if (hw_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) {
use_local_ = true;
kernel_name = "Local";
}
if (wc_reduce_) {
kernel_name += "WC";
} else {
} else if (hw_reduce_) {
kernel_name += "HW";
} else if (c_reduce_) {
kernel_name += "C";
}
kernel_name += GetReduceTypeStr(reduce_param->mode_);
#ifdef PROGRAM_WITH_IL
@@ -150,7 +186,10 @@ int ReduceOpenCLKernel::Prepare() {
std::string source = reduce_source;
std::string program_name = "Reduce";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
if (ret != RET_OK) {
return ret;
}
#endif
SetConstArgs();
SetGlobalLocal();
@@ -158,30 +197,31 @@ int ReduceOpenCLKernel::Prepare() {
return mindspore::lite::RET_OK;
}
void ReduceOpenCLKernel::SetConstArgs() {
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
int h = inShape.H;
int w = inShape.W;
int c = inShape.C;
int c4 = UP_DIV(c, C4NUM);
cl_int4 size = {h, w, c4, c};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
if (wc_reduce_) {
if (wc_reduce_ || c_reduce_) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask());
}
}
void ReduceOpenCLKernel::SetGlobalLocal() {
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int c = shapex[3];
int c4 = UP_DIV(c, C4NUM);
int h = inShape.H;
int w = inShape.W;
int c4 = inShape.Slice;
local_size_ = {};
if (use_local_) {
local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
}
global_size_ = {static_cast<size_t>(c4), 1, 1};
if (wc_reduce_) {
if (hw_reduce_) {
global_size_ = {static_cast<size_t>(c4), 1, 1};
} else if (wc_reduce_) {
global_size_ = {static_cast<size_t>(h), 1, 1};
} else if (c_reduce_ && !use_local_) {
global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)};
}
AlignGlobalLocal(global_size_, local_size_);
}


+ 5
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h View File

@@ -37,11 +37,15 @@ class ReduceOpenCLKernel : public OpenCLKernel {
int Tune() override;

private:
int SetAxes();
cl_float4 GenC4Mask();
static std::string GetReduceTypeStr(int type);
GpuTensorInfo outShape;
GpuTensorInfo inShape;
bool use_local_{false};
bool wc_reduce_{false};
bool hw_reduce_{false};
bool c_reduce_{false};
bool reduce_axes_[4]{false};
static const size_t LOCAL_CACHE_THREAD{16};
int axes_[MAX_SHAPE_SIZE];
};


+ 26
- 4
mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc View File

@@ -46,7 +46,9 @@ TEST_F(TestOpenCL_Reduce, Mean) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, false);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

@@ -59,7 +61,9 @@ TEST_F(TestOpenCL_Reduce, Sum) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, false);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

@@ -72,7 +76,9 @@ TEST_F(TestOpenCL_Reduce, MeanWC) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

@@ -85,8 +91,24 @@ TEST_F(TestOpenCL_Reduce, SumWC) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, true);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

// Channel-axis (axis = 3) mean reduction with keep_dims = true:
// input NHWC (1, 3, 2, 2) -> output (1, 3, 2, 1).
TEST_F(TestOpenCL_Reduce, MeanC) {
  std::vector<int> axis{3};
  std::vector<int> input_shape{1, 3, 2, 2};
  std::vector<int> output_shape{1, 3, 2, 1};
  // Each consecutive channel pair averages to a .5 value: (0+1)/2, (2+3)/2, ...
  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  float output_data[] = {0.5, 2.5, 4.5, 6.5, 8.5, 10.5};

  // Exercise both the fp32 and fp16 kernel variants.
  for (bool fp16_enable : {false, true}) {
    auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true);
    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
              {{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
             {output_shape, output_data}, param, fp16_enable);
  }
}
} // namespace mindspore::lite::opencl::test

Loading…
Cancel
Save