Browse Source

!13407 [MS][LITE][GPU]reduce support axis=3

From: @chenzupeng
Reviewed-by: @ddwsky,@zhanghaibo5
Signed-off-by: @ddwsky
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 5 years ago
parent
commit
01203b3d96
5 changed files with 136 additions and 53 deletions
  1. +17
    -1
      mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
  2. +2
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc
  3. +86
    -46
      mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
  4. +5
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
  5. +26
    -4
      mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc

+ 17
- 1
mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl View File

@@ -183,6 +183,23 @@ __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only imag
WRITE_IMAGE(dst_data, (int2)(0, X), result2);
}

// Mean-reduction over the channel (C) axis: one output pixel per (H, W)
// position. Each work-item sums every C4 slice at its location, divides by
// the real channel count, then collapses the four float4 lanes with a dot
// product so the scalar mean lands in component .x of the output pixel.
//
// size.x = H, size.y = W, size.z = number of C4 slices, size.w = real C.
__kernel void GlobalCMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, float4 mask) {
  int X = get_global_id(0);  // H
  int Y = get_global_id(1);  // W
  if (X >= size.x || Y >= size.y) {
    return;
  }
  float4 result = (float4)0.f;
  // Accumulate all C4 slices for this (H, W) position; slices are laid out
  // along the image x-axis at column Y * size.z + c.
  for (int c = 0; c < size.z; c++) {
    result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + c, X)));
  }

  // Divide by the true channel count (size.w), not slices * 4, so padding
  // lanes do not distort the denominator.
  result /= size.w;
  FLT4 result2 = (FLT4)(0.f);
  // Fold the four lanes into a single scalar via dot with (1,1,1,1).
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  // NOTE(review): `mask` is accepted (the host passes GenC4Mask for channel
  // reduction) but never applied here — presumably the padded lanes of the
  // last C4 slice are assumed to hold zeros. Confirm producer kernels
  // zero-fill that padding, otherwise the sum is polluted.
  WRITE_IMAGE(dst_data, (int2)(Y, X), result2);
}

#define GlobalHW(Method) \
__kernel void GlobalHW##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { \
int X = get_global_id(0); \
@@ -297,7 +314,6 @@ __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only imag
#define DoSum(A, B) A += B
#define InitSum 0.f
GlobalHW(Sum) GlobalWC(Sum) LocalHW(Sum) LocalWC(Sum)

#define DoMin(A, B) A = min(A, B)
#define InitMin 10000.f
GlobalHW(Min) GlobalWC(Min) LocalHW(Min) LocalWC(Min)


+ 2
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc View File

@@ -38,7 +38,8 @@ int ArgMinMaxOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
if ((in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) ||
(out_tensors_[0]->data_type() != kNumberTypeFloat32 && out_tensors_[0]->data_type() != kNumberTypeFloat16)) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}


+ 86
- 46
mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc View File

@@ -54,8 +54,8 @@ std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {

cl_float4 ReduceOpenCLKernel::GenC4Mask() {
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
int last_c4 = in_tensors_[0]->shape()[3] % C4NUM;
last_c4 = (C4NUM - last_c4) % 4;
int last_c4 = inShape.C % C4NUM;
if (last_c4 == 0) last_c4 = C4NUM;
static const std::map<int, float> reduce_type2init{
{ReduceMode_ReduceMean, 0.f}, {ReduceMode_ReduceSum, 0.f}, {ReduceMode_ReduceMin, 10000.f},
{ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
@@ -67,23 +67,19 @@ cl_float4 ReduceOpenCLKernel::GenC4Mask() {
return mask;
}

bool hw_reduce(const int *axes_) { return (axes_[0] == 1 && axes_[1] == 2) || (axes_[0] == 2 && axes_[1] == 1); }
// True when exactly the H (index 1) and W (index 2) axes are reduced,
// while N (index 0) and C (index 3) are kept.
bool IsHWReduce(bool *reduce_axes_) {
  const bool n_kept = !reduce_axes_[0];
  const bool c_kept = !reduce_axes_[3];
  return n_kept && reduce_axes_[1] && reduce_axes_[2] && c_kept;
}

int ReduceOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "reduce op only support n = 1";
return RET_PARAM_INVALID;
}
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (GetReduceTypeStr(reduce_param->mode_).empty()) {
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
return RET_PARAM_INVALID;
}
// True when exactly the W (index 2) and C (index 3) axes are reduced,
// while N (index 0) and H (index 1) are kept.
bool IsWCReduce(bool *reduce_axes_) {
  if (reduce_axes_[0] || reduce_axes_[1]) {
    return false;
  }
  return reduce_axes_[2] && reduce_axes_[3];
}

// True when only the C (index 3) axis is reduced and N/H/W are all kept.
bool IsCReduce(bool *reduce_axes_) {
  bool only_c = reduce_axes_[3];
  for (int i = 0; i < 3; ++i) {
    only_c = only_c && !reduce_axes_[i];
  }
  return only_c;
}

int ReduceOpenCLKernel::SetAxes() {
// axes is input tensor
// get num_axes
int num_axes = 0;
@@ -102,21 +98,57 @@ int ReduceOpenCLKernel::CheckSpecs() {
for (int i = 0; i < std::min(num_axes, MAX_SHAPE_SIZE); ++i) {
axes_[i] = reinterpret_cast<int *>(axes_tensor->data_c())[i];
}
if (num_axes == 1 && axes_[0] == 3 && in_tensors_[0]->shape()[2] == 1) {
num_axes = 2;
axes_[1] = 2;
}
if (num_axes != 2) {
MS_LOG(ERROR) << "reduce op only support num_axes=2";
if (num_axes > 2 || num_axes < 1) {
MS_LOG(ERROR) << "Unsupported reduce num axes " << num_axes;
return RET_PARAM_INVALID;
}

wc_reduce_ = (axes_[0] == 2 && axes_[1] == 3) || (axes_[0] == 3 && axes_[1] == 2);
if (!hw_reduce(axes_) && !wc_reduce_) {
MS_LOG(ERROR) << "reduce op only support axis (1,2) or (2,3)";
for (int i = 0; i < num_axes; i++) {
int axis = axes_[i];
axis = inShape.AlignAxis(axis);
reduce_axes_[axis] = true;
}
if (num_axes == 1) {
if (reduce_axes_[1] && inShape.W == 1) {
reduce_axes_[2] = true;
} else if (reduce_axes_[2]) {
if (inShape.H == 1) {
reduce_axes_[1] = true;
} else if (inShape.C == 1) {
reduce_axes_[3] = true;
}
} else if (reduce_axes_[3] && inShape.W == 1) {
reduce_axes_[3] = true;
}
}
return RET_OK;
}

int ReduceOpenCLKernel::CheckSpecs() {
if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
return RET_ERROR;
}
if (in_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "reduce op only support n = 1";
return RET_PARAM_INVALID;
}
inShape = GpuTensorInfo(in_tensors_[0]);
auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
if (GetReduceTypeStr(reduce_param->mode_).empty()) {
MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
return RET_PARAM_INVALID;
}
auto ret = SetAxes();
if (ret != RET_OK) return ret;
hw_reduce_ = IsHWReduce(reduce_axes_);
wc_reduce_ = IsWCReduce(reduce_axes_);
c_reduce_ = IsCReduce(reduce_axes_);
if (!hw_reduce_ && !wc_reduce_ && !c_reduce_) {
MS_LOG(ERROR) << "Unsupported reduce axes";
return RET_PARAM_INVALID;
}
if (wc_reduce_ && reduce_param->keep_dims_ == false) {
if ((c_reduce_ || wc_reduce_) && reduce_param->keep_dims_ == false) {
MS_LOG(ERROR) << "reduce axis (2,3) should keep dims";
return RET_PARAM_INVALID;
}
@@ -130,18 +162,22 @@ int ReduceOpenCLKernel::Prepare() {
}

std::string kernel_name;
if (in_tensors_[0]->shape()[axes_[0]] >= LOCAL_CACHE_THREAD ||
in_tensors_[0]->shape()[axes_[1]] >= LOCAL_CACHE_THREAD) {
use_local_ = false;
kernel_name = "Global";
if (wc_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.C >= LOCAL_CACHE_THREAD)) {
use_local_ = true;
kernel_name += "Local";
} else {
use_local_ = false;
kernel_name += "Global";
kernel_name = "Local";
}
if (hw_reduce_ && (inShape.W >= LOCAL_CACHE_THREAD || inShape.H >= LOCAL_CACHE_THREAD)) {
use_local_ = true;
kernel_name = "Local";
}
if (wc_reduce_) {
kernel_name += "WC";
} else {
} else if (hw_reduce_) {
kernel_name += "HW";
} else if (c_reduce_) {
kernel_name += "C";
}
kernel_name += GetReduceTypeStr(reduce_param->mode_);
#ifdef PROGRAM_WITH_IL
@@ -150,7 +186,10 @@ int ReduceOpenCLKernel::Prepare() {
std::string source = reduce_source;
std::string program_name = "Reduce";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
if (ret != RET_OK) {
return ret;
}
#endif
SetConstArgs();
SetGlobalLocal();
@@ -158,30 +197,31 @@ int ReduceOpenCLKernel::Prepare() {
return mindspore::lite::RET_OK;
}
void ReduceOpenCLKernel::SetConstArgs() {
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int w = shapex[2];
int c = shapex[3];
int h = inShape.H;
int w = inShape.W;
int c = inShape.C;
int c4 = UP_DIV(c, C4NUM);
cl_int4 size = {h, w, c4, c};
int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
if (wc_reduce_) {
if (wc_reduce_ || c_reduce_) {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask());
}
}
void ReduceOpenCLKernel::SetGlobalLocal() {
std::vector<int> shapex = in_tensors_[0]->shape();
int h = shapex[1];
int c = shapex[3];
int c4 = UP_DIV(c, C4NUM);
int h = inShape.H;
int w = inShape.W;
int c4 = inShape.Slice;
local_size_ = {};
if (use_local_) {
local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD};
}
global_size_ = {static_cast<size_t>(c4), 1, 1};
if (wc_reduce_) {
if (hw_reduce_) {
global_size_ = {static_cast<size_t>(c4), 1, 1};
} else if (wc_reduce_) {
global_size_ = {static_cast<size_t>(h), 1, 1};
} else if (c_reduce_ && !use_local_) {
global_size_ = {static_cast<size_t>(h), static_cast<size_t>(w)};
}
AlignGlobalLocal(global_size_, local_size_);
}


+ 5
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h View File

@@ -37,11 +37,15 @@ class ReduceOpenCLKernel : public OpenCLKernel {
int Tune() override;

private:
int SetAxes();
cl_float4 GenC4Mask();
static std::string GetReduceTypeStr(int type);
GpuTensorInfo outShape;
GpuTensorInfo inShape;
bool use_local_{false};
bool wc_reduce_{false};
bool hw_reduce_{false};
bool c_reduce_{false};
bool reduce_axes_[4]{false};
static const size_t LOCAL_CACHE_THREAD{16};
int axes_[MAX_SHAPE_SIZE];
};


+ 26
- 4
mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc View File

@@ -46,7 +46,9 @@ TEST_F(TestOpenCL_Reduce, Mean) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, false);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

@@ -59,7 +61,9 @@ TEST_F(TestOpenCL_Reduce, Sum) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, false);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

@@ -72,7 +76,9 @@ TEST_F(TestOpenCL_Reduce, MeanWC) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

@@ -85,8 +91,24 @@ TEST_F(TestOpenCL_Reduce, SumWC) {

for (auto fp16_enable : {false, true}) {
auto *param = CreateParameter(axis, schema::ReduceMode_ReduceSum, true);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32}}, {output_shape, output_data}, param, fp16_enable);
TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
{{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
{output_shape, output_data}, param, fp16_enable);
}
}

// Channel-axis (axis = 3) mean reduction with keep_dims = true:
// input NHWC (1, 3, 2, 2) -> output (1, 3, 2, 1).
TEST_F(TestOpenCL_Reduce, MeanC) {
  std::vector<int> axis{3};
  std::vector<int> input_shape{1, 3, 2, 2};
  std::vector<int> output_shape{1, 3, 2, 1};
  // Each consecutive channel pair averages to a .5 value: (0+1)/2, (2+3)/2, ...
  float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  float output_data[] = {0.5, 2.5, 4.5, 6.5, 8.5, 10.5};

  // Exercise both the fp32 and fp16 kernel variants.
  for (bool fp16_enable : {false, true}) {
    auto *param = CreateParameter(axis, schema::ReduceMode_ReduceMean, true);
    TestMain({{input_shape, input_data, VAR, kNumberTypeFloat32},
              {{static_cast<int>(axis.size())}, axis.data(), CONST_TENSOR, kNumberTypeInt32}},
             {output_shape, output_data}, param, fp16_enable);
  }
}
} // namespace mindspore::lite::opencl::test

Loading…
Cancel
Save