add op reduce min max prod sumsquare

5 years ago · 6cbe44cd9f
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reduce.cl
@@ -3,7 +3,7 @@
 #endif
 #define LOCAL_CACHE_THREAD 16
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
 __kernel void GlobalHWMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
  if (X >= size.z) {
    return;
@@ -18,7 +18,7 @@ __kernel void mean_NHWC4(__read_only image2d_t src_data, __write_only image2d_t
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }

 __kernel void mean_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
 __kernel void LocalHWMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
  int localy = get_local_id(1);
  int localz = get_local_id(2);
@@ -45,30 +45,34 @@ __kernel void mean_local_NHWC4(__read_only image2d_t src_data, __write_only imag
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
 }

 __kernel void sum_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
  if (X >= size.z) {
 __kernel void GlobalWCMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, float4 mask) {
  int X = get_global_id(0);  // H
  if (X >= size.x) {
    return;
  }
  FLT4 result = (FLT4)0.f;
  for (int h = 0; h < size.x; h++) {
    for (int w = 0; w < size.y; w++) {
      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
  float4 result = (float4)0.f;
  for (int w = 0; w < size.y; w++) {
    for (int c = 0; c < size.z; c++) {
      result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
    }
  }
  WRITE_IMAGE(dst_data, (int2)(X, 0), result);

  result /= size.y * size.w;
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
 }

 __kernel void sum_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // C4
 __kernel void LocalWCMean(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, float4 mask) {
  int X = get_global_id(0);  // H
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.z) return;
  if (X >= size.x) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (float4)0.f;
  for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
    for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));
  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
@@ -82,37 +86,38 @@ __kernel void sum_local_NHWC4(__read_only image2d_t src_data, __write_only image
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));
  result /= size.y * size.w;
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
 }

 __kernel void mean_WC_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
  if (X >= size.x) {
 __kernel void GlobalHWSumSquare(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);
  if (X >= size.z) {
    return;
  }
  float4 result = (float4)0.f;
  for (int w = 0; w < size.y; w++) {
    for (int c = 0; c < size.z; c++) {
      result += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
  FLT4 result = (FLT4)0.f;
  for (int h = 0; h < size.x; h++) {
    for (int w = 0; w < size.y; w++) {
      FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
      result += current * current;
    }
  }

  result /= size.y * size.w;
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
  WRITE_IMAGE(dst_data, (int2)(X, 0), result);
 }

 __kernel void mean_WC_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
 __kernel void LocalHWSumSquare(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.x) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (float4)0.f;
  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
  if (X >= size.z) return;
  __local FLT4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (FLT4)0.f;
  for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {
    for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {
      FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));
      temp[localy][localz] += current * current;
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
@@ -122,42 +127,44 @@ __kernel void mean_WC_local_NHWC4(__read_only image2d_t src_data, __write_only i
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  float4 result = temp[0][0];
  FLT4 result = temp[0][0];
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  result /= size.y * size.w;
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
  WRITE_IMAGE(dst_data, (int2)(X, 0), result);
 }

 __kernel void sum_WC_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
 __kernel void GlobalWCSumSquare(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size,
                                float4 mask) {
  int X = get_global_id(0);
  if (X >= size.x) {
    return;
  }
  FLT4 result = (FLT4)0.f;
  for (int w = 0; w < size.y; w++) {
    for (int c = 0; c < size.z; c++) {
      result += READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));
      FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));
      result += current * current;
    }
  }

  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  result2.x = dot(result, (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
 }

 __kernel void sum_WC_local_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) {
  int X = get_global_id(0);  // H
 __kernel void LocalWCSumSquare(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size,
                               float4 mask) {
  int X = get_global_id(0);
  int localy = get_local_id(1);
  int localz = get_local_id(2);
  if (X >= size.x) return;
  __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (float4)0.f;
  __local FLT4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];
  temp[localy][localz] = (FLT4)0.f;
  for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {
    for (int c = localz; c < size.z; c += LOCAL_CACHE_THREAD) {
      temp[localy][localz] += convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));
      FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));
      temp[localy][localz] += current * current;
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
@@ -167,11 +174,150 @@ __kernel void sum_WC_local_NHWC4(__read_only image2d_t src_data, __write_only im
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  float4 result = temp[0][0];
  FLT4 result = temp[0][0];
  for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {
    result += temp[i][0];
  }
  FLT4 result2 = (FLT4)(0.f);
  result2.x = dot(TO_FLT4(result), (FLT4)(1.f));
  result2.x = dot(result, (FLT4)(1.f));
  WRITE_IMAGE(dst_data, (int2)(0, X), result2);
 }

 #define GlobalHW(Method)                                                                                       \
  __kernel void GlobalHW##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { \
    int X = get_global_id(0);                                                                                  \
    if (X >= size.z) {                                                                                         \
      return;                                                                                                  \
    }                                                                                                          \
    FLT4 result = (FLT4)Init##Method;                                                                          \
    for (int h = 0; h < size.x; h++) {                                                                         \
      for (int w = 0; w < size.y; w++) {                                                                       \
        FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h));                              \
        Do##Method(result, current);                                                                           \
      }                                                                                                        \
    }                                                                                                          \
    WRITE_IMAGE(dst_data, (int2)(X, 0), result);                                                               \
  }

 #define GlobalWC(Method)                                                                                     \
  __kernel void GlobalWC##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, \
                                 float4 mask) {                                                              \
    int X = get_global_id(0);                                                                                \
    if (X >= size.x) {                                                                                       \
      return;                                                                                                \
    }                                                                                                        \
    FLT4 result = (FLT4)Init##Method;                                                                        \
    FLT4 maskFLT4 = TO_FLT4(mask);                                                                           \
    for (int w = 0; w < size.y; w++) {                                                                       \
      int c = 0;                                                                                             \
      for (; c < size.z - 1; c++) {                                                                          \
        FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));                            \
        Do##Method(result, current);                                                                         \
      }                                                                                                      \
      FLT4 current = READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X));                              \
      current += maskFLT4;                                                                                   \
      Do##Method(result, current);                                                                           \
    }                                                                                                        \
    Do##Method(result.x, result.y);                                                                          \
    Do##Method(result.x, result.z);                                                                          \
    Do##Method(result.x, result.w);                                                                          \
    FLT4 result2 = (FLT4)(0.f);                                                                              \
    result2.x = TO_FLT(result.x);                                                                            \
    WRITE_IMAGE(dst_data, (int2)(0, X), result2);                                                            \
  }

 #define LocalHW(Method)                                                                                       \
  __kernel void LocalHW##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size) { \
    int X = get_global_id(0);                                                                                 \
    int localy = get_local_id(1);                                                                             \
    int localz = get_local_id(2);                                                                             \
    if (X >= size.z) return;                                                                                  \
    __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];                                              \
    temp[localy][localz] = (float4)Init##Method;                                                              \
    for (int h = localy; h < size.x; h += LOCAL_CACHE_THREAD) {                                               \
      for (int w = localz; w < size.y; w += LOCAL_CACHE_THREAD) {                                             \
        float4 current = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + X, h)));           \
        Do##Method(temp[localy][localz], current);                                                            \
      }                                                                                                       \
    }                                                                                                         \
    barrier(CLK_LOCAL_MEM_FENCE);                                                                             \
    if (localz == 0) {                                                                                        \
      for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {                                                          \
        Do##Method(temp[localy][0], temp[localy][i]);                                                         \
      }                                                                                                       \
    }                                                                                                         \
    barrier(CLK_LOCAL_MEM_FENCE);                                                                             \
    float4 result = temp[0][0];                                                                               \
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {                                                            \
      Do##Method(result, temp[i][0]);                                                                         \
    }                                                                                                         \
    WRITE_IMAGE(dst_data, (int2)(X, 0), TO_FLT4(result));                                                     \
  }

 #define LocalWC(Method)                                                                                     \
  __kernel void LocalWC##Method(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 size, \
                                float4 mask) {                                                              \
    int X = get_global_id(0);                                                                               \
    int localy = get_local_id(1);                                                                           \
    int localz = get_local_id(2);                                                                           \
    if (X >= size.x) return;                                                                                \
    __local float4 temp[LOCAL_CACHE_THREAD][LOCAL_CACHE_THREAD];                                            \
    temp[localy][localz] = (float4)Init##Method;                                                            \
    for (int w = localy; w < size.y; w += LOCAL_CACHE_THREAD) {                                             \
      int c = localz;                                                                                       \
      for (; c < size.z - 1; c += LOCAL_CACHE_THREAD) {                                                     \
        float4 current = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));         \
        Do##Method(temp[localy][localz], current);                                                          \
      }                                                                                                     \
      if (c == size.z - 1) {                                                                                \
        float4 current = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(w * size.z + c, X)));         \
        current += mask;                                                                                    \
        Do##Method(temp[localy][localz], current);                                                          \
      }                                                                                                     \
    }                                                                                                       \
    barrier(CLK_LOCAL_MEM_FENCE);                                                                           \
    if (localz == 0) {                                                                                      \
      for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {                                                        \
        Do##Method(temp[localy][0], temp[localy][i]);                                                       \
      }                                                                                                     \
    }                                                                                                       \
    barrier(CLK_LOCAL_MEM_FENCE);                                                                           \
    float4 result = temp[0][0];                                                                             \
    for (int i = 1; i < LOCAL_CACHE_THREAD; i++) {                                                          \
      Do##Method(result, temp[i][0]);                                                                       \
    }                                                                                                       \
    Do##Method(result.x, result.y);                                                                         \
    Do##Method(result.x, result.z);                                                                         \
    Do##Method(result.x, result.w);                                                                         \
    FLT4 result2 = (FLT4)(0.f);                                                                             \
    result2.x = TO_FLT(result.x);                                                                           \
    WRITE_IMAGE(dst_data, (int2)(0, X), result2);                                                           \
  }

 #define DoSum(A, B) A += B
 #define InitSum 0.f
 GlobalHW(Sum)
 GlobalWC(Sum)
 LocalHW(Sum)
 LocalWC(Sum)

 #define DoMin(A, B) A = min(A, B)
 #define InitMin 10000.f
 GlobalHW(Min)
 GlobalWC(Min)
 LocalHW(Min)
 LocalWC(Min)

 #define DoMax(A, B) A = max(A, B)
 #define InitMax -10000.f
 GlobalHW(Max)
 GlobalWC(Max)
 LocalHW(Max)
 LocalWC(Max)

 #define DoProd(A, B) A *= B
 #define InitProd 1.f
 GlobalHW(Prod)
 GlobalWC(Prod)
 LocalHW(Prod)
 LocalWC(Prod)
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc
@@ -41,8 +41,9 @@ using mindspore::schema::ReduceMode_ReduceSumSquare;
 namespace mindspore::kernel {

 std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
  static const std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"},
                                                          {ReduceMode_ReduceSum, "sum"}};
  static const std::map<int, std::string> reduce_type2str{
    {ReduceMode_ReduceMean, "Mean"}, {ReduceMode_ReduceSum, "Sum"},   {ReduceMode_ReduceMin, "Min"},
    {ReduceMode_ReduceMax, "Max"},   {ReduceMode_ReduceProd, "Prod"}, {ReduceMode_ReduceSumSquare, "SumSquare"}};
  auto result_iter = reduce_type2str.find(type);
  if (result_iter != reduce_type2str.end()) {
    return result_iter->second;
@@ -50,7 +51,26 @@ std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) {
  return "";
 }

 cl_float4 ReduceOpenCLKernel::GenC4Mask() {
  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
  int last_c4 = in_tensors_[0]->shape()[3] % C4NUM;
  last_c4 = (C4NUM - last_c4) % 4;
  static const std::map<int, float> reduce_type2init{
    {ReduceMode_ReduceMean, 0.f},     {ReduceMode_ReduceSum, 0.f},  {ReduceMode_ReduceMin, 10000.f},
    {ReduceMode_ReduceMax, -10000.f}, {ReduceMode_ReduceProd, 1.f}, {ReduceMode_ReduceSumSquare, 0.f}};
  float init_float = reduce_type2init.find(reduce_param->mode_)->second;
  cl_float4 mask = {0.f, 0.f, 0.f, 0.f};
  for (int i = 0; i < last_c4; i++) {
    mask.s[C4NUM - i - 1] = init_float;
  }
  return mask;
 }

 int ReduceOpenCLKernel::CheckSpecs() {
  if (in_tensors_[0]->shape()[0] > 1) {
    MS_LOG(ERROR) << "reduce op only support n=2";
    return RET_PARAM_INVALID;
  }
  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
  if (GetReduceTypeStr(reduce_param->mode_).empty()) {
    MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_;
@@ -82,17 +102,21 @@ int ReduceOpenCLKernel::Prepare() {
    return RET_NULL_PTR;
  }

  std::string kernel_name = GetReduceTypeStr(reduce_param->mode_);
  if (wc_reduce_) {
    kernel_name += "_WC";
  }
  std::string kernel_name;
  if (in_tensors_[0]->shape()[reduce_param->axes_[0]] >= LOCAL_CACHE_THREAD ||
      in_tensors_[0]->shape()[reduce_param->axes_[1]] >= LOCAL_CACHE_THREAD) {
    use_local_ = true;
    kernel_name += "_local";
    kernel_name += "Local";
  } else {
    use_local_ = false;
    kernel_name += "Global";
  }
  kernel_name += "_NHWC4";

  if (wc_reduce_) {
    kernel_name += "WC";
  } else {
    kernel_name += "HW";
  }
  kernel_name += GetReduceTypeStr(reduce_param->mode_);
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@@ -116,6 +140,9 @@ void ReduceOpenCLKernel::SetConstArgs() {
  cl_int4 size = {h, w, c4, c};
  int arg_idx = 2;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size);
  if (wc_reduce_) {
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, GenC4Mask());
  }
 }
 void ReduceOpenCLKernel::SetGlobalLocal() {
  std::vector<int> shapex = in_tensors_[0]->shape();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h
@@ -38,6 +38,7 @@ class ReduceOpenCLKernel : public OpenCLKernel {
  void SetGlobalLocal() override;

 private:
  cl_float4 GenC4Mask();
  static std::string GetReduceTypeStr(int type);
  cl::Kernel kernel_;
  Image2DInfo outShape = Image2DInfo(nullptr);
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/reduce_tests.cc
@@ -99,7 +99,7 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou
    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3),
                  2e-2);
  } else {
    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5));
    CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-3));
  }
  for (auto t : inputs) {
    t->set_data(nullptr);
@@ -135,6 +135,75 @@ TEST_F(TestReduceOpenCL, ReduceMeanFp16) {
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean);
 }

 TEST_F(TestReduceOpenCL, ReduceMeanLocalFp32) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    3.0f, 1.0f, 6.0f, 8.0f, 6.0f, 4.0f, 1.0f, 3.0f, 5.0f, 4.0f, 5.0f, 4.0f, 0.0f, 2.0f, 4.0f, 1.0f, 3.0f,
    1.0f, 6.0f, 5.0f, 4.0f, 7.0f, 0.0f, 7.0f, 1.0f, 2.0f, 5.0f, 0.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 8.0f,
    5.0f, 7.0f, 6.0f, 2.0f, 5.0f, 3.0f, 2.0f, 9.0f, 1.0f, 0.0f, 2.0f, 0.0f, 6.0f, 0.0f, 3.0f, 6.0f, 0.0f,
    7.0f, 1.0f, 0.0f, 6.0f, 3.0f, 0.0f, 1.0f, 0.0f, 5.0f, 3.0f, 8.0f, 1.0f, 9.0f, 2.0f, 2.0f, 2.0f, 7.0f,
    7.0f, 6.0f, 7.0f, 0.0f, 5.0f, 4.0f, 2.0f, 6.0f, 8.0f, 2.0f, 0.0f, 8.0f, 4.0f, 9.0f, 1.0f, 2.0f, 9.0f,
    9.0f, 6.0f, 0.0f, 8.0f, 5.0f, 2.0f, 9.0f, 3.0f, 1.0f, 9.0f, 0.0f, 4.0f, 6.0f, 0.0f, 5.0f, 2.0f, 3.0f};
  std::vector<float> output_data = {3.971f, 4.559f, 3.294f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean);
 }

 TEST_F(TestReduceOpenCL, ReduceMeanLocalFp16) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {
    3.0f, 1.0f, 6.0f, 8.0f, 6.0f, 4.0f, 1.0f, 3.0f, 5.0f, 4.0f, 5.0f, 4.0f, 0.0f, 2.0f, 4.0f, 1.0f, 3.0f,
    1.0f, 6.0f, 5.0f, 4.0f, 7.0f, 0.0f, 7.0f, 1.0f, 2.0f, 5.0f, 0.0f, 6.0f, 7.0f, 8.0f, 9.0f, 0.0f, 8.0f,
    5.0f, 7.0f, 6.0f, 2.0f, 5.0f, 3.0f, 2.0f, 9.0f, 1.0f, 0.0f, 2.0f, 0.0f, 6.0f, 0.0f, 3.0f, 6.0f, 0.0f,
    7.0f, 1.0f, 0.0f, 6.0f, 3.0f, 0.0f, 1.0f, 0.0f, 5.0f, 3.0f, 8.0f, 1.0f, 9.0f, 2.0f, 2.0f, 2.0f, 7.0f,
    7.0f, 6.0f, 7.0f, 0.0f, 5.0f, 4.0f, 2.0f, 6.0f, 8.0f, 2.0f, 0.0f, 8.0f, 4.0f, 9.0f, 1.0f, 2.0f, 9.0f,
    9.0f, 6.0f, 0.0f, 8.0f, 5.0f, 2.0f, 9.0f, 3.0f, 1.0f, 9.0f, 0.0f, 4.0f, 6.0f, 0.0f, 5.0f, 2.0f, 3.0f};
  std::vector<float16_t> output_data = {3.971f, 4.559f, 3.294f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMean);
 }

 TEST_F(TestReduceOpenCL, ReduceMeanWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float> output_data = {1.5f, 5.5f, 9.5f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
 }

 TEST_F(TestReduceOpenCL, ReduceMeanWCLocalFp32) {
  int n = 1;
  int h = 5;
  int w = 17;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    6.0f, 3.0f, 6.0f, 1.0f, 4.0f, 2.0f, 5.0f, 1.0f, 7.0f, 5.0f, 7.0f, 3.0f, 0.0f, 2.0f, 9.0f, 8.0f, 3.0f, 1.0f, 6.0f,
    8.0f, 6.0f, 6.0f, 3.0f, 0.0f, 6.0f, 3.0f, 8.0f, 0.0f, 6.0f, 1.0f, 0.0f, 9.0f, 4.0f, 4.0f, 9.0f, 4.0f, 9.0f, 5.0f,
    0.0f, 1.0f, 4.0f, 6.0f, 4.0f, 0.0f, 9.0f, 3.0f, 6.0f, 6.0f, 7.0f, 1.0f, 7.0f, 8.0f, 6.0f, 0.0f, 2.0f, 6.0f, 4.0f,
    4.0f, 3.0f, 7.0f, 7.0f, 5.0f, 2.0f, 3.0f, 4.0f, 3.0f, 1.0f, 5.0f, 4.0f, 8.0f, 7.0f, 5.0f, 0.0f, 7.0f, 5.0f, 5.0f,
    0.0f, 3.0f, 4.0f, 0.0f, 6.0f, 5.0f, 4.0f, 6.0f, 2.0f, 0.0f, 8.0f, 6.0f, 4.0f, 6.0f, 3.0f, 2.0f, 6.0f, 4.0f, 8.0f,
    4.0f, 8.0f, 2.0f, 0.0f, 0.0f, 9.0f, 4.0f, 3.0f, 4.0f, 1.0f, 7.0f, 9.0f, 1.0f, 9.0f, 4.0f, 2.0f, 8.0f, 3.0f, 5.0f,
    8.0f, 7.0f, 8.0f, 8.0f, 4.0f, 8.0f, 2.0f, 8.0f, 9.0f, 4.0f, 5.0f, 0.0f, 2.0f, 1.0f, 0.0f, 8.0f, 4.0f, 7.0f, 2.0f,
    4.0f, 5.0f, 0.0f, 0.0f, 7.0f, 2.0f, 0.0f, 2.0f, 7.0f, 1.0f, 1.0f, 0.0f, 1.0f, 2.0f, 1.0f, 3.0f, 7.0f, 7.0f, 3.0f,
    2.0f, 3.0f, 1.0f, 7.0f, 2.0f, 2.0f, 2.0f, 9.0f, 3.0f, 6.0f, 1.0f, 8.0f, 0.0f, 1.0f, 2.0f, 0.0f, 9.0f, 5.0f};
  std::vector<float> output_data = {4.206f, 4.441f, 4.265f, 4.706f, 3.147f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
 }

 TEST_F(TestReduceOpenCL, ReduceSumFp32) {
  int n = 1;
  int h = 2;
@@ -159,16 +228,40 @@ TEST_F(TestReduceOpenCL, ReduceSumFp16) {
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
 }

 TEST_F(TestReduceOpenCL, ReduceMeanWCFp32) {
 TEST_F(TestReduceOpenCL, ReduceSumLocalFp32) {
  int n = 1;
  int h = 3;
  int h = 17;
  int w = 2;
  int c = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float> output_data = {1.5f, 5.5f, 9.5f};
  std::vector<float> input_data = {
    8.0f, 1.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 4.0f, 4.0f, 2.0f, 3.0f, 9.0f, 3.0f, 4.0f, 8.0f, 1.0f, 9.0f,
    5.0f, 2.0f, 5.0f, 6.0f, 3.0f, 8.0f, 3.0f, 7.0f, 1.0f, 3.0f, 1.0f, 9.0f, 4.0f, 0.0f, 9.0f, 7.0f, 7.0f,
    5.0f, 0.0f, 2.0f, 4.0f, 8.0f, 7.0f, 3.0f, 0.0f, 4.0f, 8.0f, 5.0f, 3.0f, 8.0f, 2.0f, 5.0f, 3.0f, 5.0f,
    9.0f, 4.0f, 3.0f, 9.0f, 7.0f, 2.0f, 4.0f, 7.0f, 0.0f, 3.0f, 9.0f, 6.0f, 6.0f, 9.0f, 2.0f, 1.0f, 0.0f,
    7.0f, 1.0f, 7.0f, 2.0f, 0.0f, 6.0f, 9.0f, 4.0f, 7.0f, 0.0f, 7.0f, 0.0f, 4.0f, 8.0f, 6.0f, 0.0f, 3.0f,
    2.0f, 1.0f, 2.0f, 9.0f, 6.0f, 2.0f, 6.0f, 2.0f, 9.0f, 4.0f, 0.0f, 1.0f, 9.0f, 7.0f, 6.0f, 9.0f, 8.0f};
  std::vector<float> output_data = {143.000f, 191.000f, 145.000f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMean, true);
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum);
 }

 TEST_F(TestReduceOpenCL, ReduceSumLocalFp16) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {
    8.0f, 1.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 4.0f, 4.0f, 2.0f, 3.0f, 9.0f, 3.0f, 4.0f, 8.0f, 1.0f, 9.0f,
    5.0f, 2.0f, 5.0f, 6.0f, 3.0f, 8.0f, 3.0f, 7.0f, 1.0f, 3.0f, 1.0f, 9.0f, 4.0f, 0.0f, 9.0f, 7.0f, 7.0f,
    5.0f, 0.0f, 2.0f, 4.0f, 8.0f, 7.0f, 3.0f, 0.0f, 4.0f, 8.0f, 5.0f, 3.0f, 8.0f, 2.0f, 5.0f, 3.0f, 5.0f,
    9.0f, 4.0f, 3.0f, 9.0f, 7.0f, 2.0f, 4.0f, 7.0f, 0.0f, 3.0f, 9.0f, 6.0f, 6.0f, 9.0f, 2.0f, 1.0f, 0.0f,
    7.0f, 1.0f, 7.0f, 2.0f, 0.0f, 6.0f, 9.0f, 4.0f, 7.0f, 0.0f, 7.0f, 0.0f, 4.0f, 8.0f, 6.0f, 0.0f, 3.0f,
    2.0f, 1.0f, 2.0f, 9.0f, 6.0f, 2.0f, 6.0f, 2.0f, 9.0f, 4.0f, 0.0f, 1.0f, 9.0f, 7.0f, 6.0f, 9.0f, 8.0f};
  std::vector<float16_t> output_data = {143.000f, 191.000f, 145.000f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSum);
 }

 TEST_F(TestReduceOpenCL, ReduceSumWCFp32) {
@@ -182,4 +275,439 @@ TEST_F(TestReduceOpenCL, ReduceSumWCFp32) {

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum, true);
 }

 TEST_F(TestReduceOpenCL, ReduceSumWCLocalFp32) {
  int n = 1;
  int h = 3;
  int w = 5;
  int c = 17;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    4.0f, 7.0f, 2.0f, 9.0f, 1.0f, 4.0f, 0.0f, 1.0f, 0.0f, 7.0f, 8.0f, 0.0f, 2.0f, 8.0f, 2.0f, 0.0f, 4.0f, 8.0f, 3.0f,
    9.0f, 5.0f, 9.0f, 7.0f, 0.0f, 3.0f, 3.0f, 1.0f, 1.0f, 8.0f, 6.0f, 4.0f, 7.0f, 6.0f, 5.0f, 7.0f, 8.0f, 2.0f, 0.0f,
    0.0f, 4.0f, 1.0f, 1.0f, 4.0f, 6.0f, 0.0f, 5.0f, 1.0f, 0.0f, 3.0f, 9.0f, 3.0f, 7.0f, 8.0f, 1.0f, 6.0f, 9.0f, 2.0f,
    5.0f, 7.0f, 2.0f, 9.0f, 8.0f, 0.0f, 2.0f, 0.0f, 4.0f, 3.0f, 4.0f, 3.0f, 5.0f, 3.0f, 5.0f, 2.0f, 2.0f, 1.0f, 9.0f,
    8.0f, 7.0f, 0.0f, 8.0f, 0.0f, 4.0f, 0.0f, 8.0f, 4.0f, 8.0f, 2.0f, 6.0f, 3.0f, 7.0f, 6.0f, 8.0f, 3.0f, 6.0f, 4.0f,
    8.0f, 3.0f, 8.0f, 1.0f, 0.0f, 9.0f, 6.0f, 4.0f, 9.0f, 0.0f, 6.0f, 8.0f, 6.0f, 7.0f, 8.0f, 2.0f, 3.0f, 3.0f, 7.0f,
    2.0f, 9.0f, 1.0f, 9.0f, 3.0f, 5.0f, 4.0f, 6.0f, 2.0f, 7.0f, 1.0f, 1.0f, 0.0f, 0.0f, 4.0f, 9.0f, 1.0f, 7.0f, 3.0f,
    2.0f, 1.0f, 4.0f, 6.0f, 7.0f, 9.0f, 2.0f, 2.0f, 8.0f, 3.0f, 2.0f, 4.0f, 1.0f, 7.0f, 6.0f, 8.0f, 6.0f, 9.0f, 8.0f,
    6.0f, 8.0f, 3.0f, 4.0f, 8.0f, 5.0f, 6.0f, 9.0f, 9.0f, 2.0f, 0.0f, 5.0f, 0.0f, 0.0f, 2.0f, 4.0f, 2.0f, 2.0f, 6.0f,
    9.0f, 3.0f, 6.0f, 0.0f, 5.0f, 4.0f, 3.0f, 8.0f, 6.0f, 3.0f, 2.0f, 8.0f, 9.0f, 2.0f, 7.0f, 1.0f, 2.0f, 4.0f, 9.0f,
    3.0f, 7.0f, 9.0f, 2.0f, 4.0f, 2.0f, 7.0f, 8.0f, 8.0f, 6.0f, 3.0f, 4.0f, 6.0f, 3.0f, 1.0f, 7.0f, 9.0f, 3.0f, 5.0f,
    9.0f, 7.0f, 1.0f, 8.0f, 6.0f, 1.0f, 9.0f, 2.0f, 8.0f, 2.0f, 9.0f, 8.0f, 3.0f, 2.0f, 7.0f, 8.0f, 9.0f, 3.0f, 6.0f,
    0.0f, 8.0f, 5.0f, 7.0f, 1.0f, 5.0f, 2.0f, 9.0f, 3.0f, 0.0f, 5.0f, 9.0f, 3.0f, 2.0f, 0.0f, 2.0f, 7.0f, 5.0f, 7.0f,
    4.0f, 7.0f, 0.0f, 9.0f, 8.0f, 8.0f, 8.0f, 8.0f};
  std::vector<float> output_data = {344.000f, 395.000f, 434.000f};
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSum, true);
 }

 TEST_F(TestReduceOpenCL, ReduceMinFp32) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {3.0f, -5.0f, 4.0f, 3.0f, -1.0f, 1.0f, -5.0f, -2.0f, -3.0f, 5.0f, -1.0f, 5.0f};
  std::vector<float> output_data = {-5.000f, -5.000f, -3.000f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin);
 }

 TEST_F(TestReduceOpenCL, ReduceMinFp16) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {3.0f, -5.0f, 4.0f, 3.0f, -1.0f, 1.0f, -5.0f, -2.0f, -3.0f, 5.0f, -1.0f, 5.0f};
  std::vector<float16_t> output_data = {-5.000f, -5.000f, -3.000f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMin);
 }

 TEST_F(TestReduceOpenCL, ReduceMinLocalFp32) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    2.0f,  -8.0f,  -4.0f,  -7.0f, 7.0f,   3.0f,  7.0f,  -3.0f, 2.0f,   -9.0f, -6.0f, 3.0f,   -8.0f,  1.0f,  -10.0f,
    1.0f,  -10.0f, 2.0f,   -5.0f, 6.0f,   -5.0f, 7.0f,  3.0f,  4.0f,   3.0f,  -3.0f, 5.0f,   -1.0f,  -1.0f, -6.0f,
    -4.0f, 9.0f,   5.0f,   -1.0f, 3.0f,   3.0f,  9.0f,  5.0f,  -10.0f, -1.0f, -8.0f, 9.0f,   -4.0f,  8.0f,  3.0f,
    -1.0f, -2.0f,  8.0f,   -1.0f, -7.0f,  2.0f,  4.0f,  2.0f,  4.0f,   6.0f,  -1.0f, 7.0f,   4.0f,   -3.0f, 0.0f,
    -2.0f, -1.0f,  -10.0f, -2.0f, 6.0f,   3.0f,  -4.0f, -9.0f, -5.0f,  -8.0f, 0.0f,  -7.0f,  9.0f,   2.0f,  7.0f,
    -5.0f, 8.0f,   4.0f,   5.0f,  9.0f,   -3.0f, 2.0f,  0.0f,  -4.0f,  -1.0f, -7.0f, -10.0f, -10.0f, -3.0f, 9.0f,
    -8.0f, 1.0f,   1.0f,   -5.0f, -10.0f, -1.0f, 8.0f,  -2.0f, 1.0f,   -4.0f, 1.0f,  0.0f};
  std::vector<float> output_data = {-10.000f, -10.000f, -10.000f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin);
 }

 TEST_F(TestReduceOpenCL, ReduceMinLocalFp16) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {
    2.0f,  -8.0f,  -4.0f,  -7.0f, 7.0f,   3.0f,  7.0f,  -3.0f, 2.0f,   -9.0f, -6.0f, 3.0f,   -8.0f,  1.0f,  -10.0f,
    1.0f,  -10.0f, 2.0f,   -5.0f, 6.0f,   -5.0f, 7.0f,  3.0f,  4.0f,   3.0f,  -3.0f, 5.0f,   -1.0f,  -1.0f, -6.0f,
    -4.0f, 9.0f,   5.0f,   -1.0f, 3.0f,   3.0f,  9.0f,  5.0f,  -10.0f, -1.0f, -8.0f, 9.0f,   -4.0f,  8.0f,  3.0f,
    -1.0f, -2.0f,  8.0f,   -1.0f, -7.0f,  2.0f,  4.0f,  2.0f,  4.0f,   6.0f,  -1.0f, 7.0f,   4.0f,   -3.0f, 0.0f,
    -2.0f, -1.0f,  -10.0f, -2.0f, 6.0f,   3.0f,  -4.0f, -9.0f, -5.0f,  -8.0f, 0.0f,  -7.0f,  9.0f,   2.0f,  7.0f,
    -5.0f, 8.0f,   4.0f,   5.0f,  9.0f,   -3.0f, 2.0f,  0.0f,  -4.0f,  -1.0f, -7.0f, -10.0f, -10.0f, -3.0f, 9.0f,
    -8.0f, 1.0f,   1.0f,   -5.0f, -10.0f, -1.0f, 8.0f,  -2.0f, 1.0f,   -4.0f, 1.0f,  0.0f};
  std::vector<float16_t> output_data = {-10.000f, -10.000f, -10.000f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMin);
 }

 TEST_F(TestReduceOpenCL, ReduceMinWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {-0.080f, 0.481f, -0.853f, -0.838f, 0.557f, 0.255f,
                                   0.116f,  0.446f, -0.051f, -0.095f, 0.552f, 0.077f};
  std::vector<float> output_data = {-0.853f, 0.116f, -0.095f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin, true);
 }

 TEST_F(TestReduceOpenCL, ReduceMinWCLocalFp32) {
  int n = 1;
  int h = 5;
  int w = 17;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    0.399f,  -0.139f, 0.238f,  0.779f,  -0.894f, 0.343f,  -0.955f, 0.593f,  0.448f,  0.816f,  0.841f,  -0.614f, 0.636f,
    0.116f,  -0.031f, -0.109f, 0.770f,  0.962f,  0.307f,  -0.170f, 0.789f,  0.197f,  0.530f,  -0.883f, 0.753f,  0.385f,
    -0.158f, 0.237f,  0.971f,  -0.781f, -0.523f, -0.547f, 0.257f,  -0.034f, 0.660f,  -0.666f, -0.379f, 0.092f,  -0.130f,
    0.369f,  0.664f,  -0.747f, -0.687f, -0.628f, -0.434f, 0.736f,  0.673f,  0.125f,  -0.854f, 0.007f,  0.038f,  0.024f,
    0.706f,  -0.806f, 0.042f,  0.532f,  -0.545f, -0.942f, 0.778f,  -0.419f, 0.931f,  -0.848f, 0.501f,  -0.415f, -0.292f,
    -0.575f, 0.192f,  -0.825f, 0.256f,  -0.227f, -0.795f, 0.319f,  0.101f,  -0.337f, 0.940f,  -0.724f, 0.453f,  -0.646f,
    -0.225f, -0.303f, 0.093f,  0.851f,  -0.467f, -0.657f, 0.980f,  0.867f,  0.606f,  0.356f,  0.982f,  -0.199f, 0.816f,
    0.984f,  -0.466f, -0.857f, -0.070f, -0.562f, 0.744f,  0.477f,  0.831f,  -0.064f, 0.891f,  -0.813f, -0.341f, 0.969f,
    0.538f,  0.233f,  -0.545f, 0.994f,  0.241f,  -0.829f, -0.272f, -0.420f, 0.607f,  0.658f,  -0.188f, 0.134f,  0.277f,
    -0.173f, 0.373f,  0.286f,  -0.805f, 0.455f,  0.461f,  0.893f,  -0.457f, 0.360f,  -0.706f, -0.848f, 0.032f,  -0.566f,
    0.014f,  0.507f,  -0.694f, -0.663f, -0.783f, 0.459f,  -0.613f, -0.496f, 0.332f,  0.829f,  -0.437f, 0.759f,  -0.061f,
    -0.400f, -0.561f, 0.471f,  -0.042f, 0.073f,  0.546f,  -0.557f, 0.602f,  0.011f,  -0.214f, 0.733f,  0.289f,  -0.847f,
    -0.637f, -0.791f, 0.519f,  0.449f,  -0.390f, -0.296f, 0.622f,  0.345f,  0.525f,  -0.205f, -0.626f, 0.089f,  -0.811f,
    0.741f};
  std::vector<float> output_data = {-0.955f, -0.942f, -0.857f, -0.848f, -0.847f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMin, true);
 }

 TEST_F(TestReduceOpenCL, ReduceMaxFp32) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {0.123f,  0.975f, 0.092f, 0.364f,  0.033f,  -0.140f,
                                   -0.566f, 0.693f, 0.540f, -0.588f, -0.992f, -0.386f};
  std::vector<float> output_data = {0.364f, 0.975f, 0.540f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax);
 }

 TEST_F(TestReduceOpenCL, ReduceMaxFp16) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {0.123f,  0.975f, 0.092f, 0.364f,  0.033f,  -0.140f,
                                       -0.566f, 0.693f, 0.540f, -0.588f, -0.992f, -0.386f};
  std::vector<float16_t> output_data = {0.364f, 0.975f, 0.540f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMax);
 }

 TEST_F(TestReduceOpenCL, ReduceMaxLocalFp32) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    0.113f,  -0.633f, 0.603f,  0.447f,  -0.588f, 0.039f,  0.494f,  -0.379f, -0.018f, -0.317f, 0.620f,  0.460f,  0.732f,
    0.980f,  0.376f,  0.481f,  -0.371f, -0.219f, -0.496f, 0.670f,  -0.159f, 0.961f,  0.036f,  0.633f,  -0.118f, -0.300f,
    0.971f,  -0.236f, -0.095f, -0.705f, -0.495f, -0.403f, -0.131f, -0.084f, -0.339f, 0.031f,  -0.582f, 0.893f,  -0.311f,
    0.501f,  -0.623f, -0.523f, -0.177f, -0.438f, 0.626f,  0.028f,  -0.106f, 0.916f,  -0.504f, 0.678f,  0.358f,  -0.951f,
    0.741f,  -0.577f, -0.544f, -0.952f, -0.133f, 0.441f,  -0.376f, -0.246f, 0.301f,  0.025f,  -0.904f, -0.337f, 0.132f,
    -0.800f, 0.226f,  -0.135f, -0.617f, -0.871f, -0.393f, -0.195f, 0.591f,  0.034f,  -0.040f, 0.377f,  -0.106f, 0.265f,
    -0.883f, -0.678f, -0.795f, -0.094f, -0.272f, -0.954f, 0.569f,  -0.910f, -0.288f, -0.978f, 0.262f,  -0.973f, -0.750f,
    0.460f,  0.956f,  0.696f,  -0.938f, 0.537f,  0.516f,  -0.339f, -0.289f, 0.498f,  0.135f,  -0.649f};
  std::vector<float> output_data = {0.961f, 0.980f, 0.971f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax);
 }

 TEST_F(TestReduceOpenCL, ReduceMaxLocalFp16) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {
    0.314f,  -0.714f, -0.736f, -0.459f, -0.819f, -0.530f, -0.275f, -0.141f, -0.797f, 0.522f,  -0.651f, 0.576f,  -0.644f,
    0.725f,  0.208f,  -0.529f, -0.776f, 0.986f,  -0.862f, -0.327f, 0.922f,  0.554f,  -0.401f, 0.972f,  -0.485f, 0.423f,
    -0.611f, -0.768f, 0.444f,  -0.678f, -0.734f, 0.572f,  0.413f,  0.612f,  -0.783f, -0.138f, -0.624f, -0.284f, 0.873f,
    -0.298f, 0.630f,  -0.463f, 0.195f,  0.196f,  0.167f,  0.227f,  -0.015f, 0.436f,  -0.898f, 0.031f,  -0.149f, -0.218f,
    0.184f,  -0.426f, 0.794f,  0.846f,  0.624f,  -0.889f, -0.336f, 0.401f,  -0.820f, -0.583f, 0.337f,  0.175f,  0.228f,
    -0.626f, -0.505f, -0.088f, 0.833f,  -0.366f, 0.392f,  0.727f,  -0.598f, -0.851f, 0.007f,  -0.707f, 0.575f,  0.243f,
    -0.372f, -0.141f, 0.679f,  -0.646f, 0.422f,  0.322f,  -0.294f, 0.831f,  0.929f,  -0.414f, -0.208f, -0.111f, 0.146f,
    -0.489f, -0.808f, -0.635f, 0.811f,  0.544f,  -0.131f, 0.707f,  0.787f,  0.603f,  -0.149f, -0.095f};
  std::vector<float16_t> output_data = {0.794f, 0.846f, 0.986f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceMax);
 }

 TEST_F(TestReduceOpenCL, ReduceMaxWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {0.435f,  -0.949f, 0.580f,  0.858f, -0.465f, 0.255f,
                                   -0.561f, -0.444f, -0.603f, 0.266f, 0.031f,  -0.638f};
  std::vector<float> output_data = {0.858f, 0.255f, 0.266f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax, true);
 }

 TEST_F(TestReduceOpenCL, ReduceMaxWCLocalFp32) {
  int n = 1;
  int h = 5;
  int w = 17;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    0.543f,  0.620f,  0.175f,  -0.275f, -0.570f, 0.516f,  -0.401f, -0.157f, 0.460f,  -0.072f, -0.322f, 0.208f,  0.385f,
    0.919f,  -0.265f, 0.256f,  0.383f,  -0.399f, 0.183f,  0.363f,  -0.779f, -0.191f, -0.446f, 0.063f,  -0.671f, 0.823f,
    -0.049f, -0.182f, -0.409f, 0.589f,  -0.804f, -0.461f, -0.407f, -0.119f, 0.833f,  0.718f,  -0.366f, 0.993f,  0.844f,
    -0.018f, -0.203f, -0.004f, -0.610f, -0.461f, 0.938f,  -0.708f, -0.831f, -0.147f, 0.855f,  0.998f,  0.412f,  -0.393f,
    -0.706f, -0.127f, 0.845f,  -0.236f, -0.341f, 0.299f,  0.793f,  0.794f,  -0.634f, -0.663f, -0.568f, -0.428f, -0.921f,
    0.904f,  0.933f,  -0.985f, -0.760f, -0.673f, -0.080f, 0.235f,  0.539f,  -0.341f, -0.899f, 0.527f,  -0.210f, -0.151f,
    0.148f,  -0.184f, -0.103f, -0.345f, -0.772f, -0.960f, -0.282f, -0.486f, -0.986f, -0.591f, 0.702f,  0.973f,  0.269f,
    0.058f,  -0.831f, -0.677f, -0.665f, -0.403f, 0.241f,  -0.365f, 0.741f,  0.603f,  0.347f,  0.812f,  -0.515f, -0.085f,
    0.251f,  0.631f,  0.819f,  0.622f,  -0.615f, -0.122f, 0.064f,  0.445f,  -0.508f, -0.023f, -0.072f, -0.423f, 0.547f,
    -0.841f, -0.308f, 0.924f,  -0.187f, 0.601f,  0.879f,  -0.868f, 0.395f,  -0.307f, 0.977f,  -0.300f, 0.737f,  0.022f,
    0.106f,  -0.520f, -0.673f, -0.351f, 0.367f,  0.588f,  -0.223f, 0.062f,  0.870f,  -0.017f, 0.583f,  0.405f,  0.507f,
    -0.457f, 0.196f,  0.048f,  -0.173f, 0.596f,  -0.017f, -0.245f, -0.433f, -0.852f, 0.058f,  0.237f,  0.280f,  -0.129f,
    -0.224f, 0.869f,  -0.781f, -0.029f, -0.715f, 0.497f,  -0.341f, 0.230f,  -0.572f, 0.718f,  -0.408f, -0.998f, -0.752f,
    -0.701f};
  std::vector<float> output_data = {0.919f, 0.998f, 0.973f, 0.977f, 0.870f};
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceMax, true);
 }

 TEST_F(TestReduceOpenCL, ReduceProdFp32) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {4.0f, 3.0f, 1.0f, 4.0f, 1.0f, 3.0f, 1.0f, 4.0f, 2.0f, 4.0f, 4.0f, 3.0f};
  std::vector<float> output_data = {64.0f, 48.0f, 18.0f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd);
 }

 TEST_F(TestReduceOpenCL, ReduceProdFp16) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {2.0f, 1.0f, 3.0f, 1.0f, 4.0f, 1.0f, 4.0f, 3.0f, 2.0f, 3.0f, 1.0f, 1.0f};
  std::vector<float16_t> output_data = {24.0f, 12.0f, 6.0f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceProd);
 }

 TEST_F(TestReduceOpenCL, ReduceProdLocalFp32) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    0.304f, 2.304f, 1.391f, 1.072f, 0.351f, 0.641f, 0.120f, 2.382f, 0.460f, 1.672f, 0.553f, 1.534f, 1.423f,
    0.892f, 2.900f, 1.953f, 1.745f, 1.171f, 1.717f, 1.291f, 1.572f, 2.388f, 0.154f, 0.252f, 0.794f, 0.981f,
    0.366f, 1.372f, 1.778f, 1.848f, 1.023f, 1.124f, 2.045f, 2.374f, 1.965f, 0.260f, 1.306f, 1.889f, 1.144f,
    1.816f, 2.189f, 2.215f, 1.913f, 2.577f, 2.910f, 1.712f, 0.342f, 1.349f, 0.215f, 2.717f, 1.813f, 2.764f,
    1.989f, 1.710f, 0.156f, 2.293f, 2.648f, 1.281f, 1.078f, 2.757f, 0.746f, 0.238f, 0.235f, 0.123f, 0.730f,
    1.558f, 1.798f, 0.993f, 2.479f, 1.930f, 1.687f, 1.078f, 0.600f, 0.710f, 1.926f, 0.848f, 0.984f, 0.568f,
    0.983f, 1.068f, 2.362f, 2.770f, 2.184f, 2.883f, 1.177f, 0.232f, 0.782f, 1.340f, 2.029f, 1.524f, 0.159f,
    2.892f, 1.225f, 0.638f, 2.537f, 0.813f, 0.337f, 1.871f, 0.602f, 2.387f, 1.209f, 2.886f};
  std::vector<float> output_data = {0.103f, 229.081f, 1030.031f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd);
 }

 TEST_F(TestReduceOpenCL, ReduceProdLocalFp16) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {
    2.843f, 2.398f, 0.998f, 1.164f, 1.048f, 0.880f, 2.112f, 1.354f, 2.892f, 0.755f, 2.033f, 1.140f, 1.117f,
    2.550f, 2.340f, 2.905f, 0.114f, 0.773f, 2.589f, 2.404f, 1.037f, 0.561f, 2.671f, 0.419f, 1.723f, 2.041f,
    2.888f, 2.440f, 1.668f, 0.821f, 0.918f, 1.251f, 1.141f, 2.497f, 0.408f, 2.384f, 0.457f, 2.754f, 0.624f,
    0.198f, 0.599f, 2.566f, 1.279f, 2.973f, 0.363f, 2.222f, 1.144f, 2.715f, 1.135f, 0.900f, 1.906f, 0.982f,
    2.211f, 2.113f, 0.585f, 1.766f, 1.612f, 1.796f, 0.607f, 1.121f, 1.277f, 2.600f, 1.446f, 1.467f, 1.828f,
    2.227f, 0.950f, 2.702f, 1.297f, 0.552f, 2.476f, 1.404f, 2.487f, 0.615f, 0.205f, 0.577f, 0.809f, 1.432f,
    1.668f, 2.243f, 2.711f, 2.221f, 0.183f, 2.964f, 1.174f, 0.928f, 2.703f, 0.427f, 0.410f, 1.436f, 1.427f,
    1.144f, 2.970f, 2.014f, 2.380f, 1.286f, 2.570f, 2.765f, 1.757f, 0.513f, 2.449f, 0.770f};
  std::vector<float16_t> output_data = {715.940f, 12232.266f, 46763.609f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceProd);
 }

 TEST_F(TestReduceOpenCL, ReduceProdWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {1.691f, 2.804f, 0.184f, 1.760f, 0.255f, 1.461f,
                                   2.751f, 2.487f, 1.304f, 0.686f, 0.702f, 0.393f};
  std::vector<float> output_data = {1.536f, 2.549f, 0.247f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd, true);
 }

 TEST_F(TestReduceOpenCL, ReduceProdWCLocalFp32) {
  int n = 1;
  int h = 5;
  int w = 17;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    1.360f, 0.615f, 0.894f, 1.357f, 0.701f, 1.430f, 1.488f, 0.701f, 0.688f, 0.869f, 1.321f, 0.836f, 1.160f, 1.460f,
    1.215f, 1.157f, 0.855f, 0.992f, 0.724f, 0.741f, 0.921f, 1.496f, 1.285f, 1.040f, 0.695f, 1.264f, 0.998f, 0.925f,
    1.170f, 1.384f, 1.413f, 0.617f, 0.743f, 1.299f, 0.998f, 1.131f, 1.491f, 1.371f, 0.808f, 1.001f, 0.602f, 0.812f,
    1.299f, 1.500f, 0.867f, 0.970f, 1.174f, 0.887f, 1.409f, 1.144f, 0.969f, 1.303f, 1.154f, 0.796f, 0.952f, 1.347f,
    0.794f, 0.601f, 1.191f, 1.310f, 0.619f, 0.961f, 0.951f, 1.395f, 0.861f, 1.177f, 1.274f, 0.701f, 0.758f, 0.635f,
    1.256f, 1.450f, 0.900f, 1.313f, 1.401f, 0.904f, 0.835f, 0.767f, 1.258f, 1.467f, 1.278f, 0.652f, 0.731f, 0.648f,
    1.308f, 1.199f, 1.485f, 1.352f, 0.639f, 1.291f, 0.924f, 0.762f, 0.791f, 1.392f, 1.328f, 1.190f, 1.458f, 1.193f,
    1.109f, 1.098f, 1.117f, 1.197f, 1.097f, 0.879f, 1.175f, 0.723f, 1.260f, 1.454f, 0.703f, 0.729f, 1.467f, 0.918f,
    0.631f, 0.750f, 1.292f, 1.208f, 0.972f, 0.621f, 0.673f, 0.710f, 1.482f, 1.092f, 1.162f, 1.432f, 0.774f, 1.132f,
    1.258f, 0.761f, 0.799f, 1.071f, 1.099f, 1.484f, 0.674f, 0.916f, 0.684f, 0.842f, 1.412f, 0.956f, 1.199f, 0.969f,
    0.957f, 1.124f, 0.937f, 0.815f, 1.308f, 1.448f, 1.059f, 1.373f, 0.804f, 1.172f, 1.387f, 0.826f, 0.783f, 0.707f,
    1.159f, 0.927f, 0.602f, 0.932f, 1.024f, 1.266f, 0.885f, 0.920f, 1.120f, 0.973f, 0.964f, 1.365f, 0.926f, 0.709f,
    1.177f, 0.615f};
  std::vector<float> output_data = {1.544f, 1.984f, 5.516f, 0.247f, 0.919f};
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceProd, true);
 }

 TEST_F(TestReduceOpenCL, ReduceSumSquareFp32) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {-0.081f, 0.305f,  -0.291f, 0.777f, 0.338f, 0.482f,
                                   0.959f,  -0.695f, -0.055f, 0.001f, 0.723f, -0.112f};
  std::vector<float> output_data = {1.530f, 1.213f, 0.333f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare);
 }

 TEST_F(TestReduceOpenCL, ReduceSumSquareFp16) {
  int n = 1;
  int h = 2;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {-0.730f, -0.938f, 0.236f, -0.631f, -0.058f, -0.625f,
                                       0.097f,  -0.343f, 0.120f, -0.339f, 0.003f,  -0.288f};
  std::vector<float16_t> output_data = {1.055f, 1.001f, 0.544f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSumSquare);
 }

 TEST_F(TestReduceOpenCL, ReduceSumSquareLocalFp32) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    0.025f,  -0.130f, 0.292f,  0.128f,  0.360f,  -0.181f, -0.179f, 0.469f,  0.434f,  -0.417f, -0.414f, 0.998f,  0.654f,
    -0.102f, 0.039f,  -0.822f, -0.155f, 0.113f,  0.204f,  0.615f,  0.844f,  -0.364f, 0.486f,  0.799f,  0.452f,  -0.884f,
    -0.006f, 0.888f,  -0.567f, 0.620f,  -0.365f, -0.096f, -0.300f, -0.263f, 0.945f,  -0.900f, -0.798f, -0.536f, -0.506f,
    0.148f,  -0.496f, 0.344f,  0.096f,  0.881f,  -0.848f, 0.401f,  -0.724f, 0.806f,  -0.550f, 0.377f,  0.560f,  -0.144f,
    0.439f,  0.038f,  -0.985f, 0.246f,  0.233f,  -0.864f, 0.427f,  -0.723f, 0.592f,  -0.642f, 0.376f,  0.769f,  0.020f,
    0.965f,  0.532f,  -0.448f, -0.168f, 0.502f,  0.900f,  0.468f,  0.834f,  -0.768f, -0.337f, 0.874f,  0.941f,  -0.449f,
    -0.330f, 0.605f,  0.081f,  0.804f,  -0.823f, -0.270f, 0.117f,  0.040f,  0.316f,  0.951f,  -0.920f, 0.599f,  0.855f,
    0.075f,  -0.898f, -0.298f, 0.208f,  0.899f,  0.751f,  -0.421f, 0.478f,  -0.106f, -0.031f, 0.974f};
  std::vector<float> output_data = {11.569f, 10.620f, 11.552f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare);
 }

 TEST_F(TestReduceOpenCL, ReduceSumSquareLocalFp16) {
  int n = 1;
  int h = 17;
  int w = 2;
  int c = 3;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float16_t> input_data = {
    0.931f,  0.611f,  0.921f,  -0.873f, 0.084f,  -0.677f, -0.366f, -0.627f, -0.359f, 0.217f,  -0.825f, -0.453f, 0.486f,
    0.675f,  -0.968f, 0.070f,  0.300f,  -0.508f, -0.423f, -0.741f, -0.390f, 0.649f,  -0.313f, -0.921f, -0.130f, -0.212f,
    -0.591f, 0.135f,  -0.556f, -0.963f, -0.509f, -0.480f, 0.694f,  -0.913f, 0.778f,  0.498f,  -0.520f, 0.271f,  0.087f,
    0.265f,  0.905f,  0.669f,  0.257f,  -0.307f, 0.789f,  0.117f,  0.468f,  0.728f,  0.372f,  -0.475f, 0.195f,  0.163f,
    0.766f,  -0.504f, 0.876f,  -0.203f, 0.636f,  -0.340f, -0.126f, 0.368f,  -0.173f, -0.149f, 0.492f,  -0.220f, 0.521f,
    -0.844f, -0.684f, -0.718f, 0.255f,  -0.148f, -0.891f, 0.577f,  -0.880f, 0.005f,  -0.904f, 0.282f,  0.473f,  -0.512f,
    -0.385f, -0.674f, 0.443f,  -0.172f, 0.224f,  0.720f,  -0.050f, 0.003f,  -0.743f, 0.025f,  0.941f,  0.107f,  0.176f,
    -0.360f, 0.975f,  -0.781f, -0.727f, 0.274f,  0.214f,  -0.330f, 0.237f,  0.967f,  0.156f,  -0.587f};
  std::vector<float16_t> output_data = {8.472f, 9.920f, 13.418f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), true, schema::ReduceMode_ReduceSumSquare);
 }

 TEST_F(TestReduceOpenCL, ReduceSumSquareWCFp32) {
  int n = 1;
  int h = 3;
  int w = 2;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {-0.686f, 0.613f,  -0.701f, 0.978f, 0.632f,  0.677f,
                                   0.780f,  -0.888f, 0.147f,  0.448f, -0.100f, 0.936f};
  std::vector<float> output_data = {2.294f, 2.255f, 1.108f};

  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare, true);
 }

 TEST_F(TestReduceOpenCL, ReduceSumSquareWCLocalFp32) {
  int n = 1;
  int h = 5;
  int w = 17;
  int c = 2;
  std::vector<int> shape = {n, h, w, c};
  std::vector<float> input_data = {
    -0.309f, -0.836f, 0.749f,  -0.820f, -0.715f, -0.770f, 0.030f,  -0.817f, 0.009f,  0.146f,  0.642f,  0.382f,  -0.085f,
    -0.268f, -0.424f, -0.957f, -0.127f, -0.852f, 0.596f,  0.340f,  -0.492f, -0.374f, -0.669f, 0.665f,  -0.664f, -0.079f,
    0.462f,  0.469f,  0.187f,  -0.730f, -0.240f, -0.446f, 0.254f,  0.284f,  0.743f,  0.297f,  0.235f,  -0.068f, 0.652f,
    -0.474f, -0.749f, -0.499f, 0.106f,  -0.988f, 0.033f,  -0.327f, -0.050f, -0.228f, -0.676f, -0.136f, -0.801f, 0.885f,
    -0.108f, -0.019f, -0.092f, 0.538f,  0.760f,  0.996f,  -0.610f, 0.125f,  0.296f,  0.861f,  0.811f,  0.948f,  -0.665f,
    0.920f,  0.669f,  0.572f,  -0.653f, -0.823f, -0.967f, -0.094f, 0.078f,  0.458f,  0.954f,  -0.357f, 0.887f,  -0.194f,
    -0.453f, -0.774f, -0.805f, -0.064f, -0.671f, -0.151f, -0.910f, 0.695f,  0.762f,  0.755f,  -0.933f, 0.277f,  -0.697f,
    0.074f,  -0.333f, 0.790f,  -0.370f, 0.264f,  -0.649f, 0.570f,  0.933f,  0.714f,  0.296f,  -0.430f, 0.634f,  0.619f,
    -0.744f, -0.898f, -0.908f, -0.800f, 0.500f,  -0.688f, 0.816f,  0.901f,  0.054f,  0.993f,  0.346f,  -0.285f, -0.926f,
    0.746f,  -0.718f, 0.708f,  -0.193f, 0.838f,  -0.869f, -0.189f, -0.195f, -0.324f, -0.498f, -0.216f, 0.632f,  -0.701f,
    0.272f,  0.550f,  0.486f,  -0.415f, 0.285f,  0.617f,  0.740f,  0.170f,  0.486f,  0.251f,  -0.165f, -0.424f, 0.705f,
    -0.802f, -0.977f, -0.449f, 0.502f,  -0.406f, 0.125f,  -0.643f, -0.324f, -0.409f, 0.218f,  0.719f,  -0.043f, -0.933f,
    -0.580f, 0.830f,  -0.091f, 0.998f,  -0.458f, 0.142f,  -0.220f, -0.440f, 0.824f,  -0.349f, 0.983f,  -0.546f, 0.085f,
    0.235f};
  std::vector<float> output_data = {9.889f, 11.926f, 13.296f, 13.537f, 10.563f};
  RunTestCaseReduce(shape, input_data.data(), output_data.data(), false, schema::ReduceMode_ReduceSumSquare, true);
 }
 }  // namespace mindspore