From 8946acc35b53606f4d2a8fab0b94a094860078da Mon Sep 17 00:00:00 2001 From: zhaozhenlong Date: Wed, 16 Sep 2020 19:44:59 +0800 Subject: [PATCH] add coeff for reduce --- mindspore/lite/nnacl/reduce_parameter.h | 1 + mindspore/lite/schema/ops.fbs | 1 + mindspore/lite/src/ops/reduce.cc | 3 + mindspore/lite/src/ops/reduce.h | 2 + mindspore/lite/src/populate_parameter.cc | 1 + .../src/runtime/kernel/arm/fp32/reduce.cc | 54 ++- .../lite/src/runtime/kernel/arm/fp32/reduce.h | 7 +- .../kernel/arm/fp32/reduce_fp32_tests.cc | 434 ++++++++++++------ 8 files changed, 359 insertions(+), 144 deletions(-) diff --git a/mindspore/lite/nnacl/reduce_parameter.h b/mindspore/lite/nnacl/reduce_parameter.h index 2a801f6c84..e28f6f625f 100644 --- a/mindspore/lite/nnacl/reduce_parameter.h +++ b/mindspore/lite/nnacl/reduce_parameter.h @@ -23,6 +23,7 @@ struct ReduceParameter { OpParameter op_parameter_; bool keep_dims_; bool reduce_to_end_; + float coeff; int axes_[REDUCE_MAX_AXES_NUM]; int num_axes_; int mode_; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index c728751791..1e4c1de009 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -676,6 +676,7 @@ table Reduce { keepDims: int; mode: ReduceMode; reduceToEnd: bool = false; + coeff: float = 1.0; } table Transpose { diff --git a/mindspore/lite/src/ops/reduce.cc b/mindspore/lite/src/ops/reduce.cc index b8ff29f505..ad6635522c 100644 --- a/mindspore/lite/src/ops/reduce.cc +++ b/mindspore/lite/src/ops/reduce.cc @@ -24,11 +24,13 @@ std::vector Reduce::GetAxes() const { return this->primitive_->value.AsRedu int Reduce::GetKeepDims() const { return this->primitive_->value.AsReduce()->keepDims; } int Reduce::GetMode() const { return this->primitive_->value.AsReduce()->mode; } bool Reduce::GetReduceToEnd() const { return this->primitive_->value.AsReduce()->reduceToEnd; } +float Reduce::GetCoeff() const { return this->primitive_->value.AsReduce()->coeff; } void Reduce::SetAxes(const 
std::vector &axes) { this->primitive_->value.AsReduce()->axes = axes; } void Reduce::SetKeepDims(int keep_dims) { this->primitive_->value.AsReduce()->keepDims = keep_dims; } void Reduce::SetMode(int mode) { this->primitive_->value.AsReduce()->mode = (schema::ReduceMode)mode; } void Reduce::SetReduceToEnd(bool reduce_to_end) { this->primitive_->value.AsReduce()->reduceToEnd = reduce_to_end; } +void Reduce::SetCoeff(float coeff) { this->primitive_->value.AsReduce()->coeff = coeff; } int Reduce::UnPackAttr(const Primitive &prim, const std::vector &inputs) { if (this->primitive_ == nullptr) { @@ -101,6 +103,7 @@ std::vector Reduce::GetAxes() const { int Reduce::GetKeepDims() const { return this->primitive_->value_as_Reduce()->keepDims(); } int Reduce::GetMode() const { return this->primitive_->value_as_Reduce()->mode(); } bool Reduce::GetReduceToEnd() const { return this->primitive_->value_as_Reduce()->reduceToEnd(); } +float Reduce::GetCoeff() const { return this->primitive_->value_as_Reduce()->coeff(); } int Reduce::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { MS_ASSERT(nullptr != primitive); MS_ASSERT(nullptr != fbb); diff --git a/mindspore/lite/src/ops/reduce.h b/mindspore/lite/src/ops/reduce.h index e5d50a2641..d859e0a1c1 100644 --- a/mindspore/lite/src/ops/reduce.h +++ b/mindspore/lite/src/ops/reduce.h @@ -38,6 +38,7 @@ class Reduce : public PrimitiveC { void SetKeepDims(int keep_dims); void SetMode(int mode); void SetReduceToEnd(bool reduce_to_end); + void SetCoeff(float coeff); #else Reduce() = default; @@ -48,6 +49,7 @@ class Reduce : public PrimitiveC { int GetKeepDims() const; int GetMode() const; bool GetReduceToEnd() const; + float GetCoeff() const; }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc index c838129f1d..53af8819fc 100644 --- a/mindspore/lite/src/populate_parameter.cc +++ 
b/mindspore/lite/src/populate_parameter.cc @@ -551,6 +551,7 @@ OpParameter *PopulateReduceParameter(const mindspore::lite::PrimitiveC *primitiv auto reduce = reinterpret_cast(const_cast(primitive)); reduce_param->keep_dims_ = reduce->GetKeepDims(); reduce_param->reduce_to_end_ = reduce->GetReduceToEnd(); + reduce_param->coeff = reduce->GetCoeff(); auto axisVector = reduce->GetAxes(); if (axisVector.size() > REDUCE_MAX_AXES_NUM) { MS_LOG(ERROR) << "Reduce axes size " << axisVector.size() << " exceed limit " << REDUCE_MAX_AXES_NUM; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc index 81bcc81ae1..426647cdd4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc @@ -30,6 +30,7 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_Mean; using mindspore::schema::PrimitiveType_Reduce; using mindspore::schema::ReduceMode; +using mindspore::schema::ReduceMode_ReduceASum; using mindspore::schema::ReduceMode_ReduceMax; using mindspore::schema::ReduceMode_ReduceMean; using mindspore::schema::ReduceMode_ReduceMin; @@ -68,7 +69,11 @@ int ReduceCPUKernel::Init() { break; } case static_cast(ReduceMode_ReduceSumSquare): { - reducer_ = ReduceSumSquare; + reducer_ = ReduceSum; + break; + } + case static_cast(ReduceMode_ReduceASum): { + reducer_ = ReduceSum; break; } default: @@ -125,6 +130,7 @@ int ReduceCPUKernel::Run() { } src_data_ = in_tensors_.at(0)->MutableData(); + PreProcess(); for (size_t i = 0; i < static_cast(num_axes_); ++i) { if (i != static_cast(num_axes_ - 1)) { dst_data_ = data_buffers_[i]; @@ -142,10 +148,56 @@ int ReduceCPUKernel::Run() { } src_data_ = dst_data_; } + if (reduce_param_->reduce_to_end_ && reduce_param_->coeff - 1.0f > 1e-5) { + ret = CalculateCoeffOutput(); + if (ret != RET_OK) { + return ret; + } + } + FreeTmpBuffer(); return RET_OK; } +void ReduceCPUKernel::PreProcess() { + if (data_type_ 
== kDataTypeInt) { + return; + } + int num = in_tensors_.at(0)->ElementsNum(); + float *data = reinterpret_cast(in_tensors_.at(0)->MutableData()); + if (data == nullptr) { + return; + } + if (reduce_param_->mode_ == static_cast(ReduceMode_ReduceASum)) { + for (int i = 0; i < num; ++i) { + if (data[i] < 0.0f) { + data[i] = 0.0f - data[i]; + } + } + } + if (reduce_param_->mode_ == static_cast(ReduceMode_ReduceSumSquare)) { + for (int i = 0; i < num; ++i) { + data[i] = data[i] * data[i]; + } + } +} + +int ReduceCPUKernel::CalculateCoeffOutput() { + auto out_tensor = out_tensors_.at(0); + int num = out_tensor->ElementsNum(); + if (data_type_ != kDataTypeFloat) { + return RET_ERROR; + } + float *out_data = reinterpret_cast(out_tensor->MutableData()); + if (out_data == nullptr) { + return RET_NULL_PTR; + } + for (int i = 0; i < num; ++i) { + out_data[i] *= reduce_param_->coeff; + } + return RET_OK; +} + int ReduceCPUKernel::MallocTmpBuffer() { data_buffers_.clear(); for (auto size : buffer_sizes_) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h index b7b485de88..2cc3a6fc96 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h @@ -36,7 +36,9 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { ReduceCPUKernel(OpParameter *param, const std::vector &inputs, const std::vector &outputs, const lite::InnerContext *ctx, const mindspore::lite::PrimitiveC *primitive) - : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {} + : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) { + reduce_param_ = reinterpret_cast(param); + } ~ReduceCPUKernel() { src_data_ = nullptr; dst_data_ = nullptr; @@ -50,6 +52,7 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { int CallReduceUnit(int task_id); private: + ReduceParameter *reduce_param_; Reducer reducer_ = nullptr; IntReducer int_reducer_ = nullptr; std::vector data_buffers_; 
@@ -61,6 +64,8 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { private: int MallocTmpBuffer(); void FreeTmpBuffer(); + int CalculateCoeffOutput(); + void PreProcess(); }; } // namespace mindspore::kernel diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc index 82db7d3418..4da3ef549e 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/reduce_fp32_tests.cc @@ -17,18 +17,82 @@ #include "utils/log_adapter.h" #include "common/common_test.h" #include "mindspore/lite/nnacl/fp32/reduce.h" +#include "schema/inner/model_generated.h" +#include "src/tensor.h" +#include "mindspore/lite/src/kernel_registry.h" +#include "mindspore/lite/src/runtime/allocator.h" + +using mindspore::lite::Allocator; +using mindspore::lite::Tensor; +using mindspore::schema::ReduceMode; +using mindspore::schema::ReduceMode_ReduceMax; +using mindspore::schema::ReduceMode_ReduceMean; +using mindspore::schema::ReduceMode_ReduceMin; +using mindspore::schema::ReduceMode_ReduceProd; +using mindspore::schema::ReduceMode_ReduceSum; +using mindspore::schema::ReduceMode_ReduceASum; +using mindspore::schema::ReduceMode_ReduceSumSquare; namespace mindspore { class TestReduceFp32 : public mindspore::CommonTest { public: TestReduceFp32() = default; - int tid = 0; - int thread_num = 1; + + void Prepare(const std::vector &in_shape, const std::vector &out_shape, float *input_data, + float *output_data, ReduceMode mode, const int *axes, const int num_axes, bool reduce_to_end, + float coeff); + void TearDown() override; + + public: + int tid_ = 0; + int thread_num_ = 1; float err_tol = 1e-5; + ReduceParameter param_ = {}; + Tensor in_tensor_; + Tensor out_tensor_; + std::vector inputs{&in_tensor_}; + std::vector outputs{&out_tensor_}; + kernel::KernelKey desc_ = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, 
schema::PrimitiveType_Reduce}; + kernel::KernelCreator creator_ = nullptr; + lite::InnerContext *ctx_ = nullptr; + kernel::LiteKernel *kernel_ = nullptr; }; -TEST_F(TestReduceFp32, Mean) { +void TestReduceFp32::TearDown() { + delete ctx_; + in_tensor_.SetData(nullptr); + out_tensor_.SetData(nullptr); +} + +void TestReduceFp32::Prepare(const std::vector &in_shape, const std::vector &out_shape, float *input_data, + float *output_data, ReduceMode mode, const int *axes, const int num_axes, + bool reduce_to_end, float coeff) { + in_tensor_.set_data_type(kNumberTypeFloat32); + in_tensor_.set_shape(in_shape); + in_tensor_.SetData(input_data); + + out_tensor_.set_data_type(kNumberTypeFloat32); + out_tensor_.set_shape(out_shape); + out_tensor_.SetData(output_data); + + param_.mode_ = static_cast(mode); + param_.num_axes_ = num_axes; + memcpy(param_.axes_, axes, num_axes * sizeof(int)); + param_.reduce_to_end_ = reduce_to_end; + param_.coeff = coeff; + + ctx_ = new (std::nothrow) lite::InnerContext; + ASSERT_EQ(lite::RET_OK, ctx_->Init()); + creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc_); + if (ctx_->allocator == nullptr) { + ctx_->allocator = Allocator::Create(); + } + ctx_->thread_num_ = thread_num_; + kernel_ = creator_(inputs, outputs, reinterpret_cast(¶m_), ctx_, desc_, nullptr); +} + +TEST_F(TestReduceFp32, Mean1) { /* 2 4 4 3 NHWC */ float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, @@ -39,21 +103,52 @@ TEST_F(TestReduceFp32, Mean) { float correct[24] = {18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 4, 3}; + int axes[1] = {1}; + int axis_num = 1; float out[24] = 
{0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - (void)ReduceMean(outer_size, inner_size, axis_size, in, out, tid, thread_num); + bool reduce_to_end = false; + float coeff = 1.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceMean, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); } -TEST_F(TestReduceFp32, Mean2Thread) { - /* 2*4*4*3 NHWC */ +// thread num 2 reduce_to_end +TEST_F(TestReduceFp32, Mean2) { + /* 2 4 4 3 NHWC */ + float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; + float correct[2] = {47.0, 143.0}; + + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 1, 1}; + int axes[1] = {1}; + int axis_num = 1; + float out[24] = {0}; + bool reduce_to_end = true; + float coeff = 2.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceMean, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); + + int output_size = 2; + CompareOutputData(out, correct, output_size, err_tol); +} + +// thread num 1 +TEST_F(TestReduceFp32, Mean3) { + /* 2 4 4 3 NHWC */ float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, @@ -63,20 
+158,19 @@ TEST_F(TestReduceFp32, Mean2Thread) { float correct[24] = {18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 4, 3}; + int axes[1] = {1}; + int axis_num = 1; float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - thread_num = 2; - tid = 0; - (void)ReduceMean(outer_size, inner_size, axis_size, in, out, tid, thread_num); - tid = 1; - (void)ReduceMean(outer_size, inner_size, axis_size, in, out, tid, thread_num); + bool reduce_to_end = false; + float coeff = 2.0f; + thread_num_ = 1; - int output_size = 24; + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceMean, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); + + int output_size = 2; CompareOutputData(out, correct, output_size, err_tol); } @@ -91,37 +185,16 @@ TEST_F(TestReduceFp32, MeanAllAxis) { float correct[1] = {47.5}; float out[1] = {0}; - int input_shape[4] = {2, 4, 4, 3}; - int outer_size = 1; - int inner_size = 48; - int axis_size = 2; - float *src = in; - float dst1[48] = {0}; - MS_ASSERT(dst != nullptr); - (void)ReduceMean(outer_size, inner_size, axis_size, src, dst1, tid, thread_num); - - input_shape[0] = 1; // 1 4 4 3 - outer_size = 1; - inner_size = 12; - axis_size = 4; - src = dst1; - float dst2[12] = {0}; - (void)ReduceMean(outer_size, inner_size, axis_size, src, dst2, tid, thread_num); - - input_shape[1] = 1; // 1 1 4 3 - outer_size = 1; - inner_size = 3; - axis_size = 4; - src = dst2; - float dst3[3] = {0}; - (void)ReduceMean(outer_size, inner_size, axis_size, src, dst3, tid, thread_num); - - input_shape[2] = 1; // 1 1 1 3 - outer_size = 1; - inner_size = 1; - axis_size = 3; - src = dst3; - (void)ReduceMean(outer_size, inner_size, axis_size, src, out, tid, thread_num); + std::vector in_shape{2, 4, 4, 3}; + 
std::vector out_shape{1, 1, 1, 1}; + int axes[4] = {0, 1, 2, 3}; + int axis_num = 4; + bool reduce_to_end = false; + float coeff = 0.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceMean, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 1; CompareOutputData(out, correct, output_size, err_tol); @@ -138,20 +211,24 @@ TEST_F(TestReduceFp32, Sum) { float correct[24] = {72.0, 76.0, 80.0, 84.0, 88.0, 92.0, 96.0, 100.0, 104.0, 108.0, 112.0, 116.0, 264.0, 268.0, 272.0, 276.0, 280.0, 284.0, 288.0, 292.0, 296.0, 300.0, 304.0, 308.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 4, 3}; + int axes[1] = {1}; + int axis_num = 1; float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - (void)ReduceSum(outer_size, inner_size, axis_size, in, out, tid, thread_num); + bool reduce_to_end = false; + float coeff = 1.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceSum, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); } -TEST_F(TestReduceFp32, Sum2Thread) { +// sum reduce_to_end +TEST_F(TestReduceFp32, Sum2) { /* 2*4*4*3 NHWC */ float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, @@ -159,23 +236,51 @@ TEST_F(TestReduceFp32, Sum2Thread) { 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; - float correct[24] = {72.0, 76.0, 80.0, 84.0, 88.0, 92.0, 96.0, 100.0, 104.0, 108.0, 112.0, 116.0, - 264.0, 268.0, 
272.0, 276.0, 280.0, 284.0, 288.0, 292.0, 296.0, 300.0, 304.0, 308.0}; - - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - - float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - thread_num = 2; - tid = 0; - (void)ReduceSum(outer_size, inner_size, axis_size, in, out, tid, thread_num); - tid = 1; - (void)ReduceSum(outer_size, inner_size, axis_size, in, out, tid, thread_num); + float correct[32] = {6.0, 24.0, 42.0, 60.0, 78.0, 96.0, 114.0, 132.0, 150.0, 168.0, 186.0, + 204.0, 222.0, 240.0, 258.0, 276.0, 294.0, 312.0, 330.0, 348.0, 366.0, 384.0, + 402.0, 420.0, 438.0, 456.0, 474.0, 492.0, 510.0, 528.0, 546.0, 564.0}; + + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 4, 4, 1}; + int axes[1] = {-1}; + int axis_num = 1; + float out[32] = {0}; + bool reduce_to_end = true; + float coeff = 2.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceSum, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); + + int output_size = 32; + CompareOutputData(out, correct, output_size, err_tol); +} - int output_size = 24; +TEST_F(TestReduceFp32, Sum3) { + /* 2*4*4*3 NHWC */ + float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; + float correct[32] = {3.0, 12.0, 21.0, 30.0, 39.0, 48.0, 57.0, 66.0, 75.0, 84.0, 93.0, + 102.0, 111.0, 120.0, 129.0, 138.0, 147.0, 156.0, 165.0, 174.0, 183.0, 192.0, + 201.0, 210.0, 219.0, 228.0, 237.0, 246.0, 255.0, 
264.0, 273.0, 282.0}; + + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 4, 4, 1}; + int axes[1] = {-1}; + int axis_num = 1; + float out[32] = {0}; + bool reduce_to_end = false; + float coeff = 0.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceSum, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); + + int output_size = 32; CompareOutputData(out, correct, output_size, err_tol); } @@ -189,38 +294,16 @@ TEST_F(TestReduceFp32, SumAllAxis) { 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; float correct[1] = {4560}; float out[1] = {0}; + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{1, 1, 1, 1}; + int axes[4] = {0}; + int axis_num = 4; + bool reduce_to_end = true; + float coeff = 1.0f; + thread_num_ = 2; - int input_shape[4] = {2, 4, 4, 3}; - int outer_size = 1; - int inner_size = 48; - int axis_size = 2; - float *src = in; - float dst1[48] = {0}; - MS_ASSERT(dst != nullptr); - (void)ReduceSum(outer_size, inner_size, axis_size, src, dst1, tid, thread_num); - - input_shape[0] = 1; // 1 4 4 3 - outer_size = 1; - inner_size = 12; - axis_size = 4; - src = dst1; - float dst2[12] = {0}; - (void)ReduceSum(outer_size, inner_size, axis_size, src, dst2, tid, thread_num); - - input_shape[1] = 1; // 1 1 4 3 - outer_size = 1; - inner_size = 3; - axis_size = 4; - src = dst2; - float dst3[3] = {0}; - (void)ReduceSum(outer_size, inner_size, axis_size, src, dst3, tid, thread_num); - - input_shape[2] = 1; // 1 1 1 3 - outer_size = 1; - inner_size = 1; - axis_size = 3; - src = dst3; - (void)ReduceSum(outer_size, inner_size, axis_size, src, out, tid, thread_num); + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceSum, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 1; CompareOutputData(out, correct, output_size, err_tol); @@ -237,14 +320,17 @@ TEST_F(TestReduceFp32, Max) { float correct[24] = {36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 
44.0, 45.0, 46.0, 47.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 4, 3}; + int axes[1] = {1}; + int axis_num = 1; float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - (void)ReduceMax(outer_size, inner_size, axis_size, in, out, tid, thread_num); + bool reduce_to_end = false; + float coeff = 1.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceMax, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -261,14 +347,17 @@ TEST_F(TestReduceFp32, Min) { float correct[24] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 4, 3}; + int axes[1] = {1}; + int axis_num = 1; float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - (void)ReduceMin(outer_size, inner_size, axis_size, in, out, tid, thread_num); + bool reduce_to_end = false; + float coeff = 1.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceMin, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -286,14 +375,17 @@ TEST_F(TestReduceFp32, Prod) { 225280.0, 280665.0, 344080.0, 416185.0, 17418240.0, 18546744.0, 19728400.0, 20964824.0, 22257664.0, 23608584.0, 25019280.0, 26491464.0, 28026880.0, 29627288.0, 31294480.0, 33030264.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; - + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 1, 4, 3}; + int axes[1] = {1}; + int axis_num = 1; 
float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - (void)ReduceProd(outer_size, inner_size, axis_size, in, out, tid, thread_num); + bool reduce_to_end = false; + float coeff = 1.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceProd, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); int output_size = 24; CompareOutputData(out, correct, output_size, err_tol); @@ -307,20 +399,78 @@ TEST_F(TestReduceFp32, SumSquare) { 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; - float correct[24] = {2016.0, 2164.0, 2320.0, 2484.0, 2656.0, 2836.0, 3024.0, 3220.0, - 3424.0, 3636.0, 3856.0, 4084.0, 18144.0, 18676.0, 19216.0, 19764.0, - 20320.0, 20884.0, 21456.0, 22036.0, 22624.0, 23220.0, 23824.0, 24436.0}; + float correct[8] = {1012.0, 7636.0, 21172.0, 41620.0, 68980.0, 103252.0, 144436.0, 192532.0}; - int input_shape[4] = {2, 4, 4, 3}; - // int output_shape[4] = {2, 1, 4, 3}; + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 4, 1, 1}; + int axes[1] = {2}; + int axis_num = 1; + float out[8] = {0}; + bool reduce_to_end = true; + float coeff = 2.0f; + thread_num_ = 2; - float out[24] = {0}; - int outer_size = 2; - int inner_size = 12; - int axis_size = 4; - (void)ReduceSumSquare(outer_size, inner_size, axis_size, in, out, tid, thread_num); + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceSumSquare, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); - int output_size = 24; + int output_size = 8; + CompareOutputData(out, correct, output_size, err_tol); +} + +TEST_F(TestReduceFp32, SumSquare2) { + /* 2*4*4*3 NHWC */ + float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 
20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; + float correct[32] = {10.0, 100.0, 298.0, 604.0, 1018.0, 1540.0, 2170.0, 2908.0, + 3754.0, 4708.0, 5770.0, 6940.0, 8218.0, 9604.0, 11098.0, 12700.0, + 14410.0, 16228.0, 18154.0, 20188.0, 22330.0, 24580.0, 26938.0, 29404.0, + 31978.0, 34660.0, 37450.0, 40348.0, 43354.0, 46468.0, 49690.0, 53020.0}; + + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 4, 4, 1}; + int axes[1] = {3}; + int axis_num = 1; + float out[32] = {0}; + bool reduce_to_end = true; + float coeff = 2.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceSumSquare, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); + + int output_size = 32; + CompareOutputData(out, correct, output_size, err_tol); +} + +TEST_F(TestReduceFp32, ASum) { + /* 2*4*4*3 NHWC */ + float in[96] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0}; + float correct[32] = {3.0, 12.0, 21.0, 30.0, 39.0, 48.0, 57.0, 66.0, 75.0, 84.0, 93.0, + 102.0, 111.0, 120.0, 129.0, 138.0, 147.0, 156.0, 165.0, 174.0, 183.0, 
192.0, + 201.0, 210.0, 219.0, 228.0, 237.0, 246.0, 255.0, 264.0, 273.0, 282.0}; + + std::vector in_shape{2, 4, 4, 3}; + std::vector out_shape{2, 4, 4, 1}; + int axes[1] = {3}; + int axis_num = 1; + float out[32] = {0}; + bool reduce_to_end = true; + float coeff = 1.0f; + thread_num_ = 2; + + Prepare(in_shape, out_shape, in, out, ReduceMode_ReduceASum, axes, axis_num, reduce_to_end, coeff); + kernel_->Run(); + + int output_size = 32; CompareOutputData(out, correct, output_size, err_tol); } } // namespace mindspore