From 4d6f6181d3776aad9d2f860199df634d83cdf59a Mon Sep 17 00:00:00 2001
From: zhaozhenlong <zhaozhenlong1@huawei.com>
Date: Sat, 15 Aug 2020 11:02:30 +0800
Subject: [PATCH] Reduce int8

---
 mindspore/lite/src/populate_parameter.cc      |   2 +-
 .../runtime/kernel/arm/base/reduce_base.cc    | 199 ++++++++
 .../src/runtime/kernel/arm/base/reduce_base.h |  54 ++
 .../src/runtime/kernel/arm/base/resize_base.h |   2 +-
 .../src/runtime/kernel/arm/fp32/reduce.cc     | 132 +----
 .../lite/src/runtime/kernel/arm/fp32/reduce.h |  34 +-
 .../runtime/kernel/arm/int8/reduce_int8.cc    | 323 ++++++++++++
 .../src/runtime/kernel/arm/int8/reduce_int8.h |  98 ++++
 .../src/runtime/kernel/arm/nnacl/errorcode.h  |   2 +
 .../runtime/kernel/arm/nnacl/fp32/reduce.h    |   9 +-
 .../kernel/arm/nnacl/int8/reduce_int8.c       | 467 ++++++++++++++++++
 .../kernel/arm/nnacl/int8/reduce_int8.h       |  53 ++
 .../kernel/arm/nnacl/quantization/quantize.h  |  20 +
 .../kernel/arm/nnacl/reduce_parameter.h       |  30 ++
 .../kernel/arm/nnacl/resize_parameter.h       |   2 +-
 .../arm/fp32/resize_bilinear_fp32_tests.cc    | 271 +++++-----
 .../resize_nearest_neighbor_fp32_tests.cc     | 239 ++++++---
 .../kernel/arm/int8/reduce_int8_tests.cc      | 355 +++++++++++++
 .../arm/int8/resize_bilinear_int8_tests.cc    |   6 +-
 .../resize_nearest_neighbor_int8_tests.cc     |   4 +-
 20 files changed, 1940 insertions(+), 362 deletions(-)
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.c
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.h
 create mode 100644 mindspore/lite/src/runtime/kernel/arm/nnacl/reduce_parameter.h
 create mode 100644 mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc

diff --git a/mindspore/lite/src/populate_parameter.cc b/mindspore/lite/src/populate_parameter.cc
index 80a00476cf..b6a20936b0 100644
--- a/mindspore/lite/src/populate_parameter.cc
+++ b/mindspore/lite/src/populate_parameter.cc
@@ -38,7 +38,7 @@
 #include "src/runtime/kernel/arm/nnacl/softmax_parameter.h"
 #include "src/runtime/kernel/arm/nnacl/tile.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/topk.h"
-#include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
+#include "src/runtime/kernel/arm/nnacl/reduce_parameter.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/activation.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/arithmetic.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
new file mode 100644
index 0000000000..901113f12c
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
@@ -0,0 +1,199 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "schema/model_generated.h"
+#include "src/kernel_registry.h"
+#include "include/errorcode.h"
+#include "src/runtime/runtime_api.h"
+#include "src/runtime/kernel/arm/base/reduce_base.h"
+#include "src/runtime/kernel/arm/fp32/reduce.h"
+#include "src/runtime/kernel/arm/int8/reduce_int8.h"
+
+using mindspore::kernel::KERNEL_ARCH::kCPU;
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Mean;
+using mindspore::schema::PrimitiveType_Reduce;
+
+namespace mindspore::kernel {
+namespace {
+constexpr size_t kInputNum = 1;   // number of input tensors CheckInputsOutputs() requires
+constexpr size_t kOutputNum = 1;  // number of output tensors CheckInputsOutputs() requires
+}  // namespace
+
+int ReduceBaseCPUKernel::CheckInputsOutputs() {  // RET_OK only for exactly one non-null input and one non-null output
+  if (in_tensors_.size() != kInputNum) {
+    MS_LOG(ERROR) << "Reduce inputs size should be " << kInputNum << " but got " << in_tensors_.size();
+    return RET_ERROR;  // wrong number of input tensors
+  }
+  if (out_tensors_.size() != kOutputNum) {
+    MS_LOG(ERROR) << "Reduce outputs size should be " << kOutputNum << " but got " << out_tensors_.size();
+    return RET_ERROR;  // wrong number of output tensors
+  }
+  auto input = in_tensors_.at(0);
+  if (input == nullptr) {
+    MS_LOG(ERROR) << "Reduce input is nullptr";
+    return RET_NULL_PTR;  // the input slot holds a null tensor pointer
+  }
+  auto output = out_tensors_.at(0);
+  if (output == nullptr) {
+    MS_LOG(ERROR) << "Reduce output is nullptr";
+    return RET_NULL_PTR;  // the output slot holds a null tensor pointer
+  }
+  return RET_OK;
+}
+
+int ReduceBaseCPUKernel::CheckParameters() {  // validates axes_/num_axes_ against the input rank and normalizes them
+  size_t input_rank = in_tensors_.at(0)->shape().size();
+  if (static_cast<size_t>(num_axes_) > input_rank) {  // a negative count wraps around and is rejected too
+    MS_LOG(ERROR) << "Reduce op invalid num of reduce axes " << num_axes_ << " larger than input rank " << input_rank;
+    return RET_ERROR;
+  }
+  for (auto i = 0; i < num_axes_; i++) {
+    if (axes_[i] < -static_cast<int>(input_rank) || axes_[i] >= static_cast<int>(input_rank)) {
+      MS_LOG(ERROR) << "Reduce got invalid axis " << axes_[i] << ", axis should be in ["
+                    << -static_cast<int>(input_rank) << ", " << input_rank - 1 << "].";
+      return RET_ERROR;
+    }
+    if (axes_[i] < 0) {
+      axes_[i] += static_cast<int>(input_rank);  // rewrite a negative axis as its non-negative equivalent
+    }
+  }
+  // NOTE(review): axes_ has REDUCE_MAX_AXES_NUM slots, but no check here bounds num_axes_ or input_rank by it.
+  if (num_axes_ == 0) {  // no axes given: reduce over every dimension, i.e. axes 0..rank-1
+    for (int i = 0; i < input_rank; i++) {
+      axes_[i] = i;
+    }
+    num_axes_ = static_cast<int>(input_rank);
+  }
+
+  return RET_OK;
+}
+
+int ReduceBaseCPUKernel::Init() {  // copies the reduce settings out of op_parameter_ and validates them
+  auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_);
+  if (reduce_param == nullptr) {
+    return RET_NULL_PTR;
+  }
+  num_axes_ = reduce_param->num_axes_;
+  mode_ = reduce_param->mode_;
+  memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_));  // whole array, not only num_axes_ entries
+  // Tensor counts/pointers are checked first because CheckParameters() reads the shape of in_tensors_.at(0).
+  auto ret = CheckInputsOutputs();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  ret = CheckParameters();  // may rewrite axes_ and num_axes_ (negative axes, empty axis list)
+  if (ret != RET_OK) {
+    return ret;
+  }
+
+  return RET_OK;
+}
+
+kernel::LiteKernel *CpuReduceFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                               const std::vector<lite::tensor::Tensor *> &outputs,
+                                               OpParameter *opParameter, const lite::Context *ctx,
+                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
+  MS_ASSERT(opParameter != nullptr);  // both assertions are repeated below as explicit runtime checks
+  MS_ASSERT(desc.type == schema::PrimitiveType_Reduce);
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "Reduce opParameter nullptr";
+    return nullptr;
+  }
+  if (desc.type != schema::PrimitiveType_Reduce) {  // this creator only builds kernels for the Reduce primitive
+    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type;
+    return nullptr;
+  }
+  auto *kernel = new (std::nothrow) ReduceCPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {  // nothrow new reports allocation failure as nullptr
+    MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;  // a kernel whose Init() failed is destroyed here and never returned
+    return nullptr;
+  }
+  return kernel;  // fp32 reduce kernel, already initialized
+}
+
+kernel::LiteKernel *CpuMeanFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                             const std::vector<lite::tensor::Tensor *> &outputs,
+                                             OpParameter *opParameter, const lite::Context *ctx,
+                                             const kernel::KernelKey &desc, const lite::Primitive *primitive) {
+  MS_ASSERT(opParameter != nullptr);  // both assertions are repeated below as explicit runtime checks
+  MS_ASSERT(desc.type == schema::PrimitiveType_Mean);
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "Reduce opParameter nullptr";
+    return nullptr;
+  }
+  if (desc.type != schema::PrimitiveType_Mean) {  // this creator only builds kernels for the Mean primitive
+    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Mean, got " << desc.type;
+    return nullptr;
+  }
+  auto *kernel = new (std::nothrow) ReduceCPUKernel(opParameter, inputs, outputs, ctx, primitive);  // same class as Reduce
+  if (kernel == nullptr) {  // nothrow new reports allocation failure as nullptr
+    MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;  // a kernel whose Init() failed is destroyed here and never returned
+    return nullptr;
+  }
+  return kernel;  // fp32 reduce kernel serving the Mean primitive, already initialized
+}
+
+kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
+                                               const std::vector<lite::tensor::Tensor *> &outputs,
+                                               OpParameter *opParameter, const lite::Context *ctx,
+                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
+  MS_ASSERT(opParameter != nullptr);  // both assertions are repeated below as explicit runtime checks
+  MS_ASSERT(desc.type == schema::PrimitiveType_Reduce);
+  if (opParameter == nullptr) {
+    MS_LOG(ERROR) << "Reduce opParameter nullptr";
+    return nullptr;
+  }
+  if (desc.type != schema::PrimitiveType_Reduce) {  // this creator only builds kernels for the Reduce primitive
+    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type;
+    return nullptr;
+  }
+  auto *kernel = new (std::nothrow) ReduceInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
+  if (kernel == nullptr) {  // nothrow new reports allocation failure as nullptr
+    MS_LOG(ERROR) << "Reduce new ReduceInt8CPUKernel failed.";
+    return nullptr;
+  }
+  auto ret = kernel->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
+                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
+    delete kernel;  // a kernel whose Init() failed is destroyed here and never returned
+    return nullptr;
+  }
+  return kernel;  // int8 reduce kernel, already initialized
+}
+
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)  // fp32 Reduce
+REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)  // fp32 Mean
+REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)  // int8 Reduce
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h
new file mode 100644
index 0000000000..3410dfff0f
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_REDUCE_BASE_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_REDUCE_BASE_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "ir/anf.h"
+#include "nnacl/reduce_parameter.h"
+
+namespace mindspore::kernel {
+class ReduceBaseCPUKernel : public LiteKernel {  // parameter handling shared by the fp32 and int8 reduce kernels
+ public:
+  ReduceBaseCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
+                      const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
+                      const lite::Primitive *primitive)
+      : LiteKernel(param, inputs, outputs, ctx, primitive) {}
+  virtual ~ReduceBaseCPUKernel() = default;
+
+  int Init() override;  // loads axes_/num_axes_/mode_ from the op parameter and validates them
+  int ReSize() override { return 0; };  // nothing to recompute at this level
+
+ private:
+  int CheckInputsOutputs();  // exactly one non-null input and one non-null output
+  int CheckParameters();     // axis range checks; also normalizes axes_ and num_axes_
+
+ protected:
+  int axes_[REDUCE_MAX_AXES_NUM];  // axes to reduce; non-negative once CheckParameters() succeeded
+  int num_axes_;                   // number of valid entries in axes_
+  int mode_;                       // reduce mode, compared against schema::ReduceMode values by subclasses
+
+ protected:
+  int outer_size_;  // product of the dimensions before the axis currently being reduced
+  int inner_size_;  // product of the dimensions after the axis currently being reduced
+  int axis_size_;   // extent of the axis currently being reduced
+  std::vector<int> tmp_shape_;  // working shape; each reduced axis is set to 1 as the steps proceed
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_REDUCE_BASE_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/base/resize_base.h b/mindspore/lite/src/runtime/kernel/arm/base/resize_base.h
index 79ca034cb5..85a3537ba4 100644
--- a/mindspore/lite/src/runtime/kernel/arm/base/resize_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/resize_base.h
@@ -31,7 +31,7 @@ class ResizeBaseCPUKernel : public LiteKernel {
                       const lite::Primitive *primitive)
       : LiteKernel(parameter, inputs, outputs, ctx, primitive), context_(ctx) {}
 
-  ~ResizeBaseCPUKernel() = default;
+  virtual ~ResizeBaseCPUKernel() = default;
 
   int Init() override;
   int ReSize() override { return 0; };
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc
index e9929aced4..64c4a07253 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc
@@ -20,6 +20,7 @@
 #include "include/errorcode.h"
 #include "src/runtime/runtime_api.h"
 #include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
+#include "src/runtime/kernel/arm/base/reduce_base.h"
 
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
@@ -37,69 +38,9 @@ using mindspore::schema::ReduceMode_ReduceSum;
 using mindspore::schema::ReduceMode_ReduceSumSquare;
 
 namespace mindspore::kernel {
-namespace {
-constexpr size_t kInputNum = 1;
-constexpr size_t kOutputNum = 1;
-}  // namespace
-
-int ReduceCPUKernel::CheckInputsOutputs() {
-  if (in_tensors_.size() != kInputNum) {
-    MS_LOG(ERROR) << "Reduce inputs size should be " << kInputNum << " but got " << in_tensors_.size();
-    return RET_ERROR;
-  }
-  if (out_tensors_.size() != kOutputNum) {
-    MS_LOG(ERROR) << "Reduce outputs size should be " << kOutputNum << " but got " << out_tensors_.size();
-    return RET_ERROR;
-  }
-  auto input = in_tensors_.at(0);
-  if (input == nullptr) {
-    MS_LOG(ERROR) << "Reduce input is nullptr";
-    return RET_NULL_PTR;
-  }
-  auto output = out_tensors_.at(0);
-  if (output == nullptr) {
-    MS_LOG(ERROR) << "Reduce output is nullptr";
-    return RET_NULL_PTR;
-  }
-  return RET_OK;
-}
-
-int ReduceCPUKernel::CheckParameters() {
-  size_t input_rank = in_tensors_.at(0)->shape().size();
-  if (static_cast<size_t>(num_axes_) > input_rank) {
-    MS_LOG(ERROR) << "Reduce num of reduce axes " << num_axes_ << " larger than input rank " << input_rank;
-    return RET_ERROR;
-  }
-  for (auto i = 0; i < num_axes_; i++) {
-    if (axes_[i] < -static_cast<int>(input_rank) || axes_[i] >= static_cast<int>(input_rank)) {
-      MS_LOG(ERROR) << "Reduce got invalid axis " << axes_[i] << ", axis should be in ["
-                    << -static_cast<int>(input_rank) << ", " << input_rank - 1 << "].";
-      return RET_ERROR;
-    }
-    if (axes_[i] < 0) {
-      axes_[i] += static_cast<int>(input_rank);
-    }
-  }
-
-  if (num_axes_ == 0) {
-    for (int i = 0; i < input_rank; i++) {
-      axes_[i] = i;
-    }
-  }
-
-  return RET_OK;
-}
 
 int ReduceCPUKernel::Init() {
-  if (context_->infer_shape_interrupt_ && !context_->running_) {
-    set_need_reinit();
-    return RET_OK;
-  }
-  auto ret = CheckInputsOutputs();
-  if (ret != RET_OK) {
-    return ret;
-  }
-  ret = CheckParameters();
+  auto ret = ReduceBaseCPUKernel::Init();
   if (ret != RET_OK) {
     return ret;
   }
@@ -107,7 +48,6 @@ int ReduceCPUKernel::Init() {
   if (ret != RET_OK) {
     return ret;
   }
-
   switch (mode_) {
     case static_cast<int>(ReduceMode_ReduceSum): {
       reducer_ = ReduceSum;
@@ -137,7 +77,10 @@ int ReduceCPUKernel::Init() {
       MS_LOG(ERROR) << "Reduce unsupported reduce mode: " << mode_;
       return RET_ERROR;
   }
-  return RET_OK;
+  if (!InferShapeDone()) {
+    return RET_OK;
+  }
+  return ReSize();
 }
 
 int ReduceCPUKernel::CallReduceUnit(int task_id) {
@@ -225,67 +168,4 @@ int ReduceCPUKernel::MallocTmpBuffer() {
   }
   return RET_OK;
 }
-
-kernel::LiteKernel *CpuReduceFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
-                                               const std::vector<lite::tensor::Tensor *> &outputs,
-                                               OpParameter *opParameter, const lite::Context *ctx,
-                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
-  MS_ASSERT(opParameter != nullptr);
-  MS_ASSERT(desc.type == schema::PrimitiveType_Reduce);
-  if (opParameter == nullptr) {
-    MS_LOG(ERROR) << "Reduce opParameter nullptr";
-    return nullptr;
-  }
-  if (desc.type != schema::PrimitiveType_Reduce) {
-    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type;
-    return nullptr;
-  }
-  auto *kernel = new (std::nothrow)
-    ReduceCPUKernel(reinterpret_cast<ReduceParameter *>(opParameter), inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-kernel::LiteKernel *CpuMeanFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
-                                             const std::vector<lite::tensor::Tensor *> &outputs,
-                                             OpParameter *opParameter, const lite::Context *ctx,
-                                             const kernel::KernelKey &desc, const lite::Primitive *primitive) {
-  MS_ASSERT(opParameter != nullptr);
-  MS_ASSERT(desc.type == schema::PrimitiveType_Mean);
-  if (opParameter == nullptr) {
-    MS_LOG(ERROR) << "Reduce opParameter nullptr";
-    return nullptr;
-  }
-  if (desc.type != schema::PrimitiveType_Mean) {
-    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Mean, got " << desc.type;
-    return nullptr;
-  }
-  auto *kernel = new (std::nothrow)
-    ReduceCPUKernel(reinterpret_cast<ReduceParameter *>(opParameter), inputs, outputs, ctx, primitive);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
-                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
-REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h
index 2857ee9baf..5b05b76598 100644
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h
@@ -21,25 +21,20 @@
 #include "src/lite_kernel.h"
 
 #include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
+#include "src/runtime/kernel/arm/base/reduce_base.h"
 #include "ir/anf.h"
 using mindspore::schema::ReduceMode;
 
 namespace mindspore::kernel {
-class ReduceCPUKernel : public LiteKernel {
+class ReduceCPUKernel : public ReduceBaseCPUKernel {
   typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                          const int *src_shape, float *dst_data, const int tid, const int thread_num);
 
  public:
-  ReduceCPUKernel(ReduceParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
+  ReduceCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
                   const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                   const lite::Primitive *primitive)
-      : LiteKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs, ctx, primitive),
-        context_(ctx),
-        keep_dims_(param->keep_dims_),
-        num_axes_(param->num_axes_),
-        mode_(param->mode_) {
-    memcpy(axes_, param->axes_, sizeof(param->axes_));
-  }
+      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
   ~ReduceCPUKernel() {
     for (auto i = 0; i < data_buffers_.size(); i++) {
       float *buffer = data_buffers_[i];
@@ -58,26 +53,13 @@ class ReduceCPUKernel : public LiteKernel {
   int CallReduceUnit(int task_id);
 
  private:
-  int CheckInputsOutputs();
-  int CheckParameters();
-  int MallocTmpBuffer();
-
- private:
-  const lite::Context *context_ = nullptr;
-  bool keep_dims_;
-  int axes_[REDUCE_MAX_AXES_NUM];
-  int num_axes_;
-  int mode_;
-
- private:
+  Reducer reducer_;
   std::vector<float *> data_buffers_;
-  int outer_size_;
-  int inner_size_;
-  int axis_size_;
-  std::vector<int> tmp_shape_;
   const float *src_data_;
   float *dst_data_;
-  Reducer reducer_;
+
+ private:
+  int MallocTmpBuffer();
 };
 }  // namespace mindspore::kernel
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
new file mode 100644
index 0000000000..5a0dc32a71
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
@@ -0,0 +1,323 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include "schema/model_generated.h"
+#include "src/runtime/runtime_api.h"
+#include "src/kernel_registry.h"
+#include "nnacl/quantization/quantize.h"
+#include "include/errorcode.h"
+#include "src/runtime/kernel/arm/int8/reduce_int8.h"
+
+using mindspore::lite::KernelRegistrar;
+using mindspore::lite::RET_ERROR;
+using mindspore::lite::RET_NULL_PTR;
+using mindspore::lite::RET_OK;
+using mindspore::schema::PrimitiveType_Reduce;
+using mindspore::schema::ReduceMode_ReduceMax;
+using mindspore::schema::ReduceMode_ReduceMean;
+using mindspore::schema::ReduceMode_ReduceMin;
+using mindspore::schema::ReduceMode_ReduceProd;
+using mindspore::schema::ReduceMode_ReduceSum;
+using mindspore::schema::ReduceMode_ReduceSumSquare;
+
+namespace mindspore::kernel {
+int ReduceInt8CPUKernel::Init() {  // selects the int8 reducers for mode_ after the shared parameter checks
+  auto ret = ReduceBaseCPUKernel::Init();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  ret = MallocTmpBuffer();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  ret = CalculateQuantArgs();
+  if (ret != RET_OK) {
+    return ret;
+  }
+  // reducer_ serves every reduce step except the final one; last_reducer_ serves the final step (see Run()).
+  switch (mode_) {
+    case static_cast<int>(ReduceMode_ReduceMean): {
+      reducer_ = ReduceMeanInt8;
+      last_reducer_ = ReduceMeanLastAxis;
+      break;
+    }
+    case static_cast<int>(ReduceMode_ReduceSum): {
+      reducer_ = ReduceSumInt8;
+      last_reducer_ = ReduceSumLastAxis;
+      break;
+    }
+
+    case static_cast<int>(ReduceMode_ReduceMax): {
+      reducer_ = ReduceMaxInt8;
+      last_reducer_ = ReduceMaxLastAxis;
+      break;
+    }
+    case static_cast<int>(ReduceMode_ReduceMin): {
+      reducer_ = ReduceMinInt8;
+      last_reducer_ = ReduceMinLastAxis;
+      break;
+    }
+    case static_cast<int>(ReduceMode_ReduceProd): {
+      reducer_ = ReduceProdInt8;
+      last_reducer_ = ReduceProdLastAxis;
+      break;
+    }
+    case static_cast<int>(ReduceMode_ReduceSumSquare): {
+      // With several reduce axes the sum-square result depends on the order in which the axes are reduced,
+      // e.g. axes [2, 3] give a different output than axes [3, 2].
+      reducer_ = ReduceSumSquareInt8;
+      last_reducer_ = ReduceSumSquareLastAxis;
+      break;
+    }
+    default:
+      MS_LOG(ERROR) << "Reduce unsupported reduce mode: " << mode_;
+      return RET_ERROR;
+  }
+  if (!InferShapeDone()) {  // shapes are not inferred yet, so ReSize() is not called from here
+    return RET_OK;
+  }
+  return ReSize();
+}
+
+int ReduceInt8CPUKernel::CalculateQuantArgs() {  // fills quant_arg_ and the per-step multiplier lists for mode_
+  lite::tensor::Tensor *input = in_tensors_.at(0);
+  lite::tensor::Tensor *output = out_tensors_.at(0);
+  if (input->GetQuantParams().empty() || output->GetQuantParams().empty()) {  // front() below needs an element
+    MS_LOG(ERROR) << "Reduce int8 input or output tensor has no quant params.";
+    return RET_ERROR;
+  }
+  quant_arg_.in_scale_ = input->GetQuantParams().front().scale;
+  quant_arg_.in_zp_ = input->GetQuantParams().front().zeroPoint;
+  quant_arg_.out_scale_ = output->GetQuantParams().front().scale;
+  quant_arg_.out_zp_ = output->GetQuantParams().front().zeroPoint;
+  // (quant_out - out_zp) * out_scale = (quant_in - in_zp) * in_scale
+  const double input_output_multiplier = quant_arg_.in_scale_ / quant_arg_.out_scale_;
+  int shift;  // written by QuantizeMultiplierSmallerThanOne; its sign selects the left or the right shift field
+  QuantizeMultiplierSmallerThanOne(input_output_multiplier, &quant_arg_.in_out_multiplier_, &shift);
+  quant_arg_.in_out_left_shift_ = shift < 0 ? -shift : 0;
+  quant_arg_.in_out_right_shift_ = shift > 0 ? shift : 0;
+
+  // (quant_out - zp_out) * scale_out = sum((quant_in - zp) * scale_in) * (1/num) for each axis in axes
+  // quant_out = sum(quant_in - zp) * (scale_in/scale_out) * (1/num)
+  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
+    for (auto i = 0; i < num_axes_; i++) {  // one 1/axis_size multiplier per reduce axis
+      auto axis = axes_[i];
+      double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
+      QuantMulArg *qm = new (std::nothrow) QuantMulArg;
+      if (qm == nullptr) {
+        MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
+        return RET_NULL_PTR;
+      }
+      QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
+      qm->left_shift_ = shift < 0 ? -shift : 0;
+      qm->right_shift_ = shift > 0 ? shift : 0;
+      mean_multipliers_.push_back(qm);
+    }
+  }
+
+  // (quant_out - zp) * scale_out = prod(quant_in - zp) * scale_in^num
+  // quant_out = prod(quant_in - zp) * (scale_in^num/scale_out) + zp_out
+  // the factor is split as scale_in^(num-1) * (scale_in/scale_out); only the first part is stored per axis here
+  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceProd)) {
+    for (auto i = 0; i < num_axes_; i++) {
+      int axis_size = in_tensors_.at(0)->shape()[axes_[i]];
+      QuantMulArg *qm = new (std::nothrow) QuantMulArg;
+      if (qm == nullptr) {
+        MS_LOG(ERROR) << "ReduceProd new QuantMulArg failed.";
+        return RET_NULL_PTR;
+      }
+      double prod_multiplier = pow(quant_arg_.in_scale_, axis_size - 1);
+      QuantizeMultiplierSmallerThanOne(prod_multiplier, &qm->multiplier_, &shift);
+      qm->left_shift_ = shift < 0 ? -shift : 0;
+      qm->right_shift_ = shift > 0 ? shift : 0;
+      prod_multipliers_.push_back(qm);
+    }
+  }
+
+  // (quant_out - zp) * scale_out = sum((quant_in - zp)^2 * scale_in^2)
+  // quant_out = sum((quant_in - zp)^2) * scale_in^2 / scale_out + zp_out
+  // every step but the last uses scale_in; the last step uses scale_in * scale_in / scale_out
+  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceSumSquare)) {
+    for (auto i = 0; i < num_axes_ - 1; i++) {
+      QuantMulArg *qm = new (std::nothrow) QuantMulArg;
+      if (qm == nullptr) {
+        MS_LOG(ERROR) << "ReduceSumSquare new QuantMulArg failed.";
+        return RET_NULL_PTR;
+      }
+      double sumsquare_multiplier = quant_arg_.in_scale_;
+      QuantizeMultiplierSmallerThanOne(sumsquare_multiplier, &qm->multiplier_, &shift);
+      qm->left_shift_ = shift < 0 ? -shift : 0;
+      qm->right_shift_ = shift > 0 ? shift : 0;
+      sum_square_multipliers_.push_back(qm);
+    }
+
+    QuantMulArg *qm = new (std::nothrow) QuantMulArg;  // multiplier for the final reduce step
+    if (qm == nullptr) {
+      MS_LOG(ERROR) << "ReduceSumSquare new QuantMulArg failed.";
+      return RET_NULL_PTR;
+    }
+    double sumsquare_multiplier = quant_arg_.in_scale_ * quant_arg_.in_scale_ / quant_arg_.out_scale_;
+    QuantizeMultiplierSmallerThanOne(sumsquare_multiplier, &qm->multiplier_, &shift);
+    qm->left_shift_ = shift < 0 ? -shift : 0;
+    qm->right_shift_ = shift > 0 ? shift : 0;
+    sum_square_multipliers_.push_back(qm);
+  }
+  return RET_OK;
+}
+
+int ReduceInt8CPUKernel::MallocTmpBuffer() {  // one int32 buffer per reduce step except the last
+  auto input_shape = in_tensors_.at(0)->shape();
+  for (auto i = 0; i < num_axes_ - 1; i++) {
+    int axis = axes_[i];
+    size_t size = 1;  // element count once this axis (and every axis handled before it) is collapsed to 1
+    for (auto j = 0; j < input_shape.size(); j++) {
+      if (static_cast<size_t>(axis) != j) {
+        size *= input_shape[j];
+      }
+    }
+    int32_t *buffer = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
+    if (buffer == nullptr) {
+      MS_LOG(ERROR) << "Malloc data failed.";
+      return RET_ERROR;
+    }
+    data_buffers_.emplace_back(buffer);
+    input_shape[axis] = 1;
+  }
+  // The int32 staging copy of the input is built by Run(): the input data may be missing or stale at Init() time.
+  begin_src_data_ = nullptr;
+  return RET_OK;
+}
+
+int ReduceInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {  // worker entry; cdata is the kernel
+  auto reduce = reinterpret_cast<ReduceInt8CPUKernel *>(cdata);
+  auto error_code = reduce->CallReduceUnit(task_id);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+int ReduceInt8CPUKernel::Run() {
+  auto prepare_ret = Prepare();
+  if (prepare_ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
+    return prepare_ret;
+  }
+
+  // Widen the current int8 input to int32 on every run; the staging buffer is released again before returning.
+  auto input = in_tensors_.at(0);
+  auto input_data = reinterpret_cast<int8_t *>(input->Data());
+  if (input_data == nullptr) {
+    MS_LOG(ERROR) << "Reduce input data is nullptr";
+    return RET_NULL_PTR;
+  }
+  free(begin_src_data_);  // non-null only if an earlier Run() failed midway; free(nullptr) is a no-op
+  begin_src_data_ = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * input->ElementsNum()));
+  if (begin_src_data_ == nullptr) {
+    return RET_NULL_PTR;
+  }
+  for (auto i = 0; i < input->ElementsNum(); i++) {
+    begin_src_data_[i] = static_cast<int32_t>(input_data[i]);
+  }
+
+  is_last_axis_ = false;
+  tmp_shape_ = in_tensors_.at(0)->shape();
+  src_data_ = begin_src_data_;
+
+  for (int i = 0; i < data_buffers_.size(); ++i) {  // every reduce step except the final one
+    if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
+      quant_arg_.mean_multiplier_ = mean_multipliers_[i]->multiplier_;
+      quant_arg_.mean_left_shift_ = mean_multipliers_[i]->left_shift_;
+      quant_arg_.mean_right_shift_ = mean_multipliers_[i]->right_shift_;
+    } else if (mode_ == static_cast<int>(schema::ReduceMode_ReduceProd)) {
+      quant_arg_.prod_multiplier_ = prod_multipliers_[i]->multiplier_;
+      quant_arg_.prod_left_shift_ = prod_multipliers_[i]->left_shift_;
+      quant_arg_.prod_right_shift_ = prod_multipliers_[i]->right_shift_;
+    } else if (mode_ == static_cast<int>(schema::ReduceMode_ReduceSumSquare)) {
+      quant_arg_.sum_square_multiplier_ = sum_square_multipliers_[i]->multiplier_;
+      quant_arg_.sum_square_left_shift_ = sum_square_multipliers_[i]->left_shift_;
+      quant_arg_.sum_square_right_shift_ = sum_square_multipliers_[i]->right_shift_;
+    }
+    dst_data_ = data_buffers_[i];
+    int axis = axes_[i];
+    outer_size_ = 1;
+    for (int j = 0; j < axis; j++) {
+      outer_size_ *= tmp_shape_[j];
+    }
+    inner_size_ = 1;
+    for (int k = axis + 1; k < static_cast<int>(tmp_shape_.size()); k++) {
+      inner_size_ *= tmp_shape_[k];
+    }
+    axis_size_ = tmp_shape_[axis];
+    auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_);
+    if (error_code != RET_OK) {
+      MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
+      return RET_ERROR;
+    }
+    tmp_shape_[axis] = 1;   // the axis just reduced now has extent 1
+    src_data_ = dst_data_;  // the next step consumes this step's int32 result
+  }
+
+  if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
+    quant_arg_.mean_multiplier_ = mean_multipliers_.back()->multiplier_;
+    quant_arg_.mean_left_shift_ = mean_multipliers_.back()->left_shift_;
+    quant_arg_.mean_right_shift_ = mean_multipliers_.back()->right_shift_;
+  } else if (mode_ == static_cast<int>(schema::ReduceMode_ReduceProd)) {
+    quant_arg_.prod_multiplier_ = prod_multipliers_.back()->multiplier_;
+    quant_arg_.prod_left_shift_ = prod_multipliers_.back()->left_shift_;
+    quant_arg_.prod_right_shift_ = prod_multipliers_.back()->right_shift_;
+  } else if (mode_ == static_cast<int>(schema::ReduceMode_ReduceSumSquare)) {
+    quant_arg_.sum_square_multiplier_ = sum_square_multipliers_.back()->multiplier_;
+    quant_arg_.sum_square_left_shift_ = sum_square_multipliers_.back()->left_shift_;
+    quant_arg_.sum_square_right_shift_ = sum_square_multipliers_.back()->right_shift_;
+  }
+  int last_reduce_axis = axes_[num_axes_ - 1];
+  outer_size_ = 1;
+  for (int i = 0; i < last_reduce_axis; i++) {
+    outer_size_ *= tmp_shape_[i];
+  }
+  inner_size_ = 1;
+  for (int i = last_reduce_axis + 1; i < static_cast<int>(tmp_shape_.size()); i++) {
+    inner_size_ *= tmp_shape_[i];
+  }
+  axis_size_ = tmp_shape_[last_reduce_axis];
+  last_dst_data_ = reinterpret_cast<int8_t *>(out_tensors_.at(0)->Data());
+  is_last_axis_ = true;  // makes CallReduceUnit() use last_reducer_ and write to last_dst_data_
+  auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_);
+  if (error_code != RET_OK) {
+    MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
+    return RET_ERROR;
+  }
+
+  free(begin_src_data_);
+  begin_src_data_ = nullptr;
+  return RET_OK;
+}
+
+int ReduceInt8CPUKernel::CallReduceUnit(int task_id) {
+  // Intermediate reduce steps write int32 partial results to dst_data_; only the final step writes the int8
+  // output through last_dst_data_, so it goes through last_reducer_ instead of reducer_.
+  const auto thread_count = context_->thread_num_;
+  if (is_last_axis_) {
+    return last_reducer_(outer_size_, inner_size_, axis_size_, src_data_, last_dst_data_, &quant_arg_, task_id,
+                         thread_count);
+  }
+  return reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, &quant_arg_, task_id,
+                  thread_count);
+}
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
new file mode 100644
index 0000000000..895e9016d9
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
@@ -0,0 +1,98 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_REDUCE_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_REDUCE_INT8_H_
+
+#include <vector>
+#include "src/lite_kernel.h"
+#include "nnacl/reduce_parameter.h"
+#include "nnacl/int8/reduce_int8.h"
+#include "nnacl/quantization/quantize.h"
+#include "ir/anf.h"
+#include "src/runtime/kernel/arm/base/reduce_base.h"
+
+using mindspore::schema::ReduceMode;
+
+namespace mindspore::kernel {
+// Int8 reduce kernel. Passes over non-final axes run through reducer_ (int32 destination); the pass over the
+// last reduced axis runs through last_reducer_ (int8 destination) - see CallReduceUnit().
+// The kernel releases its scratch buffers and multiplier records in the destructor.
+class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
+  typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                         int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+  typedef int (*LastReducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                             int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+
+ public:
+  ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
+                      const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
+                      const lite::Primitive *primitive)
+      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
+  ~ReduceInt8CPUKernel() {
+    // Iterate by reference so that clearing the pointer updates the stored element, not a copy of it.
+    for (auto &buffer : data_buffers_) {
+      free(buffer);  // free(nullptr) is a no-op
+      buffer = nullptr;
+    }
+    // Run() releases begin_src_data_ only after a successful launch; reclaim it here if an error path kept it.
+    free(begin_src_data_);
+    begin_src_data_ = nullptr;
+    for (auto *multipliers : {&mean_multipliers_, &prod_multipliers_, &sum_square_multipliers_}) {
+      for (auto &qm : *multipliers) {
+        delete qm;
+        qm = nullptr;
+      }
+    }
+    src_data_ = nullptr;
+    dst_data_ = nullptr;
+  }
+
+  int Init() override;
+  int ReSize() override { return 0; }
+  int Run() override;
+  int CallReduceUnit(int task_id);
+  int ReduceLastAxis(int task_id);
+
+ public:
+  // Read by CallReduceUnit(): true selects last_reducer_, false selects reducer_.
+  bool is_last_axis_ = true;
+
+ private:
+  int MallocTmpBuffer();
+  int CalculateQuantArgs();
+
+ private:
+  ReduceParameter *param_ = nullptr;
+  ReduceQuantArg quant_arg_;
+
+ private:
+  int32_t *begin_src_data_ = nullptr;  // free()'d by Run() on success, otherwise by the destructor
+  int8_t *last_dst_data_ = nullptr;
+  std::vector<int32_t *> data_buffers_;  // elements are free()'d in the destructor
+  const int32_t *src_data_ = nullptr;
+  int32_t *dst_data_ = nullptr;
+
+  Reducer reducer_ = nullptr;
+  LastReducer last_reducer_ = nullptr;
+  // Multiplier records held by pointer; every element is deleted in the destructor.
+  std::vector<QuantMulArg *> mean_multipliers_;
+  std::vector<QuantMulArg *> prod_multipliers_;
+  std::vector<QuantMulArg *> sum_square_multipliers_;
+};
+}  // namespace mindspore::kernel
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_REDUCE_INT8_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/errorcode.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/errorcode.h
index 2d4553cede..fbe5c6bf47 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/errorcode.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/errorcode.h
@@ -49,6 +49,8 @@ typedef enum ErrorCodeUint8OpEnum {
 
 typedef enum ErrorCodeInt8OpEnum {
   NNACL_ERRCODE_OP_INT8_START = 40000,
+  NNACL_ERRCODE_ADD_OVERFLOW,  // returned by the int8 reducers when an int32 addition would overflow
+  NNACL_ERRCODE_MUL_OVERFLOW,  // returned by the int8 reducers when an int32 multiplication would overflow
   NNACL_ERRCODE_OP_INT8_END = 49999
 } ErrorCodeInt8OpEnums;
 
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/reduce.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/reduce.h
index e6a9b29938..db0fa5cad7 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/reduce.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/reduce.h
@@ -17,15 +17,8 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_REDUCE_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_REDUCE_H_
 #include "nnacl/op_base.h"
-#define REDUCE_MAX_AXES_NUM 8
+#include "src/runtime/kernel/arm/nnacl/reduce_parameter.h"
 
-typedef struct ReduceParameter {
-  OpParameter op_parameter_;
-  bool keep_dims_;
-  int axes_[REDUCE_MAX_AXES_NUM];
-  int num_axes_;
-  int mode_;
-} ReduceParameter;
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.c b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.c
new file mode 100644
index 0000000000..61952ae7f6
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.c
@@ -0,0 +1,467 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+#include "nnacl/int8/reduce_int8.h"
+#include "nnacl/errorcode.h"
+#include "nnacl/quantization/fixed_point.h"
+
+inline bool isAddOverflow(int32_t x, int32_t y) {
+  int64_t sum = (int64_t)x + (int64_t)y;  // add in 64 bits: a 32-bit signed overflow is UB and wraps MIN + MIN to 0
+  return sum > INT32_MAX || sum < INT32_MIN;
+}
+
+inline bool isMulOverflow(int32_t x, int32_t y) {
+  int64_t p = (int64_t)x * (int64_t)y;  // 64-bit product: avoids signed-overflow UB and the INT32_MIN / -1 trap
+  return p > INT32_MAX || p < INT32_MIN;
+}
+
+// Intermediate (non-final) mean pass. For each (outer, inner) position it averages the zero-point-corrected
+// int32 inputs along the reduced axis and stores the mean, re-offset by the input zero point, as int32.
+int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int32_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      // (x - zp_in) * scale_in = mean[(item - zp_in) * scale_in]
+      // => x = mean(item - zp_in) + zp_in
+      const int32_t zp_in = quant->in_zp_;
+      int32_t acc = 0;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - zp_in;
+        if (isAddOverflow(acc, centered)) {
+          return NNACL_ERRCODE_ADD_OVERFLOW;
+        }
+        acc += centered;
+      }
+      // Scale the sum with the precomputed mean multiplier/shift pair (fixed point).
+      const int32_t shifted = acc * (1 << (unsigned int)quant->mean_left_shift_);
+      const int32_t mean =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(shifted, quant->mean_multiplier_),
+                            quant->mean_right_shift_);
+      if (isAddOverflow(mean, zp_in)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      dst_row[inner] = mean + zp_in;
+    }
+  }
+  return NNACL_OK;
+}
+
+// Final mean pass (last reduced axis): averages along the axis, then converts the mean from the input to the
+// output quantization so that (y - zp_out) * scale_out = mean(x - zp_in) * scale_in, saturating y to int8.
+int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  // Outer slices are interleaved across threads: this call handles tid, tid + thread_num, ...
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int8_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      // y = mean(x - zp_in) * scale + zp_out
+      int32_t acc = 0;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - quant->in_zp_;
+        if (isAddOverflow(centered, acc)) {
+          return NNACL_ERRCODE_ADD_OVERFLOW;
+        }
+        acc += centered;
+      }
+      // sum / num
+      const int32_t sum_shifted = acc * (1 << (unsigned int)quant->mean_left_shift_);
+      const int32_t mean =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum_shifted, quant->mean_multiplier_),
+                            quant->mean_right_shift_);
+      // translate to the output scale
+      const int32_t mean_shifted = mean * (1 << (unsigned int)quant->in_out_left_shift_);
+      const int32_t rescaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(mean_shifted, quant->in_out_multiplier_),
+                            quant->in_out_right_shift_);
+      if (isAddOverflow(rescaled, quant->out_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      const int32_t result = rescaled + quant->out_zp_;
+
+      // saturate to the int8 range
+      if (result > INT8_MAX) {
+        dst_row[inner] = INT8_MAX;
+      } else if (result < INT8_MIN) {
+        dst_row[inner] = INT8_MIN;
+      } else {
+        dst_row[inner] = (int8_t)result;
+      }
+    }
+  }
+  return NNACL_OK;
+}
+
+// Intermediate (non-final) sum pass: finds x such that (x - zp_in) * scale_in = sum(item - zp_in) * scale_in,
+// i.e. x = sum(item - zp_in) + zp_in, and stores it as int32. Called once per non-final reduced axis.
+int ReduceSumInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  // Outer slices are interleaved across threads: this call handles tid, tid + thread_num, ...
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int32_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      const int32_t zp_in = quant->in_zp_;
+      // Accumulate the zero-point-corrected values, checking every addition for int32 overflow.
+      int32_t acc = 0;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - zp_in;
+        if (isAddOverflow(centered, acc)) {
+          return NNACL_ERRCODE_ADD_OVERFLOW;
+        }
+        acc += centered;
+      }
+      // Re-apply the input zero point so the stored value keeps the input offset.
+      if (isAddOverflow(zp_in, acc)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      dst_row[inner] = acc + zp_in;
+    }
+  }
+  return NNACL_OK;
+}
+
+// Final sum pass (last reduced axis): finds y such that (y - zp_out) * scale_out = sum(item - zp_in) * scale_in
+// and stores it saturated to int8.
+int ReduceSumLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  // Outer slices are interleaved across threads: this call handles tid, tid + thread_num, ...
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int8_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      int32_t acc = 0;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - quant->in_zp_;
+        if (isAddOverflow(centered, acc)) {
+          return NNACL_ERRCODE_ADD_OVERFLOW;
+        }
+        acc += centered;
+      }
+      // Rescale with the in_out multiplier/shift pair (fixed point), then add the output zero point.
+      const int32_t acc_shifted = acc * (1 << (unsigned int)quant->in_out_left_shift_);
+      const int32_t rescaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(acc_shifted, quant->in_out_multiplier_),
+                            quant->in_out_right_shift_);
+      if (isAddOverflow(rescaled, quant->out_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      const int32_t result = rescaled + quant->out_zp_;
+      // saturate to the int8 range
+      if (result > INT8_MAX) {
+        dst_row[inner] = INT8_MAX;
+      } else if (result < INT8_MIN) {
+        dst_row[inner] = INT8_MIN;
+      } else {
+        dst_row[inner] = (int8_t)result;
+      }
+    }
+  }
+  return NNACL_OK;
+}
+
+// Last-axis max pass: max along the reduced axis, rescaled with the in_out multiplier/shift, saturated to int8.
+int ReduceMaxLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int8_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      // Seed with INT8_MIN: values below it can never be selected.
+      int32_t peak = INT8_MIN;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t value = src_row[axis * inner_size + inner];
+        peak = peak > value ? peak : value;
+      }
+      const int32_t shifted = (peak - quant->in_zp_) * (1 << (unsigned int)quant->in_out_left_shift_);
+      const int32_t rescaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(shifted, quant->in_out_multiplier_),
+                            quant->in_out_right_shift_);
+      if (isAddOverflow(rescaled, quant->out_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      const int32_t result = rescaled + quant->out_zp_;
+      if (result > INT8_MAX) {
+        dst_row[inner] = INT8_MAX;
+      } else if (result < INT8_MIN) {
+        dst_row[inner] = INT8_MIN;
+      } else {
+        dst_row[inner] = (int8_t)result;
+      }
+    }
+  }
+  return NNACL_OK;
+}
+
+// Int32-output max pass: stores the maximum along the reduced axis unchanged; quant is not read here.
+int ReduceMaxInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  // Outer slices are interleaved across threads: this call handles tid, tid + thread_num, ...
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int32_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      // Seed with INT8_MIN: values below it can never be selected.
+      int32_t peak = INT8_MIN;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t value = src_row[axis * inner_size + inner];
+        peak = peak > value ? peak : value;
+      }
+      dst_row[inner] = peak;
+    }
+  }
+  return NNACL_OK;
+}
+
+// Last-axis min pass: min along the reduced axis, rescaled with the in_out multiplier/shift, saturated to int8.
+int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  // Added to both shift amounts of the rescale below (NOTE(review): presumably for extra precision - confirm).
+  const int base_offset = 20;
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int8_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      int32_t low = INT8_MAX;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t value = src_row[axis * inner_size + inner];
+        low = low < value ? low : value;
+      }
+      // '+' binds tighter than '<<', so the shift amount is in_out_left_shift_ + base_offset; parenthesized here.
+      const int32_t widened = (low - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset));
+      const int32_t rescaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(widened, quant->in_out_multiplier_),
+                            quant->in_out_right_shift_ + base_offset);
+      if (isAddOverflow(rescaled, quant->out_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      const int32_t result = rescaled + quant->out_zp_;
+      if (result > INT8_MAX) {
+        dst_row[inner] = INT8_MAX;
+      } else if (result < INT8_MIN) {
+        dst_row[inner] = INT8_MIN;
+      } else {
+        dst_row[inner] = (int8_t)result;
+      }
+    }
+  }
+  return NNACL_OK;
+}
+
+// Int32-output min pass: stores the minimum along the reduced axis unchanged; quant is not read here.
+int ReduceMinInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int32_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      // Seed with INT8_MAX: values above it can never be selected.
+      int32_t low = INT8_MAX;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t value = src_row[axis * inner_size + inner];
+        low = low < value ? low : value;
+      }
+      dst_row[inner] = low;
+    }
+  }
+  return NNACL_OK;
+}
+
+// Last-axis product pass: quant_out = prod(quant_in - zp) * (scale_in^num / scale_out) + zp_out, clamped to int8.
+int ReduceProdLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int8_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      int32_t product = 1;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - quant->in_zp_;
+        if (isMulOverflow(product, centered)) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        product *= centered;
+      }
+      // Two fixed-point stages: first the prod_* multiplier/shift, then the in_out_* multiplier/shift.
+      const int32_t product_shifted = product * (1 << (unsigned int)quant->prod_left_shift_);
+      const int32_t scaled_product =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(product_shifted, quant->prod_multiplier_),
+                            quant->prod_right_shift_);
+      const int32_t scaled_shifted = scaled_product * (1 << (unsigned int)quant->in_out_left_shift_);
+      const int32_t rescaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(scaled_shifted, quant->in_out_multiplier_),
+                            quant->in_out_right_shift_);
+      if (isAddOverflow(rescaled, quant->out_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      const int32_t result = rescaled + quant->out_zp_;
+      // saturate to the int8 range
+      if (result > INT8_MAX) {
+        dst_row[inner] = INT8_MAX;
+      } else if (result < INT8_MIN) {
+        dst_row[inner] = INT8_MIN;
+      } else {
+        dst_row[inner] = (int8_t)result;
+      }
+    }
+  }
+  return NNACL_OK;
+}
+
+// Int32-output product pass: multiplies (item - zp_in) along the axis, applies the prod_* scaling, adds zp_in.
+int ReduceProdInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int32_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      int32_t product = 1;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - quant->in_zp_;
+        if (isMulOverflow(product, centered)) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        product *= centered;
+      }
+      const int32_t product_shifted = product * (1 << (unsigned int)quant->prod_left_shift_);
+      const int32_t scaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(product_shifted, quant->prod_multiplier_),
+                            quant->prod_right_shift_);
+      if (isAddOverflow(scaled, quant->in_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      // The check above already guards this addition against int32 overflow.
+      dst_row[inner] = scaled + quant->in_zp_;
+    }
+  }
+  return NNACL_OK;
+}
+
+// Last-axis sum-of-squares pass:
+// quant_out = sum((quant_in - zp)^2) * scale_in^2 / scale_out + zp_out, clamped to int8.
+int ReduceSumSquareLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                            int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int8_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      int32_t acc = 0;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - quant->in_zp_;
+        if (isMulOverflow(centered, centered)) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        const int32_t squared = centered * centered;
+        if (isAddOverflow(acc, squared)) {
+          return NNACL_ERRCODE_ADD_OVERFLOW;
+        }
+        acc += squared;
+      }
+      // Only the sum_square_* multiplier/shift pair is applied here before the output zero point is added.
+      const int32_t acc_shifted = acc * (1 << (unsigned int)quant->sum_square_left_shift_);
+      const int32_t rescaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(acc_shifted, quant->sum_square_multiplier_),
+                            quant->sum_square_right_shift_);
+      if (isAddOverflow(rescaled, quant->out_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      const int32_t result = rescaled + quant->out_zp_;
+
+      // saturate to the int8 range
+      if (result > INT8_MAX) {
+        dst_row[inner] = INT8_MAX;
+      } else if (result < INT8_MIN) {
+        dst_row[inner] = INT8_MIN;
+      } else {
+        dst_row[inner] = (int8_t)result;
+      }
+    }
+  }
+  return NNACL_OK;
+}
+
+// Int32-output sum-of-squares pass: sums (item - zp_in)^2 along the axis, applies the sum_square_*
+// multiplier/shift pair and stores the result plus zp_in as int32.
+int ReduceSumSquareInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                        int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
+  if (dst_data == NULL || src_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  for (int outer = tid; outer < outer_size; outer += thread_num) {
+    const int32_t *src_row = src_data + outer * axis_size * inner_size;
+    int32_t *dst_row = dst_data + outer * inner_size;
+    for (int inner = 0; inner < inner_size; inner++) {
+      int32_t acc = 0;
+      for (int axis = 0; axis < axis_size; axis++) {
+        const int32_t centered = src_row[axis * inner_size + inner] - quant->in_zp_;
+        if (isMulOverflow(centered, centered)) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        const int32_t squared = centered * centered;
+        if (isAddOverflow(acc, squared)) {
+          return NNACL_ERRCODE_ADD_OVERFLOW;
+        }
+        acc += squared;
+      }
+      // Fixed-point scaling of the accumulated squares, then the input zero point is added back.
+      const int32_t acc_shifted = acc * (1 << (unsigned int)quant->sum_square_left_shift_);
+      const int32_t scaled =
+        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(acc_shifted, quant->sum_square_multiplier_),
+                            quant->sum_square_right_shift_);
+      if (isAddOverflow(scaled, quant->in_zp_)) {
+        return NNACL_ERRCODE_ADD_OVERFLOW;
+      }
+      dst_row[inner] = scaled + quant->in_zp_;
+    }
+  }
+  return NNACL_OK;
+}
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.h
new file mode 100644
index 0000000000..b8b95cb1df
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_REDUCE_INT8_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_REDUCE_INT8_H_
+#include "nnacl/quantization/quantize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceSumInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceSumLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceMaxInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceMaxLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceMinInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceProdLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceProdInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceSumSquareLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                            int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+int ReduceSumSquareInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
+                        int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
+bool isAddOverflow(int32_t x, int32_t y);  // reports whether x + y overflows int32_t
+bool isMulOverflow(int32_t x, int32_t y);  // reports whether x * y overflows int32_t
+#ifdef __cplusplus
+}
+#endif
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_REDUCE_INT8_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h
index 64e6f534cd..cbdf2b5871 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h
@@ -219,6 +219,26 @@ typedef struct DivQuantArg {
   int output_multiplier_;
   int output_shift_;
 } DivQuantArg;
+
+typedef struct ReduceQuantArg {
+  double in_scale_;  // NOTE(review): presumably the input tensor scale; not read by the reducers in reduce_int8.c
+  int32_t in_zp_;  // input zero point; the reducers subtract it from the source values
+  double out_scale_;  // NOTE(review): presumably the output tensor scale; not read by the reducers in reduce_int8.c
+  int32_t out_zp_;  // output zero point; the *LastAxis reducers add it before saturating to int8
+  int32_t in_out_multiplier_;  // fixed-point multiplier of the rescale applied by the *LastAxis reducers
+  int in_out_left_shift_;  // left shift applied before in_out_multiplier_
+  int in_out_right_shift_;  // rounding right shift applied after in_out_multiplier_
+  int32_t mean_multiplier_;  // multiplier and shifts applied to the accumulated sum by the mean reducers
+  int mean_left_shift_;
+  int mean_right_shift_;
+  int32_t prod_multiplier_;  // multiplier and shifts applied to the accumulated product by the prod reducers
+  int prod_left_shift_;
+  int prod_right_shift_;
+  int32_t sum_square_multiplier_;  // multiplier and shifts applied to the accumulated squares by the sum-square reducers
+  int sum_square_left_shift_;
+  int sum_square_right_shift_;
+} ReduceQuantArg;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/reduce_parameter.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/reduce_parameter.h
new file mode 100644
index 0000000000..092789aa35
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/reduce_parameter.h
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_REDUCE_PARAMETER_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_REDUCE_PARAMETER_H_
+#include "nnacl/op_base.h"
+#define REDUCE_MAX_AXES_NUM 8
+
+typedef struct ReduceParameter {
+  OpParameter op_parameter_;
+  bool keep_dims_;
+  int axes_[REDUCE_MAX_AXES_NUM];  // NOTE(review): presumably only the first num_axes_ entries are meaningful
+  int num_axes_;
+  int mode_;
+} ReduceParameter;  // typedef keeps the bare name usable from C translation units, as before the move
+
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_REDUCE_PARAMETER_H_
diff --git a/mindspore/lite/src/runtime/kernel/arm/nnacl/resize_parameter.h b/mindspore/lite/src/runtime/kernel/arm/nnacl/resize_parameter.h
index 946f4f88a1..237b9c5ca2 100644
--- a/mindspore/lite/src/runtime/kernel/arm/nnacl/resize_parameter.h
+++ b/mindspore/lite/src/runtime/kernel/arm/nnacl/resize_parameter.h
@@ -16,7 +16,7 @@
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_RESIZE_PARAMETER_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_RESIZE_PARAMETER_H_
 
-#include "src/runtime/kernel/arm/nnacl/op_base.h"
+#include "nnacl/op_base.h"
 typedef struct ResizeParameter {
   OpParameter op_parameter_;
   int method_;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
index b9e316260b..8745ce5d57 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
@@ -13,204 +13,255 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <iostream>
 #include <vector>
+#include "mindspore/lite/src/lite_kernel.h"
+#include "mindspore/lite/src/ir/tensor.h"
 #include "common/common_test.h"
-#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/resize.h"
+#include "nnacl/resize_parameter.h"
+#include "mindspore/lite/src/kernel_registry.h"
 
 namespace mindspore {
 
 class TestResizeBilinearFp32 : public mindspore::CommonTest {
  public:
   TestResizeBilinearFp32() = default;
+  void Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape, float *input_data,
+               float *output_data, const bool align_corners, const int thread_num);
+
+  void TearDown() override;
 
  public:
-  int tid = 0;
-  int thread_num = 1;
   float err_tol = 1e-5;
+  lite::tensor::Tensor in_tensor_;
+  lite::tensor::Tensor out_tensor_;
+  std::vector<lite::tensor::Tensor *> inputs_{&in_tensor_};
+  std::vector<lite::tensor::Tensor *> outputs_{&out_tensor_};
+  ResizeParameter param_ = {{}};
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
+  lite::Context ctx_ = lite::Context();
+  kernel::KernelCreator creator_ = nullptr;
+  kernel::LiteKernel *kernel_ = nullptr;
 };
 
+void TestResizeBilinearFp32::TearDown() {  // resets the data pointers that Prepare() set on both tensors
+  in_tensor_.SetData(nullptr);   // input buffer is supplied by the test body, not allocated by the tensor
+  out_tensor_.SetData(nullptr);  // NOTE(review): presumably prevents the Tensor dtor from freeing it - confirm
+}
+
+void TestResizeBilinearFp32::Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape,
+                                     float *input_data, float *output_data, const bool align_corners,
+                                     const int thread_num) {
+  in_tensor_.set_data_type(kNumberTypeFloat32);
+  in_tensor_.set_shape(input_shape);
+  out_tensor_.set_data_type(kNumberTypeFloat32);
+  out_tensor_.set_shape(output_shape);
+  in_tensor_.SetData(input_data);    // caller-provided buffers; the tensors only point at them
+  out_tensor_.SetData(output_data);
+  // Fill the fixture member, not a shadowing local: &param_ goes to the creator and Run() happens after we return.
+  param_ = {
+    {}, static_cast<int>(schema::ResizeMethod_BILINEAR), output_shape[1], output_shape[2], align_corners};
+  desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
+  ctx_ = lite::Context();
+  ctx_.thread_num_ = thread_num;
+  creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator_, nullptr);
+  kernel_ = creator_(inputs_, outputs_, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc, nullptr);
+  ASSERT_NE(kernel_, nullptr);
+}
+
 // 1*1 -> 1*1
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest1) {
-  std::vector<float> input = {1.0};
+  float input_data[] = {1.0f};
+  float output_data[1] = {0};
   std::vector<int> input_shape = {1, 1, 1, 1};
   std::vector<int> output_shape = {1, 1, 1, 1};
   std::vector<float> expect = {1.0};
   bool align_corners = false;
-
   auto output_size = 1;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 1*1
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest2) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[1] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 1, 1, 1};
   std::vector<float> expect = {0.0};
   bool align_corners = false;
-
   int output_size = 1;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 1*2
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest3) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[2] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 1, 2, 1};
   std::vector<float> expect = {0.0, 1.0};
   bool align_corners = false;
-
   auto output_size = 2;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 2*1
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest4) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[2] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 2, 1, 1};
   std::vector<float> expect = {0.0, 2.0};
   bool align_corners = false;
-
   auto output_size = 2;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 2*2
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest5) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[4] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 2, 2, 1};
   std::vector<float> expect = {0.0, 1.0, 2.0, 3.0};
   bool align_corners = false;
-
   auto output_size = 4;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 1*4
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest6) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[4] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 1, 4, 1};
   std::vector<float> expect = {0.0, 0.5, 1.0, 1.0};
   bool align_corners = false;
-
   auto output_size = 4;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 4*1
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest7) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[4] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 4, 1, 1};
   std::vector<float> expect = {0.0, 1.0, 2.0, 2.0};
   bool align_corners = false;
-
   auto output_size = 4;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 2*4
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest8) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[8] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 2, 4, 1};
   std::vector<float> expect = {0.0, 0.5, 1.0, 1.0, 2.0, 2.5, 3.0, 3.0};
   bool align_corners = false;
-
   auto output_size = 8;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 4*2
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest9) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[8] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 4, 2, 1};
   std::vector<float> expect = {0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 2.0, 3.0};
   bool align_corners = false;
-
   auto output_size = 8;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 3*3
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest10) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[9] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 3, 3, 1};
   std::vector<float> expect = {0.0, 0.6666667, 1.0, 1.3333334, 2.0, 2.3333335, 2.0, 2.6666667, 3.0};
   bool align_corners = false;
 
   auto output_size = 9;
-  std::vector<float> output(output_size, 0.0);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 4*4
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest11) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[16] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 4, 4, 1};
   std::vector<float> expect = {0.0, 0.5, 1.0, 1.0, 1.0, 1.5, 2.0, 2.0, 2.0, 2.5, 3.0, 3.0, 2.0, 2.5, 3.0, 3.0};
   bool align_corners = false;
 
   auto output_size = 16;
-  std::vector<float> output(output_size, 0.0);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest12) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -224,20 +275,21 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest12) {
     33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
     34.0, 32.5, 33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
   bool align_corners = false;
-
   auto output_size = 160;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5 align corners
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest13) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -258,20 +310,21 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest13) {
     30.0,      31.0,      32.0,      33.0,      34.0,      31.666666, 32.666668, 33.666668, 34.666668, 35.666668,
     33.333332, 34.333332, 35.333332, 36.333332, 37.333332, 35.0,      36.0,      37.0,      38.0,      39.0};
   bool align_corners = true;
-
   auto output_size = 160;
-  std::vector<float> output(output_size, 0.0);
 
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5 thread_num 2
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest14) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -285,24 +338,22 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest14) {
     33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
     34.0, 32.5, 33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
   bool align_corners = false;
-
   auto output_size = 160;
-  std::vector<float> output(output_size, 0.0);
-  thread_num = 2;
-  tid = 0;
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  tid = 1;
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  int thread_num = 2;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, thread_num);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5 thread_num 4
 TEST_F(TestResizeBilinearFp32, ResizeBilinearTest15) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -319,19 +370,11 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest15) {
 
   auto output_size = 160;
   std::vector<float> output(output_size, 0.0);
-  thread_num = 4;
-  tid = 0;
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  tid = 1;
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  tid = 2;
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  tid = 3;
-  ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
-                 thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  int thread_num = 4;
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, thread_num);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
index 65cb508489..b1fae684ed 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
@@ -15,168 +15,250 @@
  */
 #include <vector>
 #include "common/common_test.h"
-#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/resize.h"
+#include "nnacl/resize_parameter.h"
+#include "mindspore/lite/src/kernel_registry.h"
 
 namespace mindspore {
 
 class TestResizeNearestNeighborFp32 : public mindspore::CommonTest {
  public:
   TestResizeNearestNeighborFp32() = default;
+  void Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape, float *input_data,
+               float *output_data, const bool align_corners, const int thread_num);
+
+  void TearDown() override;
 
  public:
-  int tid = 0;
-  int thread_num = 1;
   float err_tol = 1e-5;
+  lite::tensor::Tensor in_tensor_;
+  lite::tensor::Tensor out_tensor_;
+  std::vector<lite::tensor::Tensor *> inputs_{&in_tensor_};
+  std::vector<lite::tensor::Tensor *> outputs_{&out_tensor_};
+  ResizeParameter param_ = {{}};
+  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
+  lite::Context ctx_ = lite::Context();
+  kernel::KernelCreator creator_ = nullptr;
+  kernel::LiteKernel *kernel_ = nullptr;
 };
 
+void TestResizeNearestNeighborFp32::TearDown() {  // resets the data pointers that Prepare() set on both tensors
+  in_tensor_.SetData(nullptr);   // input buffer is supplied by the test body, not allocated by the tensor
+  out_tensor_.SetData(nullptr);  // NOTE(review): presumably prevents the Tensor dtor from freeing it - confirm
+}
+
+void TestResizeNearestNeighborFp32::Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape,
+                                            float *input_data, float *output_data, const bool align_corners,
+                                            const int thread_num) {
+  in_tensor_.set_data_type(kNumberTypeFloat32);
+  in_tensor_.set_shape(input_shape);
+  out_tensor_.set_data_type(kNumberTypeFloat32);
+  out_tensor_.set_shape(output_shape);
+  in_tensor_.SetData(input_data);    // caller-provided buffers; the tensors only point at them
+  out_tensor_.SetData(output_data);
+  // Fill the fixture member, not a shadowing local: &param_ goes to the creator and Run() happens after we return.
+  param_ = {
+    {}, static_cast<int>(schema::ResizeMethod_NEAREST_NEIGHBOR), output_shape[1], output_shape[2], align_corners};
+  desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
+  ctx_ = lite::Context();
+  ctx_.thread_num_ = thread_num;
+  creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc);
+  ASSERT_NE(creator_, nullptr);
+  kernel_ = creator_(inputs_, outputs_, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc, nullptr);
+  ASSERT_NE(kernel_, nullptr);
+}
 // 1*1 -> 1*1
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest1) {
-  std::vector<float> input = {1.0};
+  float input_data[] = {1.0};
+  float output_data[1] = {0};
   std::vector<int> input_shape = {1, 1, 1, 1};
   std::vector<int> output_shape = {1, 1, 1, 1};
   std::vector<float> expect = {1.0};
   size_t output_size = 1;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 1*1
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest2) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[1] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 1, 1, 1};
   std::vector<float> expect = {0.0};
   size_t output_size = 1;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 1*2
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest3) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[2] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 1, 2, 1};
   std::vector<float> expect = {0.0, 1.0};
   size_t output_size = 2;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 2*1
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest4) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[2] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 2, 1, 1};
   std::vector<float> expect = {0.0, 2.0};
   size_t output_size = 2;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 2*2
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest5) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[4] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 2, 2, 1};
   std::vector<float> expect = {0.0, 1.0, 2.0, 3.0};
   size_t output_size = 4;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 1*4
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest6) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[4] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 1, 4, 1};
   std::vector<float> expect = {0.0, 0.0, 1.0, 1.0};
   size_t output_size = 4;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 4*1
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest7) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[4] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 4, 1, 1};
   std::vector<float> expect = {0.0, 0.0, 2.0, 2.0};
   size_t output_size = 4;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 2*4
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest8) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[8] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 2, 4, 1};
   std::vector<float> expect = {0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0};
   size_t output_size = 8;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 4*2
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest9) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[8] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 4, 2, 1};
   std::vector<float> expect = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
   size_t output_size = 8;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 3*3
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest10) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[9] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 3, 3, 1};
   std::vector<float> expect = {0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 2.0, 3.0};
   size_t output_size = 9;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2 -> 4*4
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest11) {
-  std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
+  float input_data[] = {0.0, 1.0, 2.0, 3.0};
+  float output_data[16] = {0};
   std::vector<int> input_shape = {1, 2, 2, 1};
   std::vector<int> output_shape = {1, 4, 4, 1};
   std::vector<float> expect = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 3.0, 3.0};
   size_t output_size = 16;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest12) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -190,17 +272,21 @@ TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest12) {
     31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
     34.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
   size_t output_size = 160;
-  std::vector<float> output(output_size, 0.0);
+  bool align_corners = false;
 
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5 thread_num 2
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest13) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -214,21 +300,21 @@ TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest13) {
     31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
     34.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
   size_t output_size = 160;
-  std::vector<float> output(output_size, 0.0);
-
-  thread_num = 2;
-  tid = 0;
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  tid = 1;
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 2);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 
 // 2*2*2*5 -> 2*4*4*5 thread_num 4
 TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest14) {
-  std::vector<float> input = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
-                              14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
-                              28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float input_data[] = {0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0,
+                        14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
+                        28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
+  float output_data[160] = {0};
   std::vector<int> input_shape = {2, 2, 2, 5};
   std::vector<int> output_shape = {2, 4, 4, 5};
   std::vector<float> expect = {
@@ -242,17 +328,12 @@ TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest14) {
     31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
     34.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
   size_t output_size = 160;
-  std::vector<float> output(output_size, 0.0);
-
-  thread_num = 4;
-  tid = 0;
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  tid = 1;
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  tid = 2;
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  tid = 3;
-  ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
-  CompareOutputData(output.data(), expect.data(), output_size, err_tol);
+  bool align_corners = false;
+
+  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 4);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputData(output_data, expect.data(), output_size, err_tol);
 }
 }  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc
new file mode 100644
index 0000000000..f3e78a5014
--- /dev/null
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc
@@ -0,0 +1,355 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <memory>
+#include "utils/log_adapter.h"
+#include "common/common_test.h"
+#include "src/ir/tensor.h"
+#include "mindspore/lite/src/kernel_registry.h"
+#include "nnacl/fp32/reduce.h"
+
+namespace mindspore {
+using mindspore::lite::tensor::QuantArg;
+using mindspore::lite::tensor::Tensor;
+using mindspore::schema::ReduceMode;
+using mindspore::schema::ReduceMode_ReduceMax;
+using mindspore::schema::ReduceMode_ReduceMean;
+using mindspore::schema::ReduceMode_ReduceMin;
+using mindspore::schema::ReduceMode_ReduceProd;
+using mindspore::schema::ReduceMode_ReduceSum;
+using mindspore::schema::ReduceMode_ReduceSumSquare;
+
+class TestReduceInt8 : public mindspore::CommonTest {  // fixture that builds and runs the int8 Reduce kernel
+ public:
+  TestReduceInt8() = default;
+  void Prepare(const std::vector<int> &in_shape, const std::vector<int> &out_shape, int8_t *input_data,
+               int8_t *output_data, ReduceMode mode, const int *axes, const int num_axes);
+  void TearDown() override;
+
+ public:
+  int thread_num_ = 1;  // copied into ctx_.thread_num_ by Prepare(); set it before calling Prepare()
+
+  ReduceParameter param_ = {};  // mode_, num_axes_ and axes_ are filled in by Prepare()
+  Tensor in_tensor_;
+  Tensor out_tensor_;
+  std::vector<Tensor *> inputs{&in_tensor_};
+  std::vector<Tensor *> outputs{&out_tensor_};
+  kernel::KernelKey desc_ = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_Reduce};
+  kernel::KernelCreator creator_ = nullptr;  // looked up from KernelRegistry in Prepare()
+  lite::Context ctx_ = lite::Context();
+  kernel::LiteKernel *kernel_ = nullptr;  // created in Prepare(); NOTE(review): not released in TearDown()
+  const QuantArg quant_in_ = {0.005f, 5};  // added to in_tensor_; presumably {scale, zero_point} -- confirm
+  const QuantArg quant_out_ = {0.01f, 1};  // added to out_tensor_; presumably {scale, zero_point} -- confirm
+  float err_tol_ = 0.05;  // default tolerance handed to CompareOutputInt8(); some tests override it
+};
+
+void TestReduceInt8::TearDown() {  // clears the data pointers that Prepare() attached with SetData()
+  in_tensor_.SetData(nullptr);  // presumably keeps the tensor from freeing the test's local array -- confirm
+  out_tensor_.SetData(nullptr);  // same for the output buffer
+}
+
+void TestReduceInt8::Prepare(const std::vector<int> &in_shape, const std::vector<int> &out_shape, int8_t *input_data,
+                             int8_t *output_data, ReduceMode mode, const int *axes, const int num_axes) {
+  in_tensor_.set_data_type(kNumberTypeInt8);  // configure the single input tensor
+  in_tensor_.set_shape(in_shape);
+  in_tensor_.SetData(input_data);  // points the tensor at the caller's buffer; cleared again in TearDown()
+  in_tensor_.AddQuantParam(quant_in_);
+
+  out_tensor_.set_data_type(kNumberTypeInt8);  // configure the single output tensor
+  out_tensor_.set_shape(out_shape);
+  out_tensor_.SetData(output_data);  // the test compares this buffer against its expected values after Run()
+  out_tensor_.AddQuantParam(quant_out_);
+
+  param_.mode_ = static_cast<int>(mode);
+  param_.num_axes_ = num_axes;
+  memcpy(param_.axes_, axes, num_axes * sizeof(int));  // NOTE(review): num_axes is not checked against axes_ capacity
+
+  creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc_);  // creator registered for CPU / int8 / Reduce
+
+  ctx_.thread_num_ = thread_num_;
+  kernel_ = creator_(inputs, outputs, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc_, nullptr);  // unchecked
+}
+
+TEST_F(TestReduceInt8, Mean) {
+  /* input 2*4*4*3 NHWC; ReduceMean over axis 3 gives a 2*4*4*1 output */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[32] = {0};
+  int axes[] = {3};
+  int num_axes = 1;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {2, 4, 4, 1};
+  int output_size = 32;
+  int8_t correct[] = {-1, 1,  2,  3,  5,  7,  8,  10, 11, 12, 14, 16, 17, 19, 20, 22,
+                      23, 25, 26, 28, 29, 30, 32, 34, 35, 37, 38, 40, 41, 43, 44, 46};
+
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMean, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  err_tol_ = 0.09375;  // replaces the fixture default of 0.05 for this comparison
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, MeanAllAxis) {
+  /* input 2*4*4*3 NHWC; ReduceMean with no explicit axes, expecting a single output value */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[1] = {0};
+  int axes[] = {0};  // contents unused: num_axes is 0, so Prepare() copies nothing from this array
+  int num_axes = 0;  // presumably interpreted by the kernel as "reduce every axis" -- confirm
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {1};
+  int output_size = 1;
+  int8_t correct[] = {22};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMean, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  err_tol_ = 1.0f;  // replaces the fixture default of 0.05 for this comparison
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, Sum) {
+  /* input 2*4*4*3 NHWC; ReduceSum with axis -1, expecting a 2*4*4*1 output */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[32] = {0};
+  int axes[] = {-1};  // negative axis; the expected output shape corresponds to reducing the last axis
+  int num_axes = 1;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {2, 4, 4, 1};
+  int output_size = 32;
+  int8_t correct[] = {-5, -1, 4,  9,  13, 18, 22, 27, 31,  36,  40,  45,  49,  54,  58,  63,
+                      67, 72, 76, 81, 85, 90, 94, 99, 103, 107, 112, 117, 121, 126, 127, 127};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSum, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  err_tol_ = 0.0625f;  // replaces the fixture default of 0.05 for this comparison
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, SumAllAxis) {
+  /* input 2*4*4*3 NHWC, every element 4; ReduceSum over all four axes, expecting a single output value */
+  int8_t input_data[96] = {
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  };
+  int8_t output_data[1] = {0};
+  int axes[] = {0, 1, 2, 3};  // every axis of the rank-4 input is listed explicitly
+  int num_axes = 4;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {1};
+  int output_size = 1;
+  int8_t correct[] = {-47};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSum, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, Max) {
+  /* input 2*4*4*3 NHWC; ReduceMax over axis 3 gives a 2*4*4*1 output */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[32] = {0};
+  int axes[] = {3};
+  int num_axes = 1;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {2, 4, 4, 1};
+  int output_size = 32;
+  int8_t correct[] = {-1, 1,  3,  4,  6,  7,  9,  10, 12, 13, 15, 16, 18, 19, 21, 22,
+                      24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45, 46};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMax, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, MaxAll) {
+  /* input 2*4*4*3 NHWC; ReduceMax over all four axes, expecting a single output value */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[1] = {0};
+  int axes[] = {0, 1, 2, 3};  // every axis of the rank-4 input is listed explicitly
+  int num_axes = 4;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {1};
+  int output_size = 1;
+  int8_t correct[] = {46};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMax, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, Min) {
+  /* input 2*4*4*3 NHWC; ReduceMin over axis 3 gives a 2*4*4*1 output */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[32] = {0};
+  int axes[] = {3};
+  int num_axes = 1;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {2, 4, 4, 1};
+  int output_size = 32;
+  int8_t correct[] = {-2, 0,  2,  3,  5,  6,  8,  9,  11, 12, 14, 15, 17, 18, 20, 21,
+                      23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMin, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, MinAll) {
+  /* input 2*4*4*3 NHWC; ReduceMin with no explicit axes, expecting a single output value */
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[1] = {0};
+  int axes[] = {0};  // contents unused: num_axes is 0, so Prepare() copies nothing from this array
+  int num_axes = 0;  // presumably interpreted by the kernel as "reduce every axis" -- confirm
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {1};
+  int output_size = 1;
+  int8_t correct[] = {-2};
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMin, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, Prod) {
+  /* input 2*4*4*3 NHWC, every element 105; ReduceProd over axis 3 gives a 2*4*4*1 output */
+  int8_t input_data[96] = {105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,
+                           105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,
+                           105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,
+                           105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,
+                           105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,
+                           105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105};
+  int8_t output_data[32] = {0};
+  int axes[] = {3};
+  int num_axes = 1;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {2, 4, 4, 1};
+  int output_size = 32;
+  int8_t correct[] = {
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+  };
+  thread_num_ = 2;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceProd, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, Prod2Axis) {
+  /* input 1*2*2*3 NHWC, every element 105; ReduceProd over axes 2 and 3 gives a 1*2 output */
+  int8_t input_data[12] = {105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105};
+  int8_t output_data[8] = {0};  // larger than required: only output_size (2) elements are compared
+  int axes[] = {2, 3};
+  int num_axes = 2;
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = {1, 2};
+  int output_size = 2;
+  int8_t correct[] = {3, 3};
+  thread_num_ = 1;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceProd, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, SumSquare) {
+  /* input 2*4*4*3 NHWC; ReduceSumSquare over axis 3 gives a 2*4*4*1 output */
+
+  int8_t input_data[96] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                           20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                           40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                           60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
+  int8_t output_data[32] = {0};
+  int axes[] = {3};
+  int num_axes = 1;
+  std::vector<int> input_shape = {2, 4, 4, 3};
+  std::vector<int> output_shape = {2, 4, 4, 1};
+  int output_size = 32;
+  int8_t correct[] = {1,  1,  1,  1,  1,  2,  2,  3,  4,  5,  6,  7,  9,  10, 12, 14,
+                      16, 18, 20, 22, 25, 27, 30, 33, 36, 39, 42, 45, 49, 53, 56, 60};
+  thread_num_ = 1;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSumSquare, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+TEST_F(TestReduceInt8, SumSquare2Axis) {
+  /* input 1*2*2*3 NHWC, every element 105; ReduceSumSquare over axes 3 and 2 gives a 1*2 output */
+  int8_t input_data[12] = {105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105};
+  int8_t output_data[8] = {0};  // larger than required: only output_size (2) elements are compared
+  int axes[] = {3, 2};  // axes are listed in descending order here
+  int num_axes = 2;
+  std::vector<int> input_shape = {1, 2, 2, 3};
+  std::vector<int> output_shape = {1, 2};
+  int output_size = 2;
+  int8_t correct[] = {114, 114};
+  thread_num_ = 1;  // set before Prepare(), which copies it into the context
+  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSumSquare, axes, num_axes);
+  auto ret = kernel_->Run();
+  EXPECT_EQ(0, ret);
+
+  CompareOutputInt8(output_data, correct, output_size, err_tol_);
+}
+
+}  // namespace mindspore
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
index ed0b269dd6..dc27b89a1b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
@@ -18,10 +18,8 @@
 #include "include/context.h"
 #include "src/ir/tensor.h"
 #include "common/common_test.h"
-#include "src/common/file_utils.h"
 #include "mindspore/lite/src/kernel_registry.h"
-#include "src/runtime/kernel/arm/nnacl/int8/resize.h"
-#include "src/runtime/kernel/arm/int8/resize_int8.h"
+#include "nnacl/int8/resize.h"
 
 namespace mindspore {
 using mindspore::lite::tensor::QuantArg;
@@ -92,7 +90,7 @@ TEST_F(TestResizeBilinearInt8, Bilinear0) {
   int8_t expect[16] = {4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6};
 
   Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, align_corners, thread_num);
-  kernel_->Init();
+  kernel_->Init();  // todo delete
   kernel_->Run();
 
   CompareOutputInt8(output_data, expect, 16, err_percent_);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc
index ffc3790c54..794a348c71 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc
@@ -19,7 +19,7 @@
 #include "src/ir/tensor.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/kernel_registry.h"
-#include "src/runtime/kernel/arm/nnacl/int8/resize.h"
+#include "nnacl/int8/resize.h"
 
 namespace mindspore {
 using mindspore::lite::tensor::QuantArg;
@@ -92,7 +92,7 @@ TEST_F(TestResizeNearestNeighborInt8, NearestNeighbor0) {
   err_percent_ = 0.25f;
 
   Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, false, thread_num);
-  kernel_->Init();
+  kernel_->Init();  // todo delete
   kernel_->Run();
 
   CompareOutputInt8(output_data, expect, 16, err_percent_);