Browse Source

!4485 add reduce ops int8

Merge pull request !4485 from zhaozhenlong/lite/op/int8/reduce_mean_sum
tags/v0.7.0-beta
mindspore-ci-bot Gitee 5 years ago
parent
commit
0a01bed4cb
20 changed files with 1940 additions and 362 deletions
  1. +1
    -1
      mindspore/lite/src/populate_parameter.cc
  2. +199
    -0
      mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc
  3. +54
    -0
      mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h
  4. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/base/resize_base.h
  5. +6
    -126
      mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc
  6. +8
    -26
      mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h
  7. +323
    -0
      mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc
  8. +98
    -0
      mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h
  9. +2
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/errorcode.h
  10. +1
    -8
      mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/reduce.h
  11. +467
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.c
  12. +53
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.h
  13. +20
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h
  14. +30
    -0
      mindspore/lite/src/runtime/kernel/arm/nnacl/reduce_parameter.h
  15. +1
    -1
      mindspore/lite/src/runtime/kernel/arm/nnacl/resize_parameter.h
  16. +157
    -114
      mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc
  17. +160
    -79
      mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc
  18. +355
    -0
      mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc
  19. +2
    -4
      mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc
  20. +2
    -2
      mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc

+ 1
- 1
mindspore/lite/src/populate_parameter.cc View File

@@ -38,7 +38,7 @@
#include "src/runtime/kernel/arm/nnacl/softmax_parameter.h"
#include "src/runtime/kernel/arm/nnacl/fp32/tile.h"
#include "src/runtime/kernel/arm/nnacl/fp32/topk.h"
#include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
#include "src/runtime/kernel/arm/nnacl/reduce_parameter.h"
#include "src/runtime/kernel/arm/nnacl/fp32/activation.h"
#include "src/runtime/kernel/arm/nnacl/fp32/arithmetic.h"
#include "src/runtime/kernel/arm/nnacl/fp32/batchnorm.h"


+ 199
- 0
mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc View File

@@ -0,0 +1,199 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/base/reduce_base.h"
#include "src/runtime/kernel/arm/fp32/reduce.h"
#include "src/runtime/kernel/arm/int8/reduce_int8.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Mean;
using mindspore::schema::PrimitiveType_Reduce;

namespace mindspore::kernel {
namespace {
constexpr size_t kInputNum = 1;
constexpr size_t kOutputNum = 1;
} // namespace

int ReduceBaseCPUKernel::CheckInputsOutputs() {
if (in_tensors_.size() != kInputNum) {
MS_LOG(ERROR) << "Reduce inputs size should be " << kInputNum << " but got " << in_tensors_.size();
return RET_ERROR;
}
if (out_tensors_.size() != kOutputNum) {
MS_LOG(ERROR) << "Reduce outputs size should be " << kOutputNum << " but got " << out_tensors_.size();
return RET_ERROR;
}
auto input = in_tensors_.at(0);
if (input == nullptr) {
MS_LOG(ERROR) << "Reduce input is nullptr";
return RET_NULL_PTR;
}
auto output = out_tensors_.at(0);
if (output == nullptr) {
MS_LOG(ERROR) << "Reduce output is nullptr";
return RET_NULL_PTR;
}
return RET_OK;
}

int ReduceBaseCPUKernel::CheckParameters() {
size_t input_rank = in_tensors_.at(0)->shape().size();
if (static_cast<size_t>(num_axes_) > input_rank) {
MS_LOG(ERROR) << "Reduce op invalid num of reduce axes " << num_axes_ << " larger than input rank " << input_rank;
return RET_ERROR;
}
for (auto i = 0; i < num_axes_; i++) {
if (axes_[i] < -static_cast<int>(input_rank) || axes_[i] >= static_cast<int>(input_rank)) {
MS_LOG(ERROR) << "Reduce got invalid axis " << axes_[i] << ", axis should be in ["
<< -static_cast<int>(input_rank) << ", " << input_rank - 1 << "].";
return RET_ERROR;
}
if (axes_[i] < 0) {
axes_[i] += static_cast<int>(input_rank);
}
}

if (num_axes_ == 0) {
for (int i = 0; i < input_rank; i++) {
axes_[i] = i;
}
num_axes_ = static_cast<int>(input_rank);
}

return RET_OK;
}

// Copies mode/axes out of the ReduceParameter attached to this kernel and
// runs the tensor and axis validation.
// Returns RET_NULL_PTR if the parameter is missing, otherwise the result of
// the validation steps.
int ReduceBaseCPUKernel::Init() {
  auto *param = reinterpret_cast<ReduceParameter *>(op_parameter_);
  if (param == nullptr) {
    return RET_NULL_PTR;
  }
  mode_ = param->mode_;
  num_axes_ = param->num_axes_;
  memcpy(axes_, param->axes_, sizeof(param->axes_));

  auto ret = CheckInputsOutputs();
  if (ret == RET_OK) {
    ret = CheckParameters();
  }
  return ret;
}

// Factory for the fp32 Reduce kernel. Validates the descriptor, constructs a
// ReduceCPUKernel and runs Init(); returns nullptr on any failure.
kernel::LiteKernel *CpuReduceFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const lite::Context *ctx,
                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Reduce);
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Reduce opParameter nullptr";
    return nullptr;
  }
  if (desc.type != schema::PrimitiveType_Reduce) {
    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type;
    return nullptr;
  }
  auto *reduce_kernel = new (std::nothrow) ReduceCPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (reduce_kernel == nullptr) {
    MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
    return nullptr;
  }
  auto init_ret = reduce_kernel->Init();
  if (init_ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete reduce_kernel;
    return nullptr;
  }
  return reduce_kernel;
}

// Factory for the fp32 Mean kernel (Mean is served by the same
// ReduceCPUKernel implementation). Returns nullptr on any failure.
kernel::LiteKernel *CpuMeanFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                             const std::vector<lite::tensor::Tensor *> &outputs,
                                             OpParameter *opParameter, const lite::Context *ctx,
                                             const kernel::KernelKey &desc, const lite::Primitive *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Mean);
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Reduce opParameter nullptr";
    return nullptr;
  }
  if (desc.type != schema::PrimitiveType_Mean) {
    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Mean, got " << desc.type;
    return nullptr;
  }
  auto *mean_kernel = new (std::nothrow) ReduceCPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (mean_kernel == nullptr) {
    MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
    return nullptr;
  }
  auto init_ret = mean_kernel->Init();
  if (init_ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete mean_kernel;
    return nullptr;
  }
  return mean_kernel;
}

// Factory for the int8 Reduce kernel. Validates the descriptor, constructs a
// ReduceInt8CPUKernel and runs Init(); returns nullptr on any failure.
kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const lite::Context *ctx,
                                               const kernel::KernelKey &desc, const lite::Primitive *primitive) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_Reduce);
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Reduce opParameter nullptr";
    return nullptr;
  }
  if (desc.type != schema::PrimitiveType_Reduce) {
    MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type;
    return nullptr;
  }
  auto *kernel = new (std::nothrow) ReduceInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive);
  if (kernel == nullptr) {
    // Fixed: previous message named ReduceCPUKernel, but this creator builds
    // the int8 variant.
    MS_LOG(ERROR) << "Reduce new ReduceInt8CPUKernel failed.";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
} // namespace mindspore::kernel

+ 54
- 0
mindspore/lite/src/runtime/kernel/arm/base/reduce_base.h View File

@@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_REDUCE_BASE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_REDUCE_BASE_H_

#include <vector>
#include "src/lite_kernel.h"
#include "ir/anf.h"
#include "nnacl/reduce_parameter.h"

namespace mindspore::kernel {
// Common base for the fp32/int8 Reduce kernels. Init() parses the attached
// ReduceParameter, validates the tensors and normalizes the reduce axes;
// derived kernels consume the protected fields during Run().
class ReduceBaseCPUKernel : public LiteKernel {
 public:
  ReduceBaseCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                      const lite::Primitive *primitive)
      : LiteKernel(param, inputs, outputs, ctx, primitive) {}
  virtual ~ReduceBaseCPUKernel() = default;

  int Init() override;
  int ReSize() override { return 0; }

 private:
  int CheckInputsOutputs();  // exactly one non-null input and output tensor
  int CheckParameters();     // validate/normalize axes against input rank

 protected:
  // Initialized here so a kernel whose Init() fails early never reads
  // indeterminate values.
  int axes_[REDUCE_MAX_AXES_NUM] = {0};  // normalized (non-negative) reduce axes
  int num_axes_ = 0;                     // number of valid entries in axes_
  int mode_ = 0;                         // schema::ReduceMode value

  // Per-axis iteration geometry, maintained by derived kernels during Run():
  // outer_size_ * axis_size_ * inner_size_ spans the current tmp_shape_.
  int outer_size_ = 0;
  int inner_size_ = 0;
  int axis_size_ = 0;
  std::vector<int> tmp_shape_;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_REDUCE_BASE_H_

+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/base/resize_base.h View File

@@ -31,7 +31,7 @@ class ResizeBaseCPUKernel : public LiteKernel {
const lite::Primitive *primitive)
: LiteKernel(parameter, inputs, outputs, ctx, primitive), context_(ctx) {}

~ResizeBaseCPUKernel() = default;
virtual ~ResizeBaseCPUKernel() = default;

int Init() override;
int ReSize() override { return 0; };


+ 6
- 126
mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc View File

@@ -20,6 +20,7 @@
#include "include/errorcode.h"
#include "src/runtime/runtime_api.h"
#include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
#include "src/runtime/kernel/arm/base/reduce_base.h"

using mindspore::kernel::KERNEL_ARCH::kCPU;
using mindspore::lite::KernelRegistrar;
@@ -37,69 +38,9 @@ using mindspore::schema::ReduceMode_ReduceSum;
using mindspore::schema::ReduceMode_ReduceSumSquare;

namespace mindspore::kernel {
namespace {
constexpr size_t kInputNum = 1;
constexpr size_t kOutputNum = 1;
} // namespace

int ReduceCPUKernel::CheckInputsOutputs() {
if (in_tensors_.size() != kInputNum) {
MS_LOG(ERROR) << "Reduce inputs size should be " << kInputNum << " but got " << in_tensors_.size();
return RET_ERROR;
}
if (out_tensors_.size() != kOutputNum) {
MS_LOG(ERROR) << "Reduce outputs size should be " << kOutputNum << " but got " << out_tensors_.size();
return RET_ERROR;
}
auto input = in_tensors_.at(0);
if (input == nullptr) {
MS_LOG(ERROR) << "Reduce input is nullptr";
return RET_NULL_PTR;
}
auto output = out_tensors_.at(0);
if (output == nullptr) {
MS_LOG(ERROR) << "Reduce output is nullptr";
return RET_NULL_PTR;
}
return RET_OK;
}

int ReduceCPUKernel::CheckParameters() {
size_t input_rank = in_tensors_.at(0)->shape().size();
if (static_cast<size_t>(num_axes_) > input_rank) {
MS_LOG(ERROR) << "Reduce num of reduce axes " << num_axes_ << " larger than input rank " << input_rank;
return RET_ERROR;
}
for (auto i = 0; i < num_axes_; i++) {
if (axes_[i] < -static_cast<int>(input_rank) || axes_[i] >= static_cast<int>(input_rank)) {
MS_LOG(ERROR) << "Reduce got invalid axis " << axes_[i] << ", axis should be in ["
<< -static_cast<int>(input_rank) << ", " << input_rank - 1 << "].";
return RET_ERROR;
}
if (axes_[i] < 0) {
axes_[i] += static_cast<int>(input_rank);
}
}

if (num_axes_ == 0) {
for (int i = 0; i < input_rank; i++) {
axes_[i] = i;
}
}

return RET_OK;
}

int ReduceCPUKernel::Init() {
if (context_->infer_shape_interrupt_ && !context_->running_) {
set_need_reinit();
return RET_OK;
}
auto ret = CheckInputsOutputs();
if (ret != RET_OK) {
return ret;
}
ret = CheckParameters();
auto ret = ReduceBaseCPUKernel::Init();
if (ret != RET_OK) {
return ret;
}
@@ -107,7 +48,6 @@ int ReduceCPUKernel::Init() {
if (ret != RET_OK) {
return ret;
}

switch (mode_) {
case static_cast<int>(ReduceMode_ReduceSum): {
reducer_ = ReduceSum;
@@ -137,7 +77,10 @@ int ReduceCPUKernel::Init() {
MS_LOG(ERROR) << "Reduce unsupported reduce mode: " << mode_;
return RET_ERROR;
}
return RET_OK;
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

int ReduceCPUKernel::CallReduceUnit(int task_id) {
@@ -225,67 +168,4 @@ int ReduceCPUKernel::MallocTmpBuffer() {
}
return RET_OK;
}

kernel::LiteKernel *CpuReduceFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc, const lite::Primitive *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Reduce);
if (opParameter == nullptr) {
MS_LOG(ERROR) << "Reduce opParameter nullptr";
return nullptr;
}
if (desc.type != schema::PrimitiveType_Reduce) {
MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Reduce, got " << desc.type;
return nullptr;
}
auto *kernel = new (std::nothrow)
ReduceCPUKernel(reinterpret_cast<ReduceParameter *>(opParameter), inputs, outputs, ctx, primitive);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
delete kernel;
return nullptr;
}
return kernel;
}

kernel::LiteKernel *CpuMeanFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs,
OpParameter *opParameter, const lite::Context *ctx,
const kernel::KernelKey &desc, const lite::Primitive *primitive) {
MS_ASSERT(opParameter != nullptr);
MS_ASSERT(desc.type == schema::PrimitiveType_Mean);
if (opParameter == nullptr) {
MS_LOG(ERROR) << "Reduce opParameter nullptr";
return nullptr;
}
if (desc.type != schema::PrimitiveType_Mean) {
MS_LOG(ERROR) << "Reduce op desc.type should be PrimitiveType_Mean, got " << desc.type;
return nullptr;
}
auto *kernel = new (std::nothrow)
ReduceCPUKernel(reinterpret_cast<ReduceParameter *>(opParameter), inputs, outputs, ctx, primitive);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Reduce new ReduceCPUKernel failed.";
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
<< schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
delete kernel;
return nullptr;
}
return kernel;
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
} // namespace mindspore::kernel

+ 8
- 26
mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h View File

@@ -21,25 +21,20 @@
#include "src/lite_kernel.h"

#include "src/runtime/kernel/arm/nnacl/fp32/reduce.h"
#include "src/runtime/kernel/arm/base/reduce_base.h"
#include "ir/anf.h"
using mindspore::schema::ReduceMode;

namespace mindspore::kernel {
class ReduceCPUKernel : public LiteKernel {
class ReduceCPUKernel : public ReduceBaseCPUKernel {
typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
const int *src_shape, float *dst_data, const int tid, const int thread_num);

public:
ReduceCPUKernel(ReduceParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
ReduceCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
const lite::Primitive *primitive)
: LiteKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs, ctx, primitive),
context_(ctx),
keep_dims_(param->keep_dims_),
num_axes_(param->num_axes_),
mode_(param->mode_) {
memcpy(axes_, param->axes_, sizeof(param->axes_));
}
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
~ReduceCPUKernel() {
for (auto i = 0; i < data_buffers_.size(); i++) {
float *buffer = data_buffers_[i];
@@ -58,26 +53,13 @@ class ReduceCPUKernel : public LiteKernel {
int CallReduceUnit(int task_id);

private:
int CheckInputsOutputs();
int CheckParameters();
int MallocTmpBuffer();

private:
const lite::Context *context_ = nullptr;
bool keep_dims_;
int axes_[REDUCE_MAX_AXES_NUM];
int num_axes_;
int mode_;

private:
Reducer reducer_;
std::vector<float *> data_buffers_;
int outer_size_;
int inner_size_;
int axis_size_;
std::vector<int> tmp_shape_;
const float *src_data_;
float *dst_data_;
Reducer reducer_;

private:
int MallocTmpBuffer();
};
} // namespace mindspore::kernel



+ 323
- 0
mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc View File

@@ -0,0 +1,323 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <algorithm>
#include "schema/model_generated.h"
#include "src/runtime/runtime_api.h"
#include "src/kernel_registry.h"
#include "nnacl/quantization/quantize.h"
#include "include/errorcode.h"
#include "src/runtime/kernel/arm/int8/reduce_int8.h"

using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Reduce;
using mindspore::schema::ReduceMode_ReduceMax;
using mindspore::schema::ReduceMode_ReduceMean;
using mindspore::schema::ReduceMode_ReduceMin;
using mindspore::schema::ReduceMode_ReduceProd;
using mindspore::schema::ReduceMode_ReduceSum;
using mindspore::schema::ReduceMode_ReduceSumSquare;

namespace mindspore::kernel {
// Initializes the int8 Reduce kernel: runs the common base checks, allocates
// the intermediate int32 buffers, precomputes the fixed-point quantization
// multipliers, and selects the per-mode reduce functions.
// Returns RET_OK on success; otherwise propagates the first failing step's
// error code, or RET_ERROR for an unsupported mode.
int ReduceInt8CPUKernel::Init() {
auto ret = ReduceBaseCPUKernel::Init();
if (ret != RET_OK) {
return ret;
}
// NOTE(review): MallocTmpBuffer() runs before the InferShapeDone() check
// below, so buffer sizes come from the shapes available at Init time —
// confirm shapes are final here when inference was interrupted.
ret = MallocTmpBuffer();
if (ret != RET_OK) {
return ret;
}
ret = CalculateQuantArgs();
if (ret != RET_OK) {
return ret;
}

// Each mode pairs an intermediate reducer (int32 -> int32) with a last-axis
// reducer (int32 -> requantized int8 output).
switch (mode_) {
case static_cast<int>(ReduceMode_ReduceMean): {
reducer_ = ReduceMeanInt8;
last_reducer_ = ReduceMeanLastAxis;
break;
}
case static_cast<int>(ReduceMode_ReduceSum): {
reducer_ = ReduceSumInt8;
last_reducer_ = ReduceSumLastAxis;
break;
}

case static_cast<int>(ReduceMode_ReduceMax): {
reducer_ = ReduceMaxInt8;
last_reducer_ = ReduceMaxLastAxis;
break;
}
case static_cast<int>(ReduceMode_ReduceMin): {
reducer_ = ReduceMinInt8;
last_reducer_ = ReduceMinLastAxis;
break;
}
case static_cast<int>(ReduceMode_ReduceProd): {
reducer_ = ReduceProdInt8;
last_reducer_ = ReduceProdLastAxis;
break;
}
case static_cast<int>(ReduceMode_ReduceSumSquare): {
// In multi-axes reduce cases, sum square output different output for different reduce order
// e.g. axes [2, 3] is different from axes [3, 2].
reducer_ = ReduceSumSquareInt8;
last_reducer_ = ReduceSumSquareLastAxis;
break;
}
default:
MS_LOG(ERROR) << "Reduce unsupported reduce mode: " << mode_;
return RET_ERROR;
}
if (!InferShapeDone()) {
return RET_OK;
}
return ReSize();
}

// Precomputes the fixed-point multipliers used to requantize results:
// one shared input->output rescale multiplier, plus per-axis multipliers for
// the Mean, Prod and SumSquare modes. QuantMulArg objects are heap-allocated
// and owned by this kernel (freed in the destructor).
// Shift convention throughout: QuantizeMultiplierSmallerThanOne returns a
// single shift; a negative shift is stored as a left shift and a positive one
// as a right shift.
int ReduceInt8CPUKernel::CalculateQuantArgs() {
lite::tensor::Tensor *input = in_tensors_.at(0);
lite::tensor::Tensor *output = out_tensors_.at(0);
MS_ASSERT(input);
MS_ASSERT(output);

// NOTE(review): assumes both tensors carry at least one quant param —
// front() on an empty vector would be UB; confirm upstream guarantees this.
quant_arg_.in_scale_ = input->GetQuantParams().front().scale;
quant_arg_.in_zp_ = input->GetQuantParams().front().zeroPoint;
quant_arg_.out_scale_ = output->GetQuantParams().front().scale;
quant_arg_.out_zp_ = output->GetQuantParams().front().zeroPoint;

// (quant_out - out_zp) * out_scale = (quant_in - in_zp) * in_scale
const double input_output_multiplier = quant_arg_.in_scale_ / quant_arg_.out_scale_;
int shift;
QuantizeMultiplierSmallerThanOne(input_output_multiplier, &quant_arg_.in_out_multiplier_, &shift);
quant_arg_.in_out_left_shift_ = shift < 0 ? -shift : 0;
quant_arg_.in_out_right_shift_ = shift > 0 ? shift : 0;

// (quant_out - zp_out)*scale_out = sum((quant_in -zp)*scale_in) * (1/num) for each axis in axes
// quant_out = sum(quant_in-zp) * (scale_in/scale_out) * (1/num)
// One multiplier per reduced axis: 1 / (length of that axis).
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
for (auto i = 0; i < num_axes_; i++) {
auto axis = axes_[i];
double reciprocal = 1.0 / in_tensors_.at(0)->shape()[axis];
QuantMulArg *qm = new (std::nothrow) QuantMulArg;
if (qm == nullptr) {
MS_LOG(ERROR) << "Reduce new QuantMulArg failed.";
return RET_NULL_PTR;
}
QuantizeMultiplierSmallerThanOne(reciprocal, &qm->multiplier_, &shift);
qm->left_shift_ = shift < 0 ? -shift : 0;
qm->right_shift_ = shift > 0 ? shift : 0;
mean_multipliers_.push_back(qm);
}
}

// (quant_out - zp) * scale_out = prod(quant_in - zp) * scale_in^num
// quant_out = prod(quant_in-zp) * (scale_in^num/scale_out) + zp_out
// scale_in^num-1 * scale_in/scale_out
// One multiplier per reduced axis: scale_in^(axis_len - 1); the remaining
// scale_in/scale_out factor is the shared in_out multiplier above.
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceProd)) {
for (auto i = 0; i < num_axes_; i++) {
int axis_size = in_tensors_.at(0)->shape()[axes_[i]];
QuantMulArg *qm = new (std::nothrow) QuantMulArg;
if (qm == nullptr) {
MS_LOG(ERROR) << "ReduceProd new QuantMulArg failed.";
return RET_NULL_PTR;
}
double prod_multiplier = pow(quant_arg_.in_scale_, axis_size - 1);
QuantizeMultiplierSmallerThanOne(prod_multiplier, &qm->multiplier_, &shift);
qm->left_shift_ = shift < 0 ? -shift : 0;
qm->right_shift_ = shift > 0 ? shift : 0;
prod_multipliers_.push_back(qm);
}
}

// (quant_out - zp) * scale_out = sum((quant_in - zp)^2 * scale_in^2)
// quant_out = sum((quant_in - zp)^2) * scale_in^2 / scale_out + zp_out
// scale_in * scale_in/scale_out
// Intermediate axes use scale_in alone; the final axis (appended last) folds
// in the remaining scale_in / scale_out factor.
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceSumSquare)) {
for (auto i = 0; i < num_axes_ - 1; i++) {
QuantMulArg *qm = new (std::nothrow) QuantMulArg;
if (qm == nullptr) {
MS_LOG(ERROR) << "ReduceProd new QuantMultiplier failed.";
return RET_NULL_PTR;
}
double sumsquare_multiplier = quant_arg_.in_scale_;
QuantizeMultiplierSmallerThanOne(sumsquare_multiplier, &qm->multiplier_, &shift);
qm->left_shift_ = shift < 0 ? -shift : 0;
qm->right_shift_ = shift > 0 ? shift : 0;
sum_square_multipliers_.push_back(qm);
}

QuantMulArg *qm = new (std::nothrow) QuantMulArg;
if (qm == nullptr) {
MS_LOG(ERROR) << "ReduceProd new QuantMultiplier failed.";
return RET_NULL_PTR;
}
double sumsquare_multiplier = quant_arg_.in_scale_ * quant_arg_.in_scale_ / quant_arg_.out_scale_;
QuantizeMultiplierSmallerThanOne(sumsquare_multiplier, &qm->multiplier_, &shift);
qm->left_shift_ = shift < 0 ? -shift : 0;
qm->right_shift_ = shift > 0 ? shift : 0;
sum_square_multipliers_.push_back(qm);
}
return RET_OK;
}

// Allocates one int32 scratch buffer per intermediate reduce axis (sized for
// the shape with all previously reduced axes collapsed to 1), plus
// begin_src_data_, an int32 widening copy of the int8 input tensor.
// Returns RET_OK, RET_ERROR on scratch-buffer malloc failure, or
// RET_NULL_PTR if begin_src_data_ cannot be allocated.
int ReduceInt8CPUKernel::MallocTmpBuffer() {
auto input_shape = in_tensors_.at(0)->shape();
// The last axis reduces directly into the output tensor, so only
// num_axes_ - 1 intermediate buffers are needed.
for (auto i = 0; i < num_axes_ - 1; i++) {
int axis = axes_[i];
size_t size = 1;
for (auto j = 0; j < input_shape.size(); j++) {
if (static_cast<size_t>(axis) != j) {
size *= input_shape[j];
}
}
int32_t *buffer = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t)));
if (buffer == nullptr) {
MS_LOG(ERROR) << "Malloc data failed.";
return RET_ERROR;
}
data_buffers_.emplace_back(buffer);
// Collapse the reduced axis so the next buffer is sized for the shrunken shape.
input_shape[axis] = 1;
}

auto input = in_tensors_.at(0);
begin_src_data_ = reinterpret_cast<int32_t *>(malloc(sizeof(int32_t) * input->ElementsNum()));
if (begin_src_data_ == nullptr) {
return RET_NULL_PTR;
}
// NOTE(review): this copies the input tensor's data at Init() time, but
// input->Data() may not be populated until inference runs — confirm, or move
// this widening copy into Run().
auto input_data = reinterpret_cast<int8_t *>(input->Data());
for (auto i = 0; i < input->ElementsNum(); i++) {
begin_src_data_[i] = static_cast<int32_t>(input_data[i]);
}
return RET_OK;
}

// Thread-pool trampoline: unpacks the kernel from cdata and forwards the
// task id to its CallReduceUnit. Returns RET_OK, or RET_ERROR if the unit
// reports a failure.
int ReduceInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto *kernel = reinterpret_cast<ReduceInt8CPUKernel *>(cdata);
  auto error_code = kernel->CallReduceUnit(task_id);
  if (error_code == RET_OK) {
    return RET_OK;
  }
  MS_LOG(ERROR) << "Reduce Run error task_id[" << task_id << "] error_code[" << error_code << "]";
  return RET_ERROR;
}

// Executes the reduction: reduces each axis in axes_ in order. All but the
// last axis run in int32 through the scratch buffers; the last axis
// requantizes into the int8 output tensor. Each axis is parallelized via
// LiteBackendParallelLaunch, with outer/inner/axis sizes recomputed from the
// progressively collapsed tmp_shape_.
int ReduceInt8CPUKernel::Run() {
auto prepare_ret = Prepare();
if (prepare_ret != RET_OK) {
MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret;
return prepare_ret;
}

is_last_axis_ = false;
tmp_shape_ = in_tensors_.at(0)->shape();
src_data_ = begin_src_data_;

// Intermediate axes: data_buffers_ holds num_axes_ - 1 buffers, one per
// non-final axis. Before each pass, load the matching per-axis multiplier
// for the active mode into quant_arg_.
for (int i = 0; i < data_buffers_.size(); ++i) {
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
quant_arg_.mean_multiplier_ = mean_multipliers_[i]->multiplier_;
quant_arg_.mean_left_shift_ = mean_multipliers_[i]->left_shift_;
quant_arg_.mean_right_shift_ = mean_multipliers_[i]->right_shift_;
}

if (mode_ == static_cast<int>(schema::ReduceMode_ReduceProd)) {
quant_arg_.prod_multiplier_ = prod_multipliers_[i]->multiplier_;
quant_arg_.prod_left_shift_ = prod_multipliers_[i]->left_shift_;
quant_arg_.prod_right_shift_ = prod_multipliers_[i]->right_shift_;
}
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceSumSquare)) {
quant_arg_.sum_square_multiplier_ = sum_square_multipliers_[i]->multiplier_;
quant_arg_.sum_square_left_shift_ = sum_square_multipliers_[i]->left_shift_;
quant_arg_.sum_square_right_shift_ = sum_square_multipliers_[i]->right_shift_;
}
dst_data_ = data_buffers_[i];
int axis = axes_[i];
// outer = product of dims before the axis, inner = product after it.
outer_size_ = 1;
for (int j = 0; j < axis; j++) {
outer_size_ *= tmp_shape_[j];
}
inner_size_ = 1;
for (int k = axis + 1; k < static_cast<int>(tmp_shape_.size()); k++) {
inner_size_ *= tmp_shape_[k];
}
axis_size_ = tmp_shape_[axis];
auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
return RET_ERROR;
}
// Collapse the reduced axis and chain this pass's output into the next.
tmp_shape_[axis] = 1;
src_data_ = dst_data_;
}

// Final axis: use the last multiplier and write requantized int8 results
// straight into the output tensor.
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceMean)) {
quant_arg_.mean_multiplier_ = mean_multipliers_.back()->multiplier_;
quant_arg_.mean_left_shift_ = mean_multipliers_.back()->left_shift_;
quant_arg_.mean_right_shift_ = mean_multipliers_.back()->right_shift_;
}
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceProd)) {
quant_arg_.prod_multiplier_ = prod_multipliers_.back()->multiplier_;
quant_arg_.prod_left_shift_ = prod_multipliers_.back()->left_shift_;
quant_arg_.prod_right_shift_ = prod_multipliers_.back()->right_shift_;
}
if (mode_ == static_cast<int>(schema::ReduceMode_ReduceSumSquare)) {
quant_arg_.sum_square_multiplier_ = sum_square_multipliers_.back()->multiplier_;
quant_arg_.sum_square_left_shift_ = sum_square_multipliers_.back()->left_shift_;
quant_arg_.sum_square_right_shift_ = sum_square_multipliers_.back()->right_shift_;
}
int last_reduce_axis = axes_[num_axes_ - 1];
outer_size_ = 1;
for (int i = 0; i < last_reduce_axis; i++) {
outer_size_ *= tmp_shape_[i];
}
inner_size_ = 1;
for (int i = last_reduce_axis + 1; i < static_cast<int>(tmp_shape_.size()); i++) {
inner_size_ *= tmp_shape_[i];
}
axis_size_ = tmp_shape_[last_reduce_axis];
last_dst_data_ = reinterpret_cast<int8_t *>(out_tensors_.at(0)->Data());
is_last_axis_ = true;
auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_);
if (error_code != RET_OK) {
MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]";
return RET_ERROR;
}

// NOTE(review): begin_src_data_ is freed here and never reallocated, so a
// second Run() would start from a null src_data_ — confirm Run() is only
// invoked once per Init(), or move this free to the destructor.
if (begin_src_data_ != nullptr) {
free(begin_src_data_);
begin_src_data_ = nullptr;
}

return RET_OK;
}

// Runs one thread's share of the current reduce pass. Intermediate passes use
// reducer_ (int32 -> int32 scratch buffer); the final pass uses last_reducer_
// (int32 -> requantized int8 output). Returns the reducer's error code.
int ReduceInt8CPUKernel::CallReduceUnit(int task_id) {
  if (is_last_axis_) {
    return last_reducer_(outer_size_, inner_size_, axis_size_, src_data_, last_dst_data_, &quant_arg_, task_id,
                         context_->thread_num_);
  }
  return reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, &quant_arg_, task_id,
                  context_->thread_num_);
}
} // namespace mindspore::kernel

+ 98
- 0
mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.h View File

@@ -0,0 +1,98 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_REDUCE_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_REDUCE_INT8_H_

#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/reduce_parameter.h"
#include "nnacl/int8/reduce_int8.h"
#include "nnacl/quantization/quantize.h"
#include "ir/anf.h"
#include "src/runtime/kernel/arm/base/reduce_base.h"

using mindspore::schema::ReduceMode;

namespace mindspore::kernel {
// Int8 Reduce kernel. Reduces each configured axis in turn: intermediate
// axes accumulate in int32 scratch buffers; the final axis requantizes into
// the int8 output using the fixed-point multipliers computed in Init().
class ReduceInt8CPUKernel : public ReduceBaseCPUKernel {
  // Intermediate-axis reducer: int32 in, int32 out.
  typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                         int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
  // Final-axis reducer: int32 in, requantized int8 out.
  typedef int (*LastReducer)(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                             int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);

 public:
  ReduceInt8CPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx,
                      const lite::Primitive *primitive)
      : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
  ~ReduceInt8CPUKernel() {
    for (auto *buffer : data_buffers_) {
      if (buffer != nullptr) {
        free(buffer);
      }
    }
    // Fixed: begin_src_data_ is normally freed at the end of Run(), which
    // nulls it afterwards; free it here as well so it is not leaked when
    // Init() succeeds but Run() is never called.
    if (begin_src_data_ != nullptr) {
      free(begin_src_data_);
      begin_src_data_ = nullptr;
    }
    for (auto qm : mean_multipliers_) {
      delete qm;
    }
    for (auto qm : prod_multipliers_) {
      delete qm;
    }
    for (auto qm : sum_square_multipliers_) {
      delete qm;
    }
    src_data_ = nullptr;
    dst_data_ = nullptr;
  }

  int Init() override;
  int ReSize() override { return 0; }
  int Run() override;
  int CallReduceUnit(int task_id);
  // NOTE(review): declared but no definition visible in reduce_int8.cc —
  // confirm it is still needed.
  int ReduceLastAxis(int task_id);

 public:
  bool is_last_axis_ = true;  // selects last_reducer_ in CallReduceUnit()

 private:
  int MallocTmpBuffer();      // scratch buffers + int32 copy of the input
  int CalculateQuantArgs();   // fixed-point multipliers per mode/axis

 private:
  ReduceParameter *param_ = nullptr;
  ReduceQuantArg quant_arg_;

 private:
  int32_t *begin_src_data_ = nullptr;      // int32 widening copy of the int8 input
  int8_t *last_dst_data_ = nullptr;        // output tensor data (final pass)
  std::vector<int32_t *> data_buffers_;    // one int32 buffer per intermediate axis
  const int32_t *src_data_ = nullptr;      // current pass input (non-owning view)
  int32_t *dst_data_ = nullptr;            // current pass output (non-owning view)

  Reducer reducer_ = nullptr;
  LastReducer last_reducer_ = nullptr;
  // Per-axis multipliers, owned by this kernel (deleted in the destructor).
  std::vector<QuantMulArg *> mean_multipliers_;
  std::vector<QuantMulArg *> prod_multipliers_;
  std::vector<QuantMulArg *> sum_square_multipliers_;
};
} // namespace mindspore::kernel

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_REDUCE_INT8_H_

+ 2
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/errorcode.h View File

@@ -49,6 +49,8 @@ typedef enum ErrorCodeUint8OpEnum {

// Error codes reserved for int8 nnacl kernels (range 40000-49999).
typedef enum ErrorCodeInt8OpEnum {
  NNACL_ERRCODE_OP_INT8_START = 40000,
  NNACL_ERRCODE_ADD_OVERFLOW,  // an int32 addition would overflow
  NNACL_ERRCODE_MUL_OVERFLOW,  // an int32 multiplication would overflow
  NNACL_ERRCODE_OP_INT8_END = 49999
} ErrorCodeInt8OpEnums;



+ 1
- 8
mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/reduce.h View File

@@ -17,15 +17,8 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_REDUCE_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_FP32_REDUCE_H_
#include "nnacl/op_base.h"
#define REDUCE_MAX_AXES_NUM 8
#include "src/runtime/kernel/arm/nnacl/reduce_parameter.h"

typedef struct ReduceParameter {
OpParameter op_parameter_;
bool keep_dims_;
int axes_[REDUCE_MAX_AXES_NUM];
int num_axes_;
int mode_;
} ReduceParameter;

#ifdef __cplusplus
extern "C" {


+ 467
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.c View File

@@ -0,0 +1,467 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <stdint.h>
#include "nnacl/int8/reduce_int8.h"
#include "nnacl/errorcode.h"
#include "nnacl/quantization/fixed_point.h"

// Returns true if x + y would overflow int32.
// Fixed: the previous implementation computed the int32 sum first, which is
// undefined behavior precisely in the overflow case it tries to detect.
// The check is now done in 64-bit arithmetic, which cannot overflow here.
inline bool isAddOverflow(int32_t x, int32_t y) {
  const int64_t sum = (int64_t)x + (int64_t)y;
  return sum > (int64_t)INT32_MAX || sum < (int64_t)INT32_MIN;
}

// Returns true if x * y would overflow int32.
// Fixed: the previous implementation computed the int32 product first
// (undefined behavior on overflow) and then divided by x, which is itself
// undefined for x == -1, y == INT32_MIN. The 64-bit product cannot overflow.
inline bool isMulOverflow(int32_t x, int32_t y) {
  const int64_t prod = (int64_t)x * (int64_t)y;
  return prod > (int64_t)INT32_MAX || prod < (int64_t)INT32_MIN;
}

// Get x such that (x-zp_in) * scale_in = mean
// Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
//
// Mean-reduce over one axis of the flattened [outer, axis, inner] view,
// keeping the result in the *input* quant domain so further reduce passes can
// chain; only the final pass (ReduceMeanLastAxis) requantizes to the output.
// Rows are interleaved across threads via (tid, thread_num).
// Returns NNACL_OK, NNACL_NULL_PTR or NNACL_ERRCODE_ADD_OVERFLOW.
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int32_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int32_t *inner_dst = outer_dst + k;
      int32_t sum = 0;
      // (x - zp_in) * scale_in = mean[(item - zp_in) * scale_in]
      // x = mean(item-zp_in) + zp_in
      for (i = 0; i < axis_size; i++) {
        int32_t tmp = inner_src[i * inner_size] - quant->in_zp_;
        if (isAddOverflow(sum, tmp)) {
          return NNACL_ERRCODE_ADD_OVERFLOW;
        }
        sum += tmp;
      }
      // sum / axis_size via the precomputed fixed-point reciprocal
      // (mean_multiplier_ with mean_left/right_shift_).
      int32_t mean = RoundingDivideByPOT(
        SaturatingRoundingDoublingHighMul(sum * (1 << (unsigned int)quant->mean_left_shift_), quant->mean_multiplier_),
        quant->mean_right_shift_);
      if (isAddOverflow(mean, quant->in_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      *inner_dst = mean + quant->in_zp_;
    }
  }
  return NNACL_OK;
}

// suppose reduce n axes, this works for last reduce axis.
// get y such that (y-zp_out) * scale_out = mean(x-zp_in)*scale_in
//
// Final mean-reduce pass: averages in the input domain, then requantizes
// through the in->out fixed-point multiplier and saturates to int8.
int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int8_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int8_t *inner_dst = outer_dst + k;
      int32_t sum = 0;
      for (i = 0; i < axis_size; i++) {
        // y = mean(x-zp_in) * scale + zp_out
        int32_t tmp = inner_src[i * inner_size] - quant->in_zp_;
        if (isAddOverflow(tmp, sum)) {
          return NNACL_ERRCODE_ADD_OVERFLOW;
        }
        sum += tmp;
      }
      // sum / axis_size (fixed-point reciprocal)
      int32_t mean = RoundingDivideByPOT(
        SaturatingRoundingDoublingHighMul(sum * (1 << (unsigned int)quant->mean_left_shift_), quant->mean_multiplier_),
        quant->mean_right_shift_);
      // rescale from input scale to output scale
      int32_t mean_scaled =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(mean * (1 << (unsigned int)quant->in_out_left_shift_),
                                                              quant->in_out_multiplier_),
                            quant->in_out_right_shift_);
      if (isAddOverflow(mean_scaled, quant->out_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      mean = mean_scaled + quant->out_zp_;

      // saturate to the int8 output range
      if (mean > INT8_MAX) {
        *inner_dst = INT8_MAX;
      } else if (mean < INT8_MIN) {
        *inner_dst = INT8_MIN;
      } else {
        *inner_dst = (int8_t)mean;
      }
    }
  }
  return NNACL_OK;
}

// Sum-reduce over one axis of the flattened [outer, axis, inner] view.
// Output x satisfies (x - zp_in) * scale_in = sum((item - zp_in) * scale_in),
// i.e. the result stays in the input quant domain so further reduce passes can
// chain; ReduceSumLastAxis performs the final requantization.
// Returns NNACL_OK, NNACL_NULL_PTR or NNACL_ERRCODE_ADD_OVERFLOW.
int ReduceSumInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  for (int outer = tid; outer < outer_size; outer += thread_num) {
    const int32_t *src_plane = src_data + outer * axis_size * inner_size;
    int32_t *dst_row = dst_data + outer * inner_size;
    for (int inner = 0; inner < inner_size; ++inner) {
      int32_t acc = 0;
      for (int a = 0; a < axis_size; ++a) {
        const int32_t centered = src_plane[a * inner_size + inner] - quant->in_zp_;
        if (isAddOverflow(centered, acc)) {
          return NNACL_ERRCODE_ADD_OVERFLOW;
        }
        acc += centered;
      }
      if (isAddOverflow(quant->in_zp_, acc)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      dst_row[inner] = acc + quant->in_zp_;
    }
  }
  return NNACL_OK;
}

// suppose reduce n axes, this works for last reduce axis.
// get y such that (y-zp_out) * scale_out = sum(item-zp_in)*scale_in
//
// Final sum-reduce pass: accumulates in the input domain, rescales through
// the in->out fixed-point multiplier, then saturates to int8.
int ReduceSumLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int8_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int8_t *inner_dst = outer_dst + k;
      int32_t sum = 0;
      for (i = 0; i < axis_size; i++) {
        int32_t tmp = inner_src[i * inner_size] - quant->in_zp_;
        if (isAddOverflow(tmp, sum)) {
          return NNACL_ERRCODE_ADD_OVERFLOW;
        }
        sum += tmp;
      }
      // rescale from input scale to output scale
      int32_t sum_scaled =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum * (1 << (unsigned int)quant->in_out_left_shift_),
                                                              quant->in_out_multiplier_),
                            quant->in_out_right_shift_);
      if (isAddOverflow(sum_scaled, quant->out_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      sum = sum_scaled + quant->out_zp_;
      // saturate to the int8 output range
      if (sum > INT8_MAX) {
        *inner_dst = INT8_MAX;
      } else if (sum < INT8_MIN) {
        *inner_dst = INT8_MIN;
      } else {
        *inner_dst = (int8_t)sum;
      }
    }
  }
  return NNACL_OK;
}

// Final max-reduce pass: takes the max in the input domain, rescales the
// zero-point-centered value through the in->out fixed-point multiplier, then
// saturates to int8.
// NOTE(review): the running max starts at INT8_MIN — assumes the int32
// working values originated from int8 data, so INT8_MIN is a valid floor.
int ReduceMaxLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int8_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int8_t *inner_dst = outer_dst + k;
      int32_t tmp = INT8_MIN;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp > inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      // center on the input zero point and rescale to the output scale
      int32_t tmp_scaled = RoundingDivideByPOT(
        SaturatingRoundingDoublingHighMul((tmp - quant->in_zp_) * (1 << (unsigned int)quant->in_out_left_shift_),
                                          quant->in_out_multiplier_),
        quant->in_out_right_shift_);
      if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      tmp = tmp_scaled + quant->out_zp_;
      // saturate to the int8 output range
      if (tmp > INT8_MAX) {
        *inner_dst = INT8_MAX;
      } else if (tmp < INT8_MIN) {
        *inner_dst = INT8_MIN;
      } else {
        *inner_dst = (int8_t)tmp;
      }
    }
  }
  return NNACL_OK;
}

// Max-reduce over one axis, staying in the input quant domain; the final
// pass (ReduceMaxLastAxis) performs the requantization. `quant` is unused
// here but kept so the function matches the common Reducer signature.
// The running max starts at INT8_MIN, a valid floor for int8-sourced data.
int ReduceMaxInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  for (int outer = tid; outer < outer_size; outer += thread_num) {
    const int32_t *src_plane = src_data + outer * axis_size * inner_size;
    int32_t *dst_row = dst_data + outer * inner_size;
    for (int inner = 0; inner < inner_size; ++inner) {
      int32_t best = INT8_MIN;
      for (int a = 0; a < axis_size; ++a) {
        const int32_t v = src_plane[a * inner_size + inner];
        if (v > best) {
          best = v;
        }
      }
      dst_row[inner] = best;
    }
  }
  return NNACL_OK;
}

// Final min-reduce pass: takes the min in the input domain, rescales the
// zero-point-centered value through the in->out fixed-point multiplier, then
// saturates to int8. The running min starts at INT8_MAX (valid ceiling for
// int8-sourced data).
int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  // NOTE(review): 20 extra bits are added to both the left and right shift,
  // leaving the net scale unchanged — presumably for extra intermediate
  // precision/headroom. ReduceMaxLastAxis does not do this; confirm whether
  // the asymmetry is intentional.
  int base_offset = 20;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int8_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int8_t *inner_dst = outer_dst + k;
      int32_t tmp = INT8_MAX;
      for (i = 0; i < axis_size; i++) {
        tmp = tmp < inner_src[i * inner_size] ? tmp : inner_src[i * inner_size];
      }
      // shift amount is in_out_left_shift_ + base_offset: the cast binds to
      // in_out_left_shift_ only, then base_offset is added before the <<
      int32_t tmp_scaled =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
                              (tmp - quant->in_zp_) * (1 << (unsigned int)quant->in_out_left_shift_ + base_offset),
                              quant->in_out_multiplier_),
                            quant->in_out_right_shift_ + base_offset);
      if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      tmp = tmp_scaled + quant->out_zp_;
      // saturate to the int8 output range
      if (tmp > INT8_MAX) {
        *inner_dst = INT8_MAX;
      } else if (tmp < INT8_MIN) {
        *inner_dst = INT8_MIN;
      } else {
        *inner_dst = (int8_t)tmp;
      }
    }
  }
  return NNACL_OK;
}

// Min-reduce over one axis, staying in the input quant domain; the final
// pass (ReduceMinLastAxis) performs the requantization. `quant` is unused
// here but kept so the function matches the common Reducer signature.
// The running min starts at INT8_MAX, a valid ceiling for int8-sourced data.
int ReduceMinInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  for (int outer = tid; outer < outer_size; outer += thread_num) {
    const int32_t *src_plane = src_data + outer * axis_size * inner_size;
    int32_t *dst_row = dst_data + outer * inner_size;
    for (int inner = 0; inner < inner_size; ++inner) {
      int32_t best = INT8_MAX;
      for (int a = 0; a < axis_size; ++a) {
        const int32_t v = src_plane[a * inner_size + inner];
        if (v < best) {
          best = v;
        }
      }
      dst_row[inner] = best;
    }
  }
  return NNACL_OK;
}

// Final prod-reduce pass:
// quant_out = prod(quant_in - zp_in) * (scale_in^axis_size / scale_out) + zp_out,
// realized as two fixed-point rescales (prod_* then in_out_*), then int8 saturation.
int ReduceProdLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int8_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int8_t *inner_dst = outer_dst + k;
      int32_t prod = 1;
      for (i = 0; i < axis_size; i++) {
        // quant_out = prod(quant_in-zp) * (scale_in^num/scale_out) + zp_out
        int32_t tmp = inner_src[i * inner_size] - quant->in_zp_;
        if (isMulOverflow(prod, tmp)) {
          return NNACL_ERRCODE_MUL_OVERFLOW;
        }
        prod *= tmp;
      }
      // first rescale: fold in scale_in^(axis_size-1) via the prod multiplier
      prod = RoundingDivideByPOT(
        SaturatingRoundingDoublingHighMul(prod * (1 << (unsigned int)quant->prod_left_shift_), quant->prod_multiplier_),
        quant->prod_right_shift_);
      // second rescale: input scale -> output scale
      int32_t prod_scaled =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(prod * (1 << (unsigned int)quant->in_out_left_shift_),
                                                              quant->in_out_multiplier_),
                            quant->in_out_right_shift_);
      if (isAddOverflow(prod_scaled, quant->out_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      prod = prod_scaled + quant->out_zp_;
      // saturate to the int8 output range
      if (prod > INT8_MAX) {
        *inner_dst = INT8_MAX;
      } else if (prod < INT8_MIN) {
        *inner_dst = INT8_MIN;
      } else {
        *inner_dst = (int8_t)prod;
      }
    }
  }
  return NNACL_OK;
}

// Prod-reduce over one axis, keeping the result in the input quant domain
// (rescaled by the prod_* fixed-point multiplier so that chained passes keep
// a consistent scale); ReduceProdLastAxis performs the final requantization.
int ReduceProdInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int32_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int32_t *inner_dst = outer_dst + k;
      int32_t prod = 1;
      for (i = 0; i < axis_size; i++) {
        int32_t tmp = inner_src[i * inner_size] - quant->in_zp_;
        if (isMulOverflow(prod, tmp)) {
          return NNACL_ERRCODE_MUL_OVERFLOW;
        }
        prod *= tmp;
      }
      // rescale back toward the input scale via the prod multiplier
      prod = RoundingDivideByPOT(
        SaturatingRoundingDoublingHighMul(prod * (1 << (unsigned int)quant->prod_left_shift_), quant->prod_multiplier_),
        quant->prod_right_shift_);
      if (isAddOverflow(prod, quant->in_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      *inner_dst = prod + quant->in_zp_;  // re-add input zero point; overflow checked above
    }
  }
  return NNACL_OK;
}

// Final sum-of-squares pass:
// quant_out = sum((quant_in - zp_in)^2) * scale_in^2 / scale_out + zp_out,
// realized with the sum_square_* fixed-point multiplier, then int8 saturation.
int ReduceSumSquareLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                            int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int8_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int8_t *inner_dst = outer_dst + k;
      int32_t sum = 0;
      // quant_out = sum((quant_in - zp)^2) * scale_in^2 / scale_out + zp_out
      for (i = 0; i < axis_size; i++) {
        int32_t tmp;
        if (isMulOverflow(inner_src[i * inner_size] - quant->in_zp_, inner_src[i * inner_size] - quant->in_zp_)) {
          return NNACL_ERRCODE_MUL_OVERFLOW;
        }
        tmp = (inner_src[i * inner_size] - quant->in_zp_) * (inner_src[i * inner_size] - quant->in_zp_);
        if (isAddOverflow(sum, tmp)) {
          return NNACL_ERRCODE_ADD_OVERFLOW;
        }
        sum += tmp;
      }
      // rescale scale_in^2 -> scale_out via the sum_square multiplier
      int32_t sum_scaled =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum * (1 << (unsigned int)quant->sum_square_left_shift_),
                                                              quant->sum_square_multiplier_),
                            quant->sum_square_right_shift_);
      if (isAddOverflow(sum_scaled, quant->out_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      sum = sum_scaled + quant->out_zp_;

      // saturate to the int8 output range
      if (sum > INT8_MAX) {
        *inner_dst = INT8_MAX;
      } else if (sum < INT8_MIN) {
        *inner_dst = INT8_MIN;
      } else {
        *inner_dst = (int8_t)sum;
      }
    }
  }
  return NNACL_OK;
}

// Sum-of-squares reduce over one axis, rescaled by the sum_square_* fixed-point
// multiplier and kept in the input quant domain (zero point re-added) so that
// further reduce passes can chain; the last pass requantizes to int8.
int ReduceSumSquareInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                        int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num) {
  if (src_data == NULL || dst_data == NULL) {
    return NNACL_NULL_PTR;
  }
  int i, j, k;
  for (j = tid; j < outer_size; j += thread_num) {
    const int32_t *outer_src = src_data + j * axis_size * inner_size;
    int32_t *outer_dst = dst_data + j * inner_size;
    for (k = 0; k < inner_size; k++) {
      const int32_t *inner_src = outer_src + k;
      int32_t *inner_dst = outer_dst + k;
      int32_t sum = 0;
      for (i = 0; i < axis_size; i++) {
        int32_t tmp;
        if (isMulOverflow(inner_src[i * inner_size] - quant->in_zp_, inner_src[i * inner_size] - quant->in_zp_)) {
          return NNACL_ERRCODE_MUL_OVERFLOW;
        }
        tmp = (inner_src[i * inner_size] - quant->in_zp_) * (inner_src[i * inner_size] - quant->in_zp_);
        if (isAddOverflow(sum, tmp)) {
          return NNACL_ERRCODE_ADD_OVERFLOW;
        }
        sum += tmp;
      }
      // rescale the squared-domain sum via the sum_square multiplier
      sum =
        RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(sum * (1 << (unsigned int)quant->sum_square_left_shift_),
                                                              quant->sum_square_multiplier_),
                            quant->sum_square_right_shift_);
      if (isAddOverflow(sum, quant->in_zp_)) {
        return NNACL_ERRCODE_ADD_OVERFLOW;
      }
      *inner_dst = sum + quant->in_zp_;
    }
  }
  return NNACL_OK;
}

+ 53
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/int8/reduce_int8.h View File

@@ -0,0 +1,53 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_REDUCE_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_REDUCE_INT8_H_
#include "nnacl/quantization/quantize.h"
#ifdef __cplusplus
extern "C" {
#endif

// Int8 reduce kernels over a flattened [outer, axis, inner] view.
// The *Int8 variants reduce one non-final axis and keep int32 results in the
// input quant domain; the *LastAxis variants reduce the final axis and
// requantize to int8. Work is split across threads via (tid, thread_num).
// All return NNACL_OK or an NNACL_* error code.
int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceSumInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceSumLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMaxInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMaxLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMinInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                  int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                      int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceProdLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                       int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceProdInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                   int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceSumSquareLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                            int8_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
int ReduceSumSquareInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *src_data,
                        int32_t *dst_data, const ReduceQuantArg *quant, const int tid, const int thread_num);
// Overflow predicates for int32 arithmetic used by the kernels above.
bool isAddOverflow(int32_t x, int32_t y);
bool isMulOverflow(int32_t x, int32_t y);
#ifdef __cplusplus
}
#endif
#endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_INT8_REDUCE_INT8_H_

+ 20
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/quantization/quantize.h View File

@@ -219,6 +219,26 @@ typedef struct DivQuantArg {
int output_multiplier_;
int output_shift_;
} DivQuantArg;

// Quantization parameters consumed by the int8 reduce kernels.
// The (multiplier, left_shift, right_shift) triples are fixed-point factors
// applied as SaturatingRoundingDoublingHighMul + RoundingDivideByPOT.
typedef struct ReduceQuantArg {
  double in_scale_;   // input tensor scale
  int32_t in_zp_;     // input tensor zero point
  double out_scale_;  // output tensor scale
  int32_t out_zp_;    // output tensor zero point
  // in_scale_/out_scale_ requantization factor (used by the *LastAxis kernels)
  int32_t in_out_multiplier_;
  int in_out_left_shift_;
  int in_out_right_shift_;
  // mean: presumably encodes 1/axis_size — confirm in CalculateQuantArgs
  int32_t mean_multiplier_;
  int mean_left_shift_;
  int mean_right_shift_;
  // prod: scale correction for products of centered values
  int32_t prod_multiplier_;
  int prod_left_shift_;
  int prod_right_shift_;
  // sum-square: scale correction for sums of squared centered values
  int32_t sum_square_multiplier_;
  int sum_square_left_shift_;
  int sum_square_right_shift_;
} ReduceQuantArg;

#ifdef __cplusplus
extern "C" {
#endif


+ 30
- 0
mindspore/lite/src/runtime/kernel/arm/nnacl/reduce_parameter.h View File

@@ -0,0 +1,30 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_REDUCE_PARAMETER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_REDUCE_PARAMETER_H_
#include "nnacl/op_base.h"
#define REDUCE_MAX_AXES_NUM 8

// Reduce op parameter shared by fp32 and int8 kernels.
// Fixed: declared with a typedef so that C translation units (this header is
// consumed via extern "C" nnacl code, and the fp32/reduce.h it replaced used
// a typedef) can refer to the plain name `ReduceParameter`, not only
// `struct ReduceParameter`. Fully backward compatible for C++ users.
typedef struct ReduceParameter {
  OpParameter op_parameter_;
  bool keep_dims_;                 // keep reduced dims as size-1 axes
  int axes_[REDUCE_MAX_AXES_NUM];  // axes to reduce, first num_axes_ entries valid
  int num_axes_;
  int mode_;  // schema::ReduceMode value
} ReduceParameter;

#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_REDUCE_PARAMETER_H_

+ 1
- 1
mindspore/lite/src/runtime/kernel/arm/nnacl/resize_parameter.h View File

@@ -16,7 +16,7 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_RESIZE_PARAMETER_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_NNACL_RESIZE_PARAMETER_H_

#include "src/runtime/kernel/arm/nnacl/op_base.h"
#include "nnacl/op_base.h"
typedef struct ResizeParameter {
OpParameter op_parameter_;
int method_;


+ 157
- 114
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_bilinear_fp32_tests.cc View File

@@ -13,204 +13,255 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <vector>
#include "mindspore/lite/src/lite_kernel.h"
#include "mindspore/lite/src/ir/tensor.h"
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/resize.h"
#include "nnacl/resize_parameter.h"
#include "mindspore/lite/src/kernel_registry.h"

namespace mindspore {

// Fixture for fp32 bilinear-resize kernel tests: builds the kernel through
// the registry in Prepare() and compares kernel output against expectations.
class TestResizeBilinearFp32 : public mindspore::CommonTest {
 public:
  TestResizeBilinearFp32() = default;
  // Wires the tensors to caller-owned buffers, fills the resize parameter and
  // creates kernel_ via the registered creator.
  void Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape, float *input_data,
               float *output_data, const bool align_corners, const int thread_num);

  void TearDown() override;

 public:
  int tid = 0;           // task id for single-threaded runs
  int thread_num = 1;
  float err_tol = 1e-5;  // tolerance for CompareOutputData
  lite::tensor::Tensor in_tensor_;
  lite::tensor::Tensor out_tensor_;
  std::vector<lite::tensor::Tensor *> inputs_{&in_tensor_};
  std::vector<lite::tensor::Tensor *> outputs_{&out_tensor_};
  ResizeParameter param_ = {{}};  // must outlive kernel_, which keeps a pointer to it
  kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
  lite::Context ctx_ = lite::Context();
  kernel::KernelCreator creator_ = nullptr;
  kernel::LiteKernel *kernel_ = nullptr;
};

// Detach the caller-owned stack buffers that Prepare() attached via SetData,
// so tensor cleanup does not release memory the test does not own.
void TestResizeBilinearFp32::TearDown() {
  in_tensor_.SetData(nullptr);
  out_tensor_.SetData(nullptr);
}

void TestResizeBilinearFp32::Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape,
float *input_data, float *output_data, const bool align_corners,
const int thread_num) {
in_tensor_.set_data_type(kNumberTypeFloat32);
in_tensor_.set_shape(input_shape);
out_tensor_.set_data_type(kNumberTypeFloat32);
out_tensor_.set_shape(output_shape);
in_tensor_.SetData(input_data);
out_tensor_.SetData(output_data);

ResizeParameter param_ = {
{}, static_cast<int>(schema::ResizeMethod_BILINEAR), output_shape[1], output_shape[2], align_corners};
desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
ctx_ = lite::Context();
ctx_.thread_num_ = thread_num;
creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc);
ASSERT_NE(creator_, nullptr);
kernel_ = creator_(inputs_, outputs_, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc, nullptr);
ASSERT_NE(kernel_, nullptr);
}

// 1*1 -> 1*1
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest1) {
std::vector<float> input = {1.0};
float input_data[] = {1.0f};
float output_data[1] = {0};
std::vector<int> input_shape = {1, 1, 1, 1};
std::vector<int> output_shape = {1, 1, 1, 1};
std::vector<float> expect = {1.0};
bool align_corners = false;

auto output_size = 1;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 1*1
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest2) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[1] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 1, 1, 1};
std::vector<float> expect = {0.0};
bool align_corners = false;

int output_size = 1;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 1*2
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest3) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[2] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 1, 2, 1};
std::vector<float> expect = {0.0, 1.0};
bool align_corners = false;

auto output_size = 2;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 2*1
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest4) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[2] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 2, 1, 1};
std::vector<float> expect = {0.0, 2.0};
bool align_corners = false;

auto output_size = 2;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 2*2
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest5) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[4] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 2, 2, 1};
std::vector<float> expect = {0.0, 1.0, 2.0, 3.0};
bool align_corners = false;

auto output_size = 4;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 1*4
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest6) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[4] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 1, 4, 1};
std::vector<float> expect = {0.0, 0.5, 1.0, 1.0};
bool align_corners = false;

auto output_size = 4;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 4*1
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest7) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[4] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 4, 1, 1};
std::vector<float> expect = {0.0, 1.0, 2.0, 2.0};
bool align_corners = false;

auto output_size = 4;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 2*4
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest8) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0};
float output_data[8] = {0};
std::vector<int> input_shape = {1, 2, 2, 1};
std::vector<int> output_shape = {1, 2, 4, 1};
std::vector<float> expect = {0.0, 0.5, 1.0, 1.0, 2.0, 2.5, 3.0, 3.0};
bool align_corners = false;

auto output_size = 8;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 4*2
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest9) {
  // Height-only upscale: 1x2x2x1 -> 1x4x2x1, half-pixel mapping (align_corners = false).
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[8] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 4, 2, 1};
  std::vector<float> expect = {0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 2.0, 3.0};
  bool align_corners = false;
  auto output_size = 8;

  // Kernel-based execution only; the duplicated pre-refactor direct call was removed.
  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 3*3
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest10) {
  // Non-integer scale: 1x2x2x1 -> 1x3x3x1 (align_corners = false).
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[9] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 3, 3, 1};
  std::vector<float> expect = {0.0, 0.6666667, 1.0, 1.3333334, 2.0, 2.3333335, 2.0, 2.6666667, 3.0};
  bool align_corners = false;
  auto output_size = 9;

  // Kernel-based execution only; the duplicated pre-refactor direct call was removed.
  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 4*4
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest11) {
  // 2x upscale in both spatial dims: 1x2x2x1 -> 1x4x4x1 (align_corners = false).
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[16] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 4, 4, 1};
  std::vector<float> expect = {0.0, 0.5, 1.0, 1.0, 1.0, 1.5, 2.0, 2.0, 2.0, 2.5, 3.0, 3.0, 2.0, 2.5, 3.0, 3.0};
  bool align_corners = false;
  auto output_size = 16;

  // Kernel-based execution only; the duplicated pre-refactor direct call was removed.
  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest12) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -224,20 +275,21 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest12) {
33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
34.0, 32.5, 33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
bool align_corners = false;

auto output_size = 160;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5 align corners
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest13) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -258,20 +310,21 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest13) {
30.0, 31.0, 32.0, 33.0, 34.0, 31.666666, 32.666668, 33.666668, 34.666668, 35.666668,
33.333332, 34.333332, 35.333332, 36.333332, 37.333332, 35.0, 36.0, 37.0, 38.0, 39.0};
bool align_corners = true;

auto output_size = 160;
std::vector<float> output(output_size, 0.0);

ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5 thread_num 2
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest14) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -285,24 +338,22 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest14) {
33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
34.0, 32.5, 33.5, 34.5, 35.5, 36.5, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
bool align_corners = false;

auto output_size = 160;
std::vector<float> output(output_size, 0.0);
thread_num = 2;
tid = 0;
ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
tid = 1;
ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
int thread_num = 2;

Prepare(input_shape, output_shape, input_data, output_data, align_corners, thread_num);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5 thread_num 4
TEST_F(TestResizeBilinearFp32, ResizeBilinearTest15) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -319,19 +370,11 @@ TEST_F(TestResizeBilinearFp32, ResizeBilinearTest15) {

auto output_size = 160;
std::vector<float> output(output_size, 0.0);
thread_num = 4;
tid = 0;
ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
tid = 1;
ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
tid = 2;
ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
tid = 3;
ResizeBilinear(input.data(), output.data(), input_shape.data(), output_shape.data(), align_corners, tid,
thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
int thread_num = 4;
Prepare(input_shape, output_shape, input_data, output_data, align_corners, thread_num);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}
} // namespace mindspore

+ 160
- 79
mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/resize_nearest_neighbor_fp32_tests.cc View File

@@ -15,168 +15,250 @@
*/
#include <vector>
#include "common/common_test.h"
#include "mindspore/lite/src/runtime/kernel/arm/nnacl/fp32/resize.h"
#include "nnacl/resize_parameter.h"
#include "mindspore/lite/src/kernel_registry.h"

namespace mindspore {

// Fixture for the fp32 nearest-neighbor resize kernel.  Tensors wrap caller-owned
// stack buffers; Prepare() builds the kernel through the registry and TearDown()
// detaches the buffers so the tensors never try to free them.
class TestResizeNearestNeighborFp32 : public mindspore::CommonTest {
public:
TestResizeNearestNeighborFp32() = default;
// Configures tensors/parameter/context and creates kernel_ via the registry.
void Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape, float *input_data,
float *output_data, const bool align_corners, const int thread_num);

void TearDown() override;

public:
// tid/thread_num remain for tests that still drive the nnacl function directly.
int tid = 0;
int thread_num = 1;
float err_tol = 1e-5;
lite::tensor::Tensor in_tensor_;
lite::tensor::Tensor out_tensor_;
std::vector<lite::tensor::Tensor *> inputs_{&in_tensor_};
std::vector<lite::tensor::Tensor *> outputs_{&out_tensor_};
// Must outlive kernel_: the kernel stores a pointer to this parameter struct.
ResizeParameter param_ = {{}};
kernel::KernelKey desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
lite::Context ctx_ = lite::Context();
kernel::KernelCreator creator_ = nullptr;
kernel::LiteKernel *kernel_ = nullptr;
};

// Detach the caller-owned buffers so tensor destruction does not free stack memory.
void TestResizeNearestNeighborFp32::TearDown() {
in_tensor_.SetData(nullptr);
out_tensor_.SetData(nullptr);
}

void TestResizeNearestNeighborFp32::Prepare(const std::vector<int> &input_shape, const std::vector<int> &output_shape,
                                            float *input_data, float *output_data, const bool align_corners,
                                            const int thread_num) {
  // Wire caller-owned buffers into the tensors; TearDown() detaches them again.
  in_tensor_.set_data_type(kNumberTypeFloat32);
  in_tensor_.set_shape(input_shape);
  out_tensor_.set_data_type(kNumberTypeFloat32);
  out_tensor_.set_shape(output_shape);
  in_tensor_.SetData(input_data);
  out_tensor_.SetData(output_data);

  // Assign to the *member* param_ — a function-local ResizeParameter here would
  // shadow it and dangle: the kernel keeps a pointer to the struct after Prepare returns.
  param_ = {
    {}, static_cast<int>(schema::ResizeMethod_NEAREST_NEIGHBOR), output_shape[1], output_shape[2], align_corners};
  desc = {kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Resize};
  ctx_ = lite::Context();
  ctx_.thread_num_ = thread_num;
  creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  ASSERT_NE(creator_, nullptr);
  kernel_ = creator_(inputs_, outputs_, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc, nullptr);
  ASSERT_NE(kernel_, nullptr);
}
// 1*1 -> 1*1
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest1) {
  // Identity resize: 1x1x1x1 -> 1x1x1x1.
  float input_data[] = {1.0};
  float output_data[1] = {0};
  std::vector<int> input_shape = {1, 1, 1, 1};
  std::vector<int> output_shape = {1, 1, 1, 1};
  std::vector<float> expect = {1.0};
  size_t output_size = 1;
  bool align_corners = false;

  // Kernel-based execution only; the duplicated pre-refactor direct call was removed.
  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 1*1
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest2) {
  // Downscale 2x2 -> 1x1: nearest neighbor picks the top-left element.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[1] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 1, 1, 1};
  std::vector<float> expect = {0.0};
  size_t output_size = 1;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 1*2
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest3) {
  // Downscale height only: 2x2 -> 1x2.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[2] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 1, 2, 1};
  std::vector<float> expect = {0.0, 1.0};
  size_t output_size = 2;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 2*1
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest4) {
  // Downscale width only: 2x2 -> 2x1.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[2] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 2, 1, 1};
  std::vector<float> expect = {0.0, 2.0};
  size_t output_size = 2;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 2*2
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest5) {
  // Same-size resize: 2x2 -> 2x2 is a pass-through.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[4] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 2, 2, 1};
  std::vector<float> expect = {0.0, 1.0, 2.0, 3.0};
  size_t output_size = 4;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 1*4
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest6) {
  // 2x2 -> 1x4: width doubled, height collapsed to the first row.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[4] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 1, 4, 1};
  std::vector<float> expect = {0.0, 0.0, 1.0, 1.0};
  size_t output_size = 4;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 4*1
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest7) {
  // 2x2 -> 4x1: height doubled, width collapsed to the first column.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[4] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 4, 1, 1};
  std::vector<float> expect = {0.0, 0.0, 2.0, 2.0};
  size_t output_size = 4;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 2*4
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest8) {
  // Width-only upscale: 2x2 -> 2x4.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[8] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 2, 4, 1};
  std::vector<float> expect = {0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0};
  size_t output_size = 8;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 4*2
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest9) {
  // Height-only upscale: 2x2 -> 4x2.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[8] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 4, 2, 1};
  std::vector<float> expect = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
  size_t output_size = 8;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 3*3
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest10) {
  // Non-integer scale: 2x2 -> 3x3.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[9] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 3, 3, 1};
  std::vector<float> expect = {0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 2.0, 3.0};
  size_t output_size = 9;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2 -> 4*4
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest11) {
  // 2x upscale in both spatial dims: 2x2 -> 4x4.
  float input_data[] = {0.0, 1.0, 2.0, 3.0};
  float output_data[16] = {0};
  std::vector<int> input_shape = {1, 2, 2, 1};
  std::vector<int> output_shape = {1, 4, 4, 1};
  std::vector<float> expect = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 3.0, 3.0};
  size_t output_size = 16;
  bool align_corners = false;

  Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest12) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -190,17 +272,21 @@ TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest12) {
31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
34.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
size_t output_size = 160;
std::vector<float> output(output_size, 0.0);
bool align_corners = false;

ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
Prepare(input_shape, output_shape, input_data, output_data, align_corners, 1);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5 thread_num 2
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest13) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -214,21 +300,21 @@ TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest13) {
31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
34.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
size_t output_size = 160;
std::vector<float> output(output_size, 0.0);

thread_num = 2;
tid = 0;
ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
tid = 1;
ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
bool align_corners = false;

Prepare(input_shape, output_shape, input_data, output_data, align_corners, 2);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}

// 2*2*2*5 -> 2*4*4*5 thread_num 4
TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest14) {
std::vector<float> input = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float input_data[] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0,
28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0};
float output_data[160] = {0};
std::vector<int> input_shape = {2, 2, 2, 5};
std::vector<int> output_shape = {2, 4, 4, 5};
std::vector<float> expect = {
@@ -242,17 +328,12 @@ TEST_F(TestResizeNearestNeighborFp32, ResizeNearestNeighborTest14) {
31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0, 30.0, 31.0, 32.0, 33.0,
34.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 35.0, 36.0, 37.0, 38.0, 39.0};
size_t output_size = 160;
std::vector<float> output(output_size, 0.0);

thread_num = 4;
tid = 0;
ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
tid = 1;
ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
tid = 2;
ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
tid = 3;
ResizeNearestNeighbor(input.data(), output.data(), input_shape.data(), output_shape.data(), tid, thread_num);
CompareOutputData(output.data(), expect.data(), output_size, err_tol);
bool align_corners = false;

Prepare(input_shape, output_shape, input_data, output_data, align_corners, 4);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputData(output_data, expect.data(), output_size, err_tol);
}
} // namespace mindspore

+ 355
- 0
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/reduce_int8_tests.cc View File

@@ -0,0 +1,355 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include "utils/log_adapter.h"
#include "common/common_test.h"
#include "src/ir/tensor.h"
#include "mindspore/lite/src/kernel_registry.h"
#include "nnacl/fp32/reduce.h"

namespace mindspore {
using mindspore::lite::tensor::QuantArg;
using mindspore::lite::tensor::Tensor;
using mindspore::schema::ReduceMode;
using mindspore::schema::ReduceMode_ReduceMax;
using mindspore::schema::ReduceMode_ReduceMean;
using mindspore::schema::ReduceMode_ReduceMin;
using mindspore::schema::ReduceMode_ReduceProd;
using mindspore::schema::ReduceMode_ReduceSum;
using mindspore::schema::ReduceMode_ReduceSumSquare;

// Fixture for int8 quantized Reduce kernels (mean/sum/max/min/...).  Tensors
// wrap caller-owned stack buffers; Prepare() builds the kernel via the registry
// and TearDown() detaches the buffers so the tensors never free them.
class TestReduceInt8 : public mindspore::CommonTest {
 public:
  TestReduceInt8() = default;
  // Configures tensors, quant params and ReduceParameter, then creates kernel_.
  void Prepare(const std::vector<int> &in_shape, const std::vector<int> &out_shape, int8_t *input_data,
               int8_t *output_data, ReduceMode mode, const int *axes, const int num_axes);
  void TearDown() override;

 public:
  int thread_num_ = 1;

  // Must outlive kernel_: the kernel stores a pointer to this parameter struct.
  ReduceParameter param_ = {};
  Tensor in_tensor_;
  Tensor out_tensor_;
  std::vector<Tensor *> inputs{&in_tensor_};
  std::vector<Tensor *> outputs{&out_tensor_};
  kernel::KernelKey desc_ = {kernel::KERNEL_ARCH::kCPU, kNumberTypeInt8, schema::PrimitiveType_Reduce};
  kernel::KernelCreator creator_ = nullptr;
  lite::Context ctx_ = lite::Context();
  kernel::LiteKernel *kernel_ = nullptr;
  // Quantization: input scale 0.005 / zero-point 5, output scale 0.01 / zero-point 1.
  const QuantArg quant_in_ = {0.005f, 5};
  const QuantArg quant_out_ = {0.01f, 1};
  // Default comparison tolerance; individual tests override it as needed.
  float err_tol_ = 0.05;
};

// Detach the caller-owned buffers so tensor destruction does not free stack memory.
void TestReduceInt8::TearDown() {
  in_tensor_.SetData(nullptr);
  out_tensor_.SetData(nullptr);
}

void TestReduceInt8::Prepare(const std::vector<int> &in_shape, const std::vector<int> &out_shape, int8_t *input_data,
int8_t *output_data, ReduceMode mode, const int *axes, const int num_axes) {
in_tensor_.set_data_type(kNumberTypeInt8);
in_tensor_.set_shape(in_shape);
in_tensor_.SetData(input_data);
in_tensor_.AddQuantParam(quant_in_);

out_tensor_.set_data_type(kNumberTypeInt8);
out_tensor_.set_shape(out_shape);
out_tensor_.SetData(output_data);
out_tensor_.AddQuantParam(quant_out_);

param_.mode_ = static_cast<int>(mode);
param_.num_axes_ = num_axes;
memcpy(param_.axes_, axes, num_axes * sizeof(int));

creator_ = lite::KernelRegistry::GetInstance()->GetCreator(desc_);

ctx_.thread_num_ = thread_num_;
kernel_ = creator_(inputs, outputs, reinterpret_cast<OpParameter *>(&param_), &ctx_, desc_, nullptr);
}

// ReduceMean over the channel axis (axis 3) of a 2x4x4x3 NHWC tensor -> 2x4x4x1.
TEST_F(TestReduceInt8, Mean) {
/* 2 4 4 3 NHWC */
int8_t input_data[96] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
int8_t output_data[32] = {0};
int axes[] = {3};
int num_axes = 1;
std::vector<int> input_shape = {2, 4, 4, 3};
std::vector<int> output_shape = {2, 4, 4, 1};
int output_size = 32;
// Golden values requantized to the output scale/zero-point (0.01 / 1).
int8_t correct[] = {-1, 1, 2, 3, 5, 7, 8, 10, 11, 12, 14, 16, 17, 19, 20, 22,
23, 25, 26, 28, 29, 30, 32, 34, 35, 37, 38, 40, 41, 43, 44, 46};

// Two worker threads to exercise the parallel reduce path.
thread_num_ = 2;
Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMean, axes, num_axes);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

// Quantized mean rounds per element; allow up to 0.09375 average deviation.
err_tol_ = 0.09375;
CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

// ReduceMean with num_axes == 0: reduces over all axes, producing a scalar.
TEST_F(TestReduceInt8, MeanAllAxis) {
/* 2*4*4*3 NHWC */
int8_t input_data[96] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
int8_t output_data[1] = {0};
// axes content is ignored when num_axes == 0 (reduce-all).
int axes[] = {0};
int num_axes = 0;
std::vector<int> input_shape = {2, 4, 4, 3};
std::vector<int> output_shape = {1};
int output_size = 1;
int8_t correct[] = {22};
thread_num_ = 2;
Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMean, axes, num_axes);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

// Chained per-axis means accumulate rounding error; allow one quantization step.
err_tol_ = 1.0f;
CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

// ReduceSum over the last axis, addressed with a negative index (-1 == axis 3).
TEST_F(TestReduceInt8, Sum) {
/* 2*4*4*3 NHWC */
int8_t input_data[96] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
int8_t output_data[32] = {0};
int axes[] = {-1};
int num_axes = 1;
std::vector<int> input_shape = {2, 4, 4, 3};
std::vector<int> output_shape = {2, 4, 4, 1};
int output_size = 32;
// Golden values; the final entries saturate at the int8 maximum (127).
int8_t correct[] = {-5, -1, 4, 9, 13, 18, 22, 27, 31, 36, 40, 45, 49, 54, 58, 63,
67, 72, 76, 81, 85, 90, 94, 99, 103, 107, 112, 117, 121, 126, 127, 127};
thread_num_ = 2;
Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSum, axes, num_axes);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

err_tol_ = 0.0625f;
CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

// ReduceSum over all four axes of a constant tensor -> scalar.
// 96 * 4 * 0.005 (input scale, zp 5 removed) requantized at 0.01/zp 1 gives -47.
TEST_F(TestReduceInt8, SumAllAxis) {
/* 2*4*4*3 NHWC */
int8_t input_data[96] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
int8_t output_data[1] = {0};
int axes[] = {0, 1, 2, 3};
int num_axes = 4;
std::vector<int> input_shape = {2, 4, 4, 3};
std::vector<int> output_shape = {1};
int output_size = 1;
int8_t correct[] = {-47};
thread_num_ = 2;
Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSum, axes, num_axes);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

// ReduceMax over the channel axis (axis 3) of a 2x4x4x3 NHWC tensor -> 2x4x4x1.
TEST_F(TestReduceInt8, Max) {
/* 2*4*4*3 NHWC */
int8_t input_data[96] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95};
int8_t output_data[32] = {0};
int axes[] = {3};
int num_axes = 1;
std::vector<int> input_shape = {2, 4, 4, 3};
std::vector<int> output_shape = {2, 4, 4, 1};
int output_size = 32;
// Per-triple maxima requantized to the output scale/zero-point.
int8_t correct[] = {-1, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 21, 22,
24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45, 46};
thread_num_ = 2;
Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMax, axes, num_axes);
auto ret = kernel_->Run();
EXPECT_EQ(0, ret);

CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, MaxAll) {
  /* 2*4*4*3 NHWC */
  // Ramp 0..95 reduced over all four axes; the single expected value is in the
  // output tensor's quantized scale.
  int8_t input_data[96];
  for (int i = 0; i < 96; ++i) {
    input_data[i] = static_cast<int8_t>(i);
  }
  int8_t output_data[1] = {0};
  int axes[] = {0, 1, 2, 3};
  int num_axes = 4;
  std::vector<int> input_shape = {2, 4, 4, 3};
  std::vector<int> output_shape = {1};
  int output_size = 1;
  int8_t correct[] = {46};
  thread_num_ = 2;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMax, axes, num_axes);
  EXPECT_EQ(0, kernel_->Run());

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, Min) {
  /* 2*4*4*3 NHWC */
  // Input is the ramp 0..95; min is taken along the innermost (channel) axis.
  // Expected values are expressed in the output tensor's quantized scale.
  int8_t input_data[96];
  for (int i = 0; i < 96; ++i) {
    input_data[i] = static_cast<int8_t>(i);
  }
  int8_t output_data[32] = {0};
  int axes[] = {3};
  int num_axes = 1;
  std::vector<int> input_shape = {2, 4, 4, 3};
  std::vector<int> output_shape = {2, 4, 4, 1};
  int output_size = 32;
  int8_t correct[] = {-2, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21,
                      23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45};
  thread_num_ = 2;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMin, axes, num_axes);
  EXPECT_EQ(0, kernel_->Run());

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, MinAll) {
  /* 2*4*4*3 NHWC */
  // Ramp 0..95 reduced to a single value. Unlike MaxAll/SumAllAxis, this test
  // passes num_axes == 0 — NOTE(review): presumably this exercises the
  // "no axes given means reduce over all axes" path of the kernel; confirm
  // against the reduce kernel's axis-resolution logic.
  int8_t input_data[96];
  for (int i = 0; i < 96; ++i) {
    input_data[i] = static_cast<int8_t>(i);
  }
  int8_t output_data[1] = {0};
  int axes[] = {0};
  int num_axes = 0;
  std::vector<int> input_shape = {2, 4, 4, 3};
  std::vector<int> output_shape = {1};
  int output_size = 1;
  int8_t correct[] = {-2};
  thread_num_ = 2;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceMin, axes, num_axes);
  EXPECT_EQ(0, kernel_->Run());

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, Prod) {
  /* 2*4*4*3 NHWC */
  // Every element is the constant 105; product is taken along the innermost
  // (channel) axis. Expected values are in the output tensor's quantized scale.
  int8_t input_data[96];
  for (size_t i = 0; i < sizeof(input_data); ++i) {
    input_data[i] = 105;
  }
  int8_t output_data[32] = {0};
  int axes[] = {3};
  int num_axes = 1;
  std::vector<int> input_shape = {2, 4, 4, 3};
  std::vector<int> output_shape = {2, 4, 4, 1};
  int output_size = 32;
  int8_t correct[] = {
    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
  };
  thread_num_ = 2;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceProd, axes, num_axes);
  EXPECT_EQ(0, kernel_->Run());

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, Prod2Axis) {
  /* 1*2*2*3 NHWC */
  // Product reduced over two axes (W and C) at once, collapsing {1,2,2,3} to
  // {1,2}. Expected values are in the output tensor's quantized scale.
  int8_t input_data[12] = {105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105};
  // Buffer is larger than the 2 values actually produced.
  int8_t output_data[8] = {0};
  int axes[] = {2, 3};
  int num_axes = 2;
  std::vector<int> input_shape = {1, 2, 2, 3};
  std::vector<int> output_shape = {1, 2};
  int output_size = 2;
  int8_t correct[] = {3, 3};
  thread_num_ = 1;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceProd, axes, num_axes);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, SumSquare) {
  /* 2*4*4*3 NHWC */
  // Input is the ramp 0..95; sum-of-squares is taken along the innermost
  // (channel) axis. Expected values are in the output tensor's quantized scale.
  int8_t input_data[96];
  for (int i = 0; i < 96; ++i) {
    input_data[i] = static_cast<int8_t>(i);
  }
  int8_t output_data[32] = {0};
  int axes[] = {3};
  int num_axes = 1;
  std::vector<int> input_shape = {2, 4, 4, 3};
  std::vector<int> output_shape = {2, 4, 4, 1};
  int output_size = 32;
  int8_t correct[] = {1, 1, 1, 1, 1, 2, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14,
                      16, 18, 20, 22, 25, 27, 30, 33, 36, 39, 42, 45, 49, 53, 56, 60};
  thread_num_ = 1;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSumSquare, axes, num_axes);
  EXPECT_EQ(0, kernel_->Run());

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

TEST_F(TestReduceInt8, SumSquare2Axis) {
  /* 1*2*2*3 NHWC */
  // Sum-of-squares reduced over two axes given in descending order {3, 2},
  // collapsing {1,2,2,3} to {1,2}. Expected values are in the output tensor's
  // quantized scale.
  int8_t input_data[12] = {105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105};
  // Buffer is larger than the 2 values actually produced.
  int8_t output_data[8] = {0};
  int axes[] = {3, 2};
  int num_axes = 2;
  std::vector<int> input_shape = {1, 2, 2, 3};
  std::vector<int> output_shape = {1, 2};
  int output_size = 2;
  int8_t correct[] = {114, 114};
  thread_num_ = 1;
  Prepare(input_shape, output_shape, input_data, output_data, ReduceMode_ReduceSumSquare, axes, num_axes);
  auto ret = kernel_->Run();
  EXPECT_EQ(0, ret);

  CompareOutputInt8(output_data, correct, output_size, err_tol_);
}

} // namespace mindspore

+ 2
- 4
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_bilinear_int8_tests.cc View File

@@ -18,10 +18,8 @@
#include "include/context.h"
#include "src/ir/tensor.h"
#include "common/common_test.h"
#include "src/common/file_utils.h"
#include "mindspore/lite/src/kernel_registry.h"
#include "src/runtime/kernel/arm/nnacl/int8/resize.h"
#include "src/runtime/kernel/arm/int8/resize_int8.h"
#include "nnacl/int8/resize.h"

namespace mindspore {
using mindspore::lite::tensor::QuantArg;
@@ -92,7 +90,7 @@ TEST_F(TestResizeBilinearInt8, Bilinear0) {
int8_t expect[16] = {4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6};

Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, align_corners, thread_num);
kernel_->Init();
kernel_->Init(); // todo delete
kernel_->Run();

CompareOutputInt8(output_data, expect, 16, err_percent_);


+ 2
- 2
mindspore/lite/test/ut/src/runtime/kernel/arm/int8/resize_nearest_neighbor_int8_tests.cc View File

@@ -19,7 +19,7 @@
#include "src/ir/tensor.h"
#include "common/common_test.h"
#include "mindspore/lite/src/kernel_registry.h"
#include "src/runtime/kernel/arm/nnacl/int8/resize.h"
#include "nnacl/int8/resize.h"

namespace mindspore {
using mindspore::lite::tensor::QuantArg;
@@ -92,7 +92,7 @@ TEST_F(TestResizeNearestNeighborInt8, NearestNeighbor0) {
err_percent_ = 0.25f;

Prepare(in_shape, out_shape, input_data, output_data, quant_in, quant_out, false, thread_num);
kernel_->Init();
kernel_->Init(); // todo delete
kernel_->Run();

CompareOutputInt8(output_data, expect, 16, err_percent_);


Loading…
Cancel
Save