Merge pull request !5878 from zhaozhenlong/lite/issue/reduce_int
@@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co
 int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
 int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
 int offset4d(const int *shape, const int *dims);
+inline bool isAddOverflow(int32_t x, int32_t y) {
+  int32_t sum = x + y;
+  return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
+}
+
+inline bool isMulOverflow(int32_t x, int32_t y) {
+  int32_t p = x * y;
+  return (x != 0) && (p / x != y);
+}
 
 #ifdef ENABLE_ARM64
 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
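The two helpers flag 32-bit wrap-around: isAddOverflow infers overflow from the sign pattern of the already-computed sum, and isMulOverflow divides the product back out. They detect overflow after the fact, so they assume two's-complement wrap-around (signed overflow is formally undefined behavior in C, so a compiler flag like -fwrapv makes them reliable under aggressive optimization). A minimal standalone check, not part of the patch:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same logic as the patch; relies on two's-complement wrap-around. */
static inline bool isMulOverflow(int32_t x, int32_t y) {
  int32_t p = x * y;
  return (x != 0) && (p / x != y);
}

int main(void) {
  printf("%d\n", isMulOverflow(1 << 30, 4));   /* 1: 2^32 wraps to 0 */
  printf("%d\n", isMulOverflow(100, 100));     /* 0: 10000 fits */
  printf("%d\n", isMulOverflow(INT32_MAX, 1)); /* 0: exact fit */
  return 0;
}
```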
@@ -17,6 +17,7 @@
 #include <float.h>
 #include "nnacl/fp32/reduce.h"
 #include "nnacl/errorcode.h"
+#include "nnacl/common_func.h"
 
 int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
                const int tid, const int thread_num) {
@@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size,
     }
   }
   return NNACL_OK;
 }
+
+int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
+                  const int tid, const int thread_num) {
+  if (src_data == NULL || dst_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  int i, j, k;
+  for (j = tid; j < outer_size; j += thread_num) {
+    const int *outer_src = src_data + j * axis_size * inner_size;
+    int *outer_dst = dst_data + j * inner_size;
+    for (k = 0; k < inner_size; k++) {
+      const int *inner_src = outer_src + k;
+      int *inner_dst = outer_dst + k;
+      int tmp = 1;
+      for (i = 0; i < axis_size; i++) {
+        if (isMulOverflow(tmp, inner_src[i * inner_size])) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        tmp *= inner_src[i * inner_size];
+      }
+      *inner_dst = tmp;
+    }
+  }
+  return NNACL_OK;
+}
 int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                     float *dst_data, const int tid, const int thread_num) {
   if (src_data == NULL || dst_data == NULL) {
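IntReduceProd walks the tensor as outer_size x axis_size x inner_size: for each (outer, inner) pair it multiplies axis_size elements strided by inner_size, bailing out on the first overflowing step. A hypothetical single-threaded driver (tid 0 of 1), assuming the declaration above is visible, reducing the middle axis of a 2x3x2 int tensor:

```c
#include <stdio.h>
#include "nnacl/fp32/reduce.h"

int main(void) {
  /* Shape 2x3x2 reduced over the middle axis: outer_size = 2, axis_size = 3,
   * inner_size = 2, so dst holds 2x2 elements. */
  int src[12] = {1, 2, 3, 4, 5, 6,   /* outer 0: strided columns (1,3,5), (2,4,6) */
                 1, 1, 2, 2, 3, 3};  /* outer 1: strided columns (1,2,3), (1,2,3) */
  int dst[4] = {0};
  int ret = IntReduceProd(2, 2, 3, src, dst, 0, 1);
  /* Expect ret == NNACL_OK and dst == {15, 48, 6, 6};
   * NNACL_ERRCODE_MUL_OVERFLOW would signal a wrapped product. */
  printf("ret=%d dst=%d %d %d %d\n", ret, dst[0], dst[1], dst[2], dst[3]);
  return 0;
}
```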
@@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c
                const int tid, const int thread_num);
 int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
                const int tid, const int thread_num);
+int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
+                  const int tid, const int thread_num);
 int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                     float *dst_data, const int tid, const int thread_num);
 #ifdef __cplusplus
@@ -18,16 +18,7 @@
 #include "nnacl/int8/reduce_int8.h"
 #include "nnacl/errorcode.h"
 #include "nnacl/quantization/fixed_point.h"
-
-inline bool isAddOverflow(int32_t x, int32_t y) {
-  int32_t sum = x + y;
-  return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
-}
-
-inline bool isMulOverflow(int32_t x, int32_t y) {
-  int32_t p = x * y;
-  return (x != 0) && (p / x != y);
-}
+#include "nnacl/common_func.h"
 
 // Get x such that (x-zp_in) * scale_in = mean
 // Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
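Solving the relation in that comment for x gives

    x = mean / scale_in + zp_in

so each of the first n-1 passes presumably re-encodes its partial mean in the input quantization domain, letting the next pass reuse the same quant parameters; only the final pass rescales into the output domain.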
@@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis
       RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
                             (tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)),
                             quant->in_out_multiplier_),
-        quant->in_out_right_shift_ + base_offset);
+                          quant->in_out_right_shift_ + base_offset);
       if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
         return NNACL_ERRCODE_ADD_OVERFLOW;
       }
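SaturatingRoundingDoublingHighMul and RoundingDivideByPOT come from nnacl/quantization/fixed_point.h and follow the usual gemmlowp semantics: the zero-point-corrected value is pre-shifted left, multiplied by a Q31 fixed-point multiplier keeping the rounded high 32 bits, then rounding-shifted right; the caller finally adds the output zero point (the add the patch guards). A simplified reference sketch, with shortened names of my own (SRDHM, RDivByPOT, Requantize are not nnacl identifiers):

```c
#include <stdint.h>

/* SaturatingRoundingDoublingHighMul: high 32 bits of 2*a*b, rounded,
 * saturating the single overflow case INT32_MIN * INT32_MIN. */
int32_t SRDHM(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* RoundingDivideByPOT: arithmetic right shift with round-to-nearest. */
int32_t RDivByPOT(int32_t x, int exponent) {
  int32_t mask = ((int32_t)1 << exponent) - 1;
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* The requantization pipeline used above; the output zero point is added
 * by the caller, guarded with isAddOverflow in the patch. */
int32_t Requantize(int32_t v, int32_t in_zp, int32_t multiplier,
                   int left_shift, int right_shift) {
  return RDivByPOT(SRDHM((v - in_zp) * (1 << left_shift), multiplier), right_shift);
}
```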
@@ -53,6 +53,7 @@
 typedef enum LiteDataType {
   kDataTypeFloat,
   kDataTypeInt,
+  kDataTypeInt8,
 } LiteDataType;
@@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *>
 }
 
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator)
@@ -64,6 +64,7 @@ int ReduceCPUKernel::Init() {
     }
     case static_cast<int>(ReduceMode_ReduceProd): {
       reducer_ = ReduceProd;
+      int_reducer_ = IntReduceProd;
       break;
     }
     case static_cast<int>(ReduceMode_ReduceSumSquare): {
@@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() {
   return ReSize();
 }
 
-int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); }
+int ReduceCPUKernel::ReSize() {
+  if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) {
+    data_type_ = kDataTypeFloat;
+  } else {
+    data_type_ = kDataTypeInt;
+  }
+  return ReduceBaseCPUKernel::ReSize();
+}
 
 int ReduceCPUKernel::CallReduceUnit(int task_id) {
-  auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_);
+  int ret;
+  if (data_type_ == kDataTypeFloat) {
+    ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_),
+                   static_cast<float *>(dst_data_), task_id, context_->thread_num_);
+  } else {
+    ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_),
+                       static_cast<int *>(dst_data_), task_id, context_->thread_num_);
+  }
   return ret;
 }
@@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() {
     return ret;
   }
-  src_data_ = static_cast<float *>(in_tensors_.at(0)->MutableData());
+  src_data_ = in_tensors_.at(0)->MutableData();
   for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) {
     if (i != static_cast<size_t>(num_axes_ - 1)) {
       dst_data_ = data_buffers_[i];
     } else {
-      dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
+      dst_data_ = out_tensors_.at(0)->MutableData();
     }
     outer_size_ = outer_sizes_[i];
     inner_size_ = inner_sizes_[i];
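Run() realizes an n-axis reduce as n single-axis passes: pass i reads from the previous destination and writes data_buffers_[i], except the last pass, which writes the output tensor. The rotation step itself is not visible in this hunk; a hypothetical sketch of the flow (names loosely follow the patch):

```c
#include <stddef.h>

/* Chain num_axes single-axis passes through the temp buffers. */
static void ChainReduce(const void *input, void *output, void **data_buffers, int num_axes) {
  const void *src = input;
  for (int i = 0; i < num_axes; ++i) {
    void *dst = (i != num_axes - 1) ? data_buffers[i] : output;
    /* ... one parallel pass: reducer(outer, inner, axis, src, dst, tid, threads) ... */
    src = dst; /* the next pass consumes this pass's result */
  }
  (void)src;
}
```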
@@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() {
 int ReduceCPUKernel::MallocTmpBuffer() {
   data_buffers_.clear();
   for (auto size : buffer_sizes_) {
-    float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float)));
+    void *buffer;
+    if (data_type_ == kDataTypeFloat) {
+      buffer = context_->allocator->Malloc(size * sizeof(float));
+    } else {
+      buffer = context_->allocator->Malloc(size * sizeof(int));
+    }
     if (buffer == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed.";
       return RET_ERROR;
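On the ILP32/LP64 targets MindSpore Lite typically builds for, sizeof(int) == sizeof(float) == 4, so both branches request the same byte count; the branch mostly documents intent and stays correct if a differently sized element type joins LiteDataType later. The assumption can be pinned down at compile time:

```c
/* Compile-time check of the 4-byte assumption (C11). */
_Static_assert(sizeof(int) == 4 && sizeof(float) == 4,
               "reduce tmp buffers assume 4-byte int and float");
```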
@@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() {
 }
 
 void ReduceCPUKernel::FreeTmpBuffer() {
-  for (size_t i = 0; i < data_buffers_.size(); i++) {
-    float *buffer = data_buffers_[i];
+  for (auto buffer : data_buffers_) {
     if (buffer != nullptr) {
       context_->allocator->Free(buffer);
       buffer = nullptr;
@@ -29,6 +29,8 @@ namespace mindspore::kernel {
 class ReduceCPUKernel : public ReduceBaseCPUKernel {
   typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                          float *dst_data, const int tid, const int thread_num);
+  typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data,
+                            int *dst_data, const int tid, const int thread_num);
 
  public:
   ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
                   const mindspore::lite::PrimitiveC *primitive)
       : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
   ~ReduceCPUKernel() {
     FreeTmpBuffer();
     src_data_ = nullptr;
     dst_data_ = nullptr;
     reducer_ = nullptr;
+    int_reducer_ = nullptr;
   }
 
   int Init() override;
@@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
  private:
   Reducer reducer_ = nullptr;
-  std::vector<float *> data_buffers_;
-  const float *src_data_ = nullptr;
-  float *dst_data_ = nullptr;
+  IntReducer int_reducer_ = nullptr;
+  std::vector<void *> data_buffers_;
+  LiteDataType data_type_;
+  const void *src_data_ = nullptr;
+  void *dst_data_ = nullptr;
 
  private:
   int MallocTmpBuffer();