diff --git a/mindspore/lite/nnacl/common_func.h b/mindspore/lite/nnacl/common_func.h index 9528d83055..4a69715571 100644 --- a/mindspore/lite/nnacl/common_func.h +++ b/mindspore/lite/nnacl/common_func.h @@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3); int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2); int offset4d(const int *shape, const int *dims); +inline bool isAddOverflow(int32_t x, int32_t y) { + int32_t sum = x + y; + return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0); +} + +inline bool isMulOverflow(int32_t x, int32_t y) { + int32_t p = x * y; + return (x != 0) && (p / x != y); +} #ifdef ENABLE_ARM64 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size); diff --git a/mindspore/lite/nnacl/fp32/reduce.c b/mindspore/lite/nnacl/fp32/reduce.c index fdda3f5da1..ce78faf7a4 100644 --- a/mindspore/lite/nnacl/fp32/reduce.c +++ b/mindspore/lite/nnacl/fp32/reduce.c @@ -17,6 +17,7 @@ #include <float.h> #include "nnacl/fp32/reduce.h" #include "nnacl/errorcode.h" +#include "nnacl/common_func.h" int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, const int tid, const int thread_num) { @@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size, } return NNACL_OK; } + +int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data, + const int tid, const int thread_num) { + if (src_data == NULL || dst_data == NULL) { + return NNACL_NULL_PTR; + } + int i, j, k; + for (j = tid; j < outer_size; j += thread_num) { + const int *outer_src = src_data + j * axis_size * inner_size; + int *outer_dst = dst_data + j * inner_size; + for (k = 0; k < inner_size; k++) { + const int *inner_src = outer_src + k; + int *inner_dst = 
outer_dst + k; + int tmp = 1; + for (i = 0; i < axis_size; i++) { + if (isMulOverflow(tmp, inner_src[i * inner_size])) { + return NNACL_ERRCODE_MUL_OVERFLOW; + } + tmp *= inner_src[i * inner_size]; + } + *inner_dst = tmp; + } + } + return NNACL_OK; +} int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, const int tid, const int thread_num) { if (src_data == NULL || dst_data == NULL) { diff --git a/mindspore/lite/nnacl/fp32/reduce.h b/mindspore/lite/nnacl/fp32/reduce.h index 78fa15c135..9c87f6392d 100644 --- a/mindspore/lite/nnacl/fp32/reduce.h +++ b/mindspore/lite/nnacl/fp32/reduce.h @@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c const int tid, const int thread_num); int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, const int tid, const int thread_num); +int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data, + const int tid, const int thread_num); int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, const int tid, const int thread_num); #ifdef __cplusplus diff --git a/mindspore/lite/nnacl/int8/reduce_int8.c b/mindspore/lite/nnacl/int8/reduce_int8.c index 1893a7ef0e..80553f8fe8 100644 --- a/mindspore/lite/nnacl/int8/reduce_int8.c +++ b/mindspore/lite/nnacl/int8/reduce_int8.c @@ -18,16 +18,7 @@ #include "nnacl/int8/reduce_int8.h" #include "nnacl/errorcode.h" #include "nnacl/quantization/fixed_point.h" - -inline bool isAddOverflow(int32_t x, int32_t y) { - int32_t sum = x + y; - return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0); -} - -inline bool isMulOverflow(int32_t x, int32_t y) { - int32_t p = x * y; - return (x != 0) && (p / x != y); -} +#include "nnacl/common_func.h" // Get x such that (x-zp_in) * scale_in = mean // 
Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce. @@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( (tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)), quant->in_out_multiplier_), - quant->in_out_right_shift_ + base_offset); + quant->in_out_right_shift_ + base_offset); if (isAddOverflow(tmp_scaled, quant->out_zp_)) { return NNACL_ERRCODE_ADD_OVERFLOW; } diff --git a/mindspore/lite/nnacl/op_base.h b/mindspore/lite/nnacl/op_base.h index b080dfd334..5262765759 100644 --- a/mindspore/lite/nnacl/op_base.h +++ b/mindspore/lite/nnacl/op_base.h @@ -53,6 +53,7 @@ typedef enum LiteDataType { kDataTypeFloat, + kDataTypeInt, kDataTypeInt8, } LiteDataType; diff --git a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc index 5731acc901..3008492cdf 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc @@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *> } REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator) +REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator) +REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator) REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator) REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator) REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc index f98351e36b..08100211b6 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc @@ -64,6 
+64,7 @@ int ReduceCPUKernel::Init() { } case static_cast<int>(ReduceMode_ReduceProd): { reducer_ = ReduceProd; + int_reducer_ = IntReduceProd; break; } case static_cast<int>(ReduceMode_ReduceSumSquare): { @@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() { return ReSize(); } -int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); } +int ReduceCPUKernel::ReSize() { + if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) { + data_type_ = kDataTypeFloat; + } else { + data_type_ = kDataTypeInt; + } + return ReduceBaseCPUKernel::ReSize(); +} int ReduceCPUKernel::CallReduceUnit(int task_id) { - auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_); + int ret; + if (data_type_ == kDataTypeFloat) { + ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_), + static_cast<float *>(dst_data_), task_id, context_->thread_num_); + } else { + ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_), + static_cast<int *>(dst_data_), task_id, context_->thread_num_); + } + return ret; } @@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() { return ret; } - src_data_ = static_cast<const float *>(in_tensors_.at(0)->MutableData()); + src_data_ = in_tensors_.at(0)->MutableData(); for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) { if (i != static_cast<size_t>(num_axes_ - 1)) { dst_data_ = data_buffers_[i]; } else { - dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData()); + dst_data_ = out_tensors_.at(0)->MutableData(); } outer_size_ = outer_sizes_[i]; inner_size_ = inner_sizes_[i]; @@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() { int ReduceCPUKernel::MallocTmpBuffer() { data_buffers_.clear(); for (auto size : buffer_sizes_) { - float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float))); + void *buffer; + if (data_type_ == kDataTypeFloat) { + buffer = context_->allocator->Malloc(size * sizeof(float)); + } else { + buffer = context_->allocator->Malloc(size * sizeof(int)); + } if 
(buffer == nullptr) { MS_LOG(ERROR) << "Malloc data failed."; return RET_ERROR; @@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() { } void ReduceCPUKernel::FreeTmpBuffer() { - for (size_t i = 0; i < data_buffers_.size(); i++) { - float *buffer = data_buffers_[i]; + for (auto buffer : data_buffers_) { if (buffer != nullptr) { context_->allocator->Free(buffer); buffer = nullptr; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h index 6cf5856d61..c983966590 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h @@ -29,6 +29,8 @@ namespace mindspore::kernel { class ReduceCPUKernel : public ReduceBaseCPUKernel { typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data, const int tid, const int thread_num); + typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data, + int *dst_data, const int tid, const int thread_num); public: ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, @@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { const mindspore::lite::PrimitiveC *primitive) : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {} ~ReduceCPUKernel() { - FreeTmpBuffer(); src_data_ = nullptr; dst_data_ = nullptr; + reducer_ = nullptr; + int_reducer_ = nullptr; } int Init() override; @@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { private: Reducer reducer_ = nullptr; - std::vector<float *> data_buffers_; - const float *src_data_ = nullptr; - float *dst_data_ = nullptr; + IntReducer int_reducer_ = nullptr; + std::vector<void *> data_buffers_; + LiteDataType data_type_; + + const void *src_data_ = nullptr; + void *dst_data_ = nullptr; private: int MallocTmpBuffer();