Merge pull request !5878 from zhaozhenlong/lite/issue/reduce_int
@@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co
 int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
 int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
 int offset4d(const int *shape, const int *dims);
+inline bool isAddOverflow(int32_t x, int32_t y) {
+  int32_t sum = x + y;
+  return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
+}
+
+inline bool isMulOverflow(int32_t x, int32_t y) {
+  int32_t p = x * y;
+  return (x != 0) && (p / x != y);
+}
 
 #ifdef ENABLE_ARM64
 void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
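The two helpers flag 32-bit wrap-around: isAddOverflow infers overflow from the sign pattern of the already-computed sum, and isMulOverflow divides the product back out. They detect overflow after the fact, so they assume two's-complement wrap-around (signed overflow is formally undefined behavior in C, so a compiler flag like -fwrapv makes them reliable under aggressive optimization). A minimal standalone check, not part of the patch:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same logic as the patch; relies on two's-complement wrap-around. */
static inline bool isMulOverflow(int32_t x, int32_t y) {
  int32_t p = x * y;
  return (x != 0) && (p / x != y);
}

int main(void) {
  printf("%d\n", isMulOverflow(1 << 30, 4));   /* 1: 2^32 wraps to 0 */
  printf("%d\n", isMulOverflow(100, 100));     /* 0: 10000 fits */
  printf("%d\n", isMulOverflow(INT32_MAX, 1)); /* 0: exact fit */
  return 0;
}
```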
@@ -17,6 +17,7 @@
 #include <float.h>
 #include "nnacl/fp32/reduce.h"
 #include "nnacl/errorcode.h"
+#include "nnacl/common_func.h"
 
 int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
                const int tid, const int thread_num) {
@@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size,
     }
   }
   return NNACL_OK;
 }
+
+int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
+                  const int tid, const int thread_num) {
+  if (src_data == NULL || dst_data == NULL) {
+    return NNACL_NULL_PTR;
+  }
+  int i, j, k;
+  for (j = tid; j < outer_size; j += thread_num) {
+    const int *outer_src = src_data + j * axis_size * inner_size;
+    int *outer_dst = dst_data + j * inner_size;
+    for (k = 0; k < inner_size; k++) {
+      const int *inner_src = outer_src + k;
+      int *inner_dst = outer_dst + k;
+      int tmp = 1;
+      for (i = 0; i < axis_size; i++) {
+        if (isMulOverflow(tmp, inner_src[i * inner_size])) {
+          return NNACL_ERRCODE_MUL_OVERFLOW;
+        }
+        tmp *= inner_src[i * inner_size];
+      }
+      *inner_dst = tmp;
+    }
+  }
+  return NNACL_OK;
+}
 int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                     float *dst_data, const int tid, const int thread_num) {
   if (src_data == NULL || dst_data == NULL) {
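IntReduceProd walks the tensor as outer_size x axis_size x inner_size: for each (outer, inner) pair it multiplies axis_size elements strided by inner_size, bailing out on the first overflowing step. A hypothetical single-threaded driver (tid 0 of 1), assuming the declaration above is visible, reducing the middle axis of a 2x3x2 int tensor:

```c
#include <stdio.h>
#include "nnacl/fp32/reduce.h"

int main(void) {
  /* Shape 2x3x2 reduced over the middle axis: outer_size = 2, axis_size = 3,
   * inner_size = 2, so dst holds 2x2 elements. */
  int src[12] = {1, 2, 3, 4, 5, 6,   /* outer 0: strided columns (1,3,5), (2,4,6) */
                 1, 1, 2, 2, 3, 3};  /* outer 1: strided columns (1,2,3), (1,2,3) */
  int dst[4] = {0};
  int ret = IntReduceProd(2, 2, 3, src, dst, 0, 1);
  /* Expect ret == NNACL_OK and dst == {15, 48, 6, 6};
   * NNACL_ERRCODE_MUL_OVERFLOW would signal a wrapped product. */
  printf("ret=%d dst=%d %d %d %d\n", ret, dst[0], dst[1], dst[2], dst[3]);
  return 0;
}
```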
@@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c
                const int tid, const int thread_num);
 int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
                const int tid, const int thread_num);
+int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
+                  const int tid, const int thread_num);
 int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                     float *dst_data, const int tid, const int thread_num);
 #ifdef __cplusplus
@@ -18,16 +18,7 @@
 #include "nnacl/int8/reduce_int8.h"
 #include "nnacl/errorcode.h"
 #include "nnacl/quantization/fixed_point.h"
-
-inline bool isAddOverflow(int32_t x, int32_t y) {
-  int32_t sum = x + y;
-  return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
-}
-
-inline bool isMulOverflow(int32_t x, int32_t y) {
-  int32_t p = x * y;
-  return (x != 0) && (p / x != y);
-}
+#include "nnacl/common_func.h"
 
 // Get x such that (x-zp_in) * scale_in = mean
 // Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
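Solving the relation in that comment for x gives

    x = mean / scale_in + zp_in

so each of the first n-1 passes presumably re-encodes its partial mean in the input quantization domain, letting the next pass reuse the same quant parameters; only the final pass rescales into the output domain.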
@@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis
       RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
                             (tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)),
                             quant->in_out_multiplier_),
-        quant->in_out_right_shift_ + base_offset);
+                          quant->in_out_right_shift_ + base_offset);
       if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
         return NNACL_ERRCODE_ADD_OVERFLOW;
       }
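SaturatingRoundingDoublingHighMul and RoundingDivideByPOT come from nnacl/quantization/fixed_point.h and follow the usual gemmlowp semantics: the zero-point-corrected value is pre-shifted left, multiplied by a Q31 fixed-point multiplier keeping the rounded high 32 bits, then rounding-shifted right; the caller finally adds the output zero point (the add the patch guards). A simplified reference sketch, with shortened names of my own (SRDHM, RDivByPOT, Requantize are not nnacl identifiers):

```c
#include <stdint.h>

/* SaturatingRoundingDoublingHighMul: high 32 bits of 2*a*b, rounded,
 * saturating the single overflow case INT32_MIN * INT32_MIN. */
int32_t SRDHM(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
  int64_t ab = (int64_t)a * (int64_t)b;
  int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((ab + nudge) / (1ll << 31));
}

/* RoundingDivideByPOT: arithmetic right shift with round-to-nearest. */
int32_t RDivByPOT(int32_t x, int exponent) {
  int32_t mask = ((int32_t)1 << exponent) - 1;
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* The requantization pipeline used above; the output zero point is added
 * by the caller, guarded with isAddOverflow in the patch. */
int32_t Requantize(int32_t v, int32_t in_zp, int32_t multiplier,
                   int left_shift, int right_shift) {
  return RDivByPOT(SRDHM((v - in_zp) * (1 << left_shift), multiplier), right_shift);
}
```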
@@ -53,6 +53,7 @@
 typedef enum LiteDataType {
   kDataTypeFloat,
   kDataTypeInt,
+  kDataTypeInt8,
 } LiteDataType;
@@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *>
 }
 
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
+REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator)
@@ -64,6 +64,7 @@ int ReduceCPUKernel::Init() {
     }
     case static_cast<int>(ReduceMode_ReduceProd): {
       reducer_ = ReduceProd;
+      int_reducer_ = IntReduceProd;
       break;
     }
     case static_cast<int>(ReduceMode_ReduceSumSquare): {
@@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() {
   return ReSize();
 }
 
-int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); }
+int ReduceCPUKernel::ReSize() {
+  if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) {
+    data_type_ = kDataTypeFloat;
+  } else {
+    data_type_ = kDataTypeInt;
+  }
+  return ReduceBaseCPUKernel::ReSize();
+}
 
 int ReduceCPUKernel::CallReduceUnit(int task_id) {
-  auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_);
+  int ret;
+  if (data_type_ == kDataTypeFloat) {
+    ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_),
+                   static_cast<float *>(dst_data_), task_id, context_->thread_num_);
+  } else {
+    ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_),
+                       static_cast<int *>(dst_data_), task_id, context_->thread_num_);
+  }
   return ret;
 }
@@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() {
     return ret;
   }
-  src_data_ = static_cast<float *>(in_tensors_.at(0)->MutableData());
+  src_data_ = in_tensors_.at(0)->MutableData();
   for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) {
     if (i != static_cast<size_t>(num_axes_ - 1)) {
       dst_data_ = data_buffers_[i];
     } else {
-      dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
+      dst_data_ = out_tensors_.at(0)->MutableData();
     }
     outer_size_ = outer_sizes_[i];
     inner_size_ = inner_sizes_[i];
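Run() realizes an n-axis reduce as n single-axis passes: pass i reads from the previous destination and writes data_buffers_[i], except the last pass, which writes the output tensor. The rotation step itself is not visible in this hunk; a hypothetical sketch of the flow (names loosely follow the patch):

```c
#include <stddef.h>

/* Chain num_axes single-axis passes through the temp buffers. */
static void ChainReduce(const void *input, void *output, void **data_buffers, int num_axes) {
  const void *src = input;
  for (int i = 0; i < num_axes; ++i) {
    void *dst = (i != num_axes - 1) ? data_buffers[i] : output;
    /* ... one parallel pass: reducer(outer, inner, axis, src, dst, tid, threads) ... */
    src = dst; /* the next pass consumes this pass's result */
  }
  (void)src;
}
```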
@@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() {
 int ReduceCPUKernel::MallocTmpBuffer() {
   data_buffers_.clear();
   for (auto size : buffer_sizes_) {
-    float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float)));
+    void *buffer;
+    if (data_type_ == kDataTypeFloat) {
+      buffer = context_->allocator->Malloc(size * sizeof(float));
+    } else {
+      buffer = context_->allocator->Malloc(size * sizeof(int));
+    }
     if (buffer == nullptr) {
       MS_LOG(ERROR) << "Malloc data failed.";
       return RET_ERROR;
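On the ILP32/LP64 targets MindSpore Lite typically builds for, sizeof(int) == sizeof(float) == 4, so both branches request the same byte count; the branch mostly documents intent and stays correct if a differently sized element type joins LiteDataType later. The assumption can be pinned down at compile time:

```c
/* Compile-time check of the 4-byte assumption (C11). */
_Static_assert(sizeof(int) == 4 && sizeof(float) == 4,
               "reduce tmp buffers assume 4-byte int and float");
```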
@@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() {
 }
 
 void ReduceCPUKernel::FreeTmpBuffer() {
-  for (size_t i = 0; i < data_buffers_.size(); i++) {
-    float *buffer = data_buffers_[i];
+  for (auto buffer : data_buffers_) {
     if (buffer != nullptr) {
       context_->allocator->Free(buffer);
       buffer = nullptr;
@@ -29,6 +29,8 @@ namespace mindspore::kernel {
 class ReduceCPUKernel : public ReduceBaseCPUKernel {
   typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
                          float *dst_data, const int tid, const int thread_num);
+  typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data,
+                            int *dst_data, const int tid, const int thread_num);
 
  public:
   ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
                   const mindspore::lite::PrimitiveC *primitive)
       : ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
   ~ReduceCPUKernel() {
     FreeTmpBuffer();
     src_data_ = nullptr;
     dst_data_ = nullptr;
     reducer_ = nullptr;
+    int_reducer_ = nullptr;
   }
 
   int Init() override;
@@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
  private:
   Reducer reducer_ = nullptr;
-  std::vector<float *> data_buffers_;
-  const float *src_data_ = nullptr;
-  float *dst_data_ = nullptr;
+  IntReducer int_reducer_ = nullptr;
+  std::vector<void *> data_buffers_;
+  LiteDataType data_type_;
+  const void *src_data_ = nullptr;
+  void *dst_data_ = nullptr;
 
  private:
   int MallocTmpBuffer();