
!5878 reduce prod support int

Merge pull request !5878 from zhaozhenlong/lite/issue/reduce_int
tags/v1.0.0
mindspore-ci-bot committed 5 years ago
commit 8200410f20
8 changed files with 79 additions and 22 deletions

  1. mindspore/lite/nnacl/common_func.h (+9, -0)
  2. mindspore/lite/nnacl/fp32/reduce.c (+26, -0)
  3. mindspore/lite/nnacl/fp32/reduce.h (+2, -0)
  4. mindspore/lite/nnacl/int8/reduce_int8.c (+2, -11)
  5. mindspore/lite/nnacl/op_base.h (+1, -0)
  6. mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc (+2, -0)
  7. mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc (+27, -7)
  8. mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h (+10, -4)

mindspore/lite/nnacl/common_func.h (+9, -0)

@@ -46,6 +46,15 @@ void IndirectGemmFp32(float *output, const float *input, const float *weight, co
int offset(const int *shape, const int dim0, const int dim1, const int dim2, const int dim3);
int offsetComm(const int *shape, const int dim0, const int dim1, const int dim2);
int offset4d(const int *shape, const int *dims);
inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;
return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
}

inline bool isMulOverflow(int32_t x, int32_t y) {
int32_t p = x * y;
return (x != 0) && (p / x != y);
}

#ifdef ENABLE_ARM64
void BiasAdd(const float *bias, float *data, size_t oc4, size_t plan_size);
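
The two helpers moved into this header guard signed 32-bit arithmetic: isAddOverflow flags a wrapped sum whose sign disagrees with its same-signed operands, and isMulOverflow divides the product back to verify it. Below is a minimal standalone sketch of the check-then-accumulate pattern the reduce kernels use with them; the factors array and main driver are illustrative only, and the helpers (like the originals) rely on two's-complement wraparound of the intermediate result.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static inline bool isAddOverflow(int32_t x, int32_t y) {
      int32_t sum = x + y;
      return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
    }

    static inline bool isMulOverflow(int32_t x, int32_t y) {
      int32_t p = x * y;
      return (x != 0) && (p / x != y);
    }

    int main(void) {
      /* Check every multiplication and bail out on overflow instead of keeping a
       * wrapped product -- the same pattern IntReduceProd follows when it returns
       * NNACL_ERRCODE_MUL_OVERFLOW. */
      const int32_t factors[] = {100000, 50000, 3}; /* 100000 * 50000 exceeds INT32_MAX */
      int32_t acc = 1;
      for (int i = 0; i < 3; ++i) {
        if (isMulOverflow(acc, factors[i])) {
          printf("overflow detected at factor %d\n", i);
          return 1;
        }
        acc *= factors[i];
      }
      printf("product = %d\n", (int)acc);
      return 0;
    }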


mindspore/lite/nnacl/fp32/reduce.c (+26, -0)

@@ -17,6 +17,7 @@
#include <float.h>
#include "nnacl/fp32/reduce.h"
#include "nnacl/errorcode.h"
#include "nnacl/common_func.h"

int ReduceMean(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
const int tid, const int thread_num) {
@@ -123,6 +124,31 @@ int ReduceProd(const int outer_size, const int inner_size, const int axis_size,
}
return NNACL_OK;
}

int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
const int tid, const int thread_num) {
if (src_data == NULL || dst_data == NULL) {
return NNACL_NULL_PTR;
}
int i, j, k;
for (j = tid; j < outer_size; j += thread_num) {
const int *outer_src = src_data + j * axis_size * inner_size;
int *outer_dst = dst_data + j * inner_size;
for (k = 0; k < inner_size; k++) {
const int *inner_src = outer_src + k;
int *inner_dst = outer_dst + k;
int tmp = 1;
for (i = 0; i < axis_size; i++) {
if (isMulOverflow(tmp, inner_src[i * inner_size])) {
return NNACL_ERRCODE_MUL_OVERFLOW;
}
tmp *= inner_src[i * inner_size];
}
*inner_dst = tmp;
}
}
return NNACL_OK;
}
int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num) {
if (src_data == NULL || dst_data == NULL) {
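
For reference, a minimal single-threaded call of the new IntReduceProd: reducing a [2, 3, 2] int tensor over its middle axis, so outer_size = 2, axis_size = 3, inner_size = 2. The wrapper function and concrete values are illustrative, and the snippet assumes it is built inside the lite tree so the nnacl headers resolve.

    #include "nnacl/fp32/reduce.h"
    #include "nnacl/errorcode.h"

    int ProdOverMiddleAxis(void) {
      /* Layout is [outer][axis][inner]; the product is taken over the axis dim. */
      const int src[12] = {1, 2,  3, 4,  5, 6,
                           7, 8,  9, 10, 11, 12};
      int dst[4] = {0}; /* expected result: {15, 48, 693, 960} */
      /* tid = 0 with thread_num = 1 makes this single call cover both outer slices. */
      return IntReduceProd(/*outer_size=*/2, /*inner_size=*/2, /*axis_size=*/3,
                           src, dst, /*tid=*/0, /*thread_num=*/1); /* NNACL_OK on success */
    }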


mindspore/lite/nnacl/fp32/reduce.h (+2, -0)

@@ -32,6 +32,8 @@ int ReduceMin(const int outer_size, const int inner_size, const int axis_size, c
const int tid, const int thread_num);
int ReduceProd(const int outer_size, const int inner_size, const int axis_size, const float *src_data, float *dst_data,
const int tid, const int thread_num);
int IntReduceProd(const int outer_size, const int inner_size, const int axis_size, const int *src_data, int *dst_data,
const int tid, const int thread_num);
int ReduceSumSquare(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num);
#ifdef __cplusplus


mindspore/lite/nnacl/int8/reduce_int8.c (+2, -11)

@@ -18,16 +18,7 @@
#include "nnacl/int8/reduce_int8.h"
#include "nnacl/errorcode.h"
#include "nnacl/quantization/fixed_point.h"

inline bool isAddOverflow(int32_t x, int32_t y) {
int32_t sum = x + y;
return (x > 0 && y > 0 && sum < 0) || (x < 0 && y < 0 && sum > 0);
}

inline bool isMulOverflow(int32_t x, int32_t y) {
int32_t p = x * y;
return (x != 0) && (p / x != y);
}
#include "nnacl/common_func.h"

// Get x such that (x-zp_in) * scale_in = mean
// Assuming reduce n axes, this works for first n-1 reduce. One call for one reduce.
@@ -268,7 +259,7 @@ int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
(tmp - quant->in_zp_) * (1 << ((unsigned int)quant->in_out_left_shift_ + base_offset)),
quant->in_out_multiplier_),
quant->in_out_right_shift_ + base_offset);
quant->in_out_right_shift_ + base_offset);
if (isAddOverflow(tmp_scaled, quant->out_zp_)) {
return NNACL_ERRCODE_ADD_OVERFLOW;
}


mindspore/lite/nnacl/op_base.h (+1, -0)

@@ -53,6 +53,7 @@

typedef enum LiteDataType {
kDataTypeFloat,
kDataTypeInt,
kDataTypeInt8,
} LiteDataType;



mindspore/lite/src/runtime/kernel/arm/base/reduce_base.cc (+2, -0)

@@ -257,6 +257,8 @@ kernel::LiteKernel *CpuReduceInt8KernelCreator(const std::vector<lite::Tensor *>
}

REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_Reduce, CpuReduceFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Mean, CpuMeanFp32KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Reduce, CpuReduceInt8KernelCreator)
REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_Mean, CpuReduceInt8KernelCreator)


mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc (+27, -7)

@@ -64,6 +64,7 @@ int ReduceCPUKernel::Init() {
}
case static_cast<int>(ReduceMode_ReduceProd): {
reducer_ = ReduceProd;
int_reducer_ = IntReduceProd;
break;
}
case static_cast<int>(ReduceMode_ReduceSumSquare): {
@@ -81,10 +82,25 @@ int ReduceCPUKernel::Init() {
return ReSize();
}

int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); }
int ReduceCPUKernel::ReSize() {
if (in_tensors().at(0)->data_type() == kNumberTypeFloat32) {
data_type_ = kDataTypeFloat;
} else {
data_type_ = kDataTypeInt;
}
return ReduceBaseCPUKernel::ReSize();
}

int ReduceCPUKernel::CallReduceUnit(int task_id) {
auto ret = reducer_(outer_size_, inner_size_, axis_size_, src_data_, dst_data_, task_id, context_->thread_num_);
int ret;
if (data_type_ == kDataTypeFloat) {
ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_),
static_cast<float *>(dst_data_), task_id, context_->thread_num_);
} else {
ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_),
static_cast<int *>(dst_data_), task_id, context_->thread_num_);
}

return ret;
}

@@ -110,12 +126,12 @@ int ReduceCPUKernel::Run() {
return ret;
}

src_data_ = static_cast<float *>(in_tensors_.at(0)->MutableData());
src_data_ = in_tensors_.at(0)->MutableData();
for (size_t i = 0; i < static_cast<size_t>(num_axes_); ++i) {
if (i != static_cast<size_t>(num_axes_ - 1)) {
dst_data_ = data_buffers_[i];
} else {
dst_data_ = reinterpret_cast<float *>(out_tensors_.at(0)->MutableData());
dst_data_ = out_tensors_.at(0)->MutableData();
}
outer_size_ = outer_sizes_[i];
inner_size_ = inner_sizes_[i];
@@ -135,7 +151,12 @@ int ReduceCPUKernel::Run() {
int ReduceCPUKernel::MallocTmpBuffer() {
data_buffers_.clear();
for (auto size : buffer_sizes_) {
float *buffer = reinterpret_cast<float *>(context_->allocator->Malloc(size * sizeof(float)));
void *buffer;
if (data_type_ == kDataTypeFloat) {
buffer = context_->allocator->Malloc(size * sizeof(float));
} else {
buffer = context_->allocator->Malloc(size * sizeof(int));
}
if (buffer == nullptr) {
MS_LOG(ERROR) << "Malloc data failed.";
return RET_ERROR;
@@ -146,8 +167,7 @@ int ReduceCPUKernel::MallocTmpBuffer() {
}

void ReduceCPUKernel::FreeTmpBuffer() {
for (size_t i = 0; i < data_buffers_.size(); i++) {
float *buffer = data_buffers_[i];
for (auto buffer : data_buffers_) {
if (buffer != nullptr) {
context_->allocator->Free(buffer);
buffer = nullptr;
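
The reworked CallReduceUnit above keeps src_data_ and dst_data_ as untyped pointers and selects reducer_ or int_reducer_ from data_type_. As a standalone illustration of that dispatch pattern (the names below are made up for the sketch, not the kernel's API):

    typedef int (*FloatReducer)(const int outer_size, const int inner_size, const int axis_size,
                                const float *src_data, float *dst_data, const int tid,
                                const int thread_num);
    typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size,
                              const int *src_data, int *dst_data, const int tid,
                              const int thread_num);

    typedef enum { kFloatTag, kIntTag } DataTag;

    /* Pick the reducer that matches the element type and cast the untyped
     * buffers at the call site, mirroring ReduceCPUKernel::CallReduceUnit. */
    static int CallReduce(DataTag tag, FloatReducer freduce, IntReducer ireduce,
                          int outer, int inner, int axis,
                          const void *src, void *dst, int tid, int thread_num) {
      if (tag == kFloatTag) {
        return freduce(outer, inner, axis, (const float *)src, (float *)dst, tid, thread_num);
      }
      return ireduce(outer, inner, axis, (const int *)src, (int *)dst, tid, thread_num);
    }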


mindspore/lite/src/runtime/kernel/arm/fp32/reduce.h (+10, -4)

@@ -29,6 +29,8 @@ namespace mindspore::kernel {
class ReduceCPUKernel : public ReduceBaseCPUKernel {
typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data,
float *dst_data, const int tid, const int thread_num);
typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data,
int *dst_data, const int tid, const int thread_num);

public:
ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs,
@@ -36,9 +38,10 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {
const mindspore::lite::PrimitiveC *primitive)
: ReduceBaseCPUKernel(param, inputs, outputs, ctx, primitive) {}
~ReduceCPUKernel() {
FreeTmpBuffer();
src_data_ = nullptr;
dst_data_ = nullptr;
reducer_ = nullptr;
int_reducer_ = nullptr;
}

int Init() override;
@@ -48,9 +51,12 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel {

private:
Reducer reducer_ = nullptr;
std::vector<float *> data_buffers_;
const float *src_data_ = nullptr;
float *dst_data_ = nullptr;
IntReducer int_reducer_ = nullptr;
std::vector<void *> data_buffers_;
LiteDataType data_type_;

const void *src_data_ = nullptr;
void *dst_data_ = nullptr;

private:
int MallocTmpBuffer();

