| @@ -44,6 +44,49 @@ int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_d | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int IntReduceMean(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | |||||
| int thread_num) { | |||||
| if (src_data == NULL || dst_data == NULL) { | |||||
| return NNACL_NULL_PTR; | |||||
| } | |||||
| int i, j; | |||||
| #ifdef ENABLE_NEON | |||||
| int block_mod = inner_size % C4NUM; | |||||
| int block_c4 = inner_size - block_mod; | |||||
| #endif | |||||
| for (j = tid; j < outer_size; j += thread_num) { | |||||
| const int *outer_src = src_data + j * axis_size * inner_size; | |||||
| int *outer_dst = dst_data + j * inner_size; | |||||
| int k = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; k < block_c4; k += C4NUM) { | |||||
| const int *inner_src = outer_src + k; | |||||
| int *inner_dst = outer_dst + k; | |||||
| int32x4_t tmp = {0, 0, 0, 0}; | |||||
| for (i = 0; i < axis_size; i++) { | |||||
| tmp = vaddq_s32(tmp, vld1q_s32(inner_src + i * inner_size)); | |||||
| } | |||||
| tmp[0] /= axis_size; | |||||
| tmp[1] /= axis_size; | |||||
| tmp[2] /= axis_size; | |||||
| tmp[3] /= axis_size; | |||||
| vst1q_s32(inner_dst, tmp); | |||||
| } | |||||
| #endif | |||||
| for (; k < inner_size; k++) { | |||||
| const int *inner_src = outer_src + k; | |||||
| int *inner_dst = outer_dst + k; | |||||
| int tmp = 0; | |||||
| for (i = 0; i < axis_size; i++) { | |||||
| tmp += inner_src[i * inner_size]; | |||||
| } | |||||
| *inner_dst = tmp / axis_size; | |||||
| } | |||||
| } | |||||
| return NNACL_OK; | |||||
| } | |||||
| int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -81,6 +124,7 @@ int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_da | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -118,6 +162,7 @@ int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_d | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ReduceMax(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | int ReduceMax(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -139,6 +184,7 @@ int ReduceMax(int outer_size, int inner_size, int axis_size, const float *src_da | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int IntReduceMax(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | int IntReduceMax(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -160,6 +206,7 @@ int IntReduceMax(int outer_size, int inner_size, int axis_size, const int *src_d | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ReduceMin(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | int ReduceMin(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -181,6 +228,7 @@ int ReduceMin(int outer_size, int inner_size, int axis_size, const float *src_da | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int IntReduceMin(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | int IntReduceMin(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -271,6 +319,7 @@ int IntReduceProd(int outer_size, int inner_size, int axis_size, const int *src_ | |||||
| } | } | ||||
| return NNACL_OK; | return NNACL_OK; | ||||
| } | } | ||||
| int ReduceSumSquare(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | int ReduceSumSquare(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | ||||
| int thread_num) { | int thread_num) { | ||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| @@ -24,6 +24,8 @@ extern "C" { | |||||
| #endif | #endif | ||||
| int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | ||||
| int thread_num); | int thread_num); | ||||
| int IntReduceMean(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | |||||
| int thread_num); | |||||
| int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data, int tid, | ||||
| int thread_num); | int thread_num); | ||||
| int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data, int tid, | ||||
| @@ -40,54 +40,13 @@ using mindspore::schema::ReduceMode_ReduceSum; | |||||
| using mindspore::schema::ReduceMode_ReduceSumSquare; | using mindspore::schema::ReduceMode_ReduceSumSquare; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| int ReduceCPUKernel::Init() { | int ReduceCPUKernel::Init() { | ||||
| auto ret = ReduceBaseCPUKernel::Init(); | auto ret = ReduceBaseCPUKernel::Init(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| switch (mode_) { | |||||
| case static_cast<int>(ReduceMode_ReduceSum): { | |||||
| reducer_ = ReduceSum; | |||||
| int_reducer_ = IntReduceSum; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceMean): { | |||||
| reducer_ = ReduceMean; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceMax): { | |||||
| reducer_ = ReduceMax; | |||||
| int_reducer_ = IntReduceMax; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceMin): { | |||||
| reducer_ = ReduceMin; | |||||
| int_reducer_ = IntReduceMin; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceProd): { | |||||
| reducer_ = ReduceProd; | |||||
| int_reducer_ = IntReduceProd; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceSumSquare): { | |||||
| reducer_ = ReduceSum; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceASum): { | |||||
| reducer_ = ReduceSum; | |||||
| break; | |||||
| } | |||||
| case static_cast<int>(ReduceMode_ReduceAll): { | |||||
| bool_reducer_ = ReduceAll; | |||||
| break; | |||||
| } | |||||
| default: | |||||
| MS_LOG(ERROR) << "Reduce unsupported reduce mode: " << mode_; | |||||
| return RET_ERROR; | |||||
| } | |||||
| InitialKernelList(); | |||||
| if (!InferShapeDone()) { | if (!InferShapeDone()) { | ||||
| return RET_OK; | return RET_OK; | ||||
| @@ -98,19 +57,29 @@ int ReduceCPUKernel::Init() { | |||||
| int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); } | int ReduceCPUKernel::ReSize() { return ReduceBaseCPUKernel::ReSize(); } | ||||
| int ReduceCPUKernel::CallReduceUnit(int task_id) { | int ReduceCPUKernel::CallReduceUnit(int task_id) { | ||||
| int ret; | |||||
| if (data_type_ == kDataTypeFloat) { | if (data_type_ == kDataTypeFloat) { | ||||
| ret = reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_), | |||||
| static_cast<float *>(dst_data_), task_id, context_->thread_num_); | |||||
| if (!reducer_) { | |||||
| MS_LOG(ERROR) << "function reducer_ is null."; | |||||
| return RET_NULL_PTR; | |||||
| } | |||||
| reducer_(outer_size_, inner_size_, axis_size_, static_cast<const float *>(src_data_), | |||||
| static_cast<float *>(dst_data_), task_id, context_->thread_num_); | |||||
| } else if (data_type_ == KDataTypeBool) { | } else if (data_type_ == KDataTypeBool) { | ||||
| ret = bool_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const bool *>(src_data_), | |||||
| static_cast<bool *>(dst_data_), task_id, context_->thread_num_); | |||||
| if (!bool_reducer_) { | |||||
| MS_LOG(ERROR) << "function bool_reducer_ is null."; | |||||
| return RET_NULL_PTR; | |||||
| } | |||||
| bool_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const bool *>(src_data_), | |||||
| static_cast<bool *>(dst_data_), task_id, context_->thread_num_); | |||||
| } else { | } else { | ||||
| ret = int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_), | |||||
| static_cast<int *>(dst_data_), task_id, context_->thread_num_); | |||||
| if (!int_reducer_) { | |||||
| MS_LOG(ERROR) << "function int_reducer_ is null."; | |||||
| return RET_NULL_PTR; | |||||
| } | |||||
| int_reducer_(outer_size_, inner_size_, axis_size_, static_cast<const int *>(src_data_), | |||||
| static_cast<int *>(dst_data_), task_id, context_->thread_num_); | |||||
| } | } | ||||
| return ret; | |||||
| return RET_OK; | |||||
| } | } | ||||
| int ReduceImpl(void *cdata, int task_id) { | int ReduceImpl(void *cdata, int task_id) { | ||||
| @@ -143,7 +112,7 @@ int ReduceCPUKernel::Run() { | |||||
| if (i != static_cast<size_t>(num_axes_ - 1)) { | if (i != static_cast<size_t>(num_axes_ - 1)) { | ||||
| dst_data_ = data_buffers_.at(i); | dst_data_ = data_buffers_.at(i); | ||||
| } else { | } else { | ||||
| dst_data_ = out_tensors_.at(0)->MutableData(); | |||||
| dst_data_ = out_tensors_.at(0)->data_c(); | |||||
| } | } | ||||
| outer_size_ = outer_sizes_.at(i); | outer_size_ = outer_sizes_.at(i); | ||||
| inner_size_ = inner_sizes_.at(i); | inner_size_ = inner_sizes_.at(i); | ||||
| @@ -173,7 +142,7 @@ void ReduceCPUKernel::HandleASumAndSumSquare() { | |||||
| return; | return; | ||||
| } | } | ||||
| int num = in_tensors_.at(0)->ElementsNum(); | int num = in_tensors_.at(0)->ElementsNum(); | ||||
| float *data = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||||
| auto *data = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||||
| if (data == nullptr) { | if (data == nullptr) { | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -197,7 +166,7 @@ int ReduceCPUKernel::CalculateCoeffOutput() { | |||||
| if (data_type_ != kDataTypeFloat) { | if (data_type_ != kDataTypeFloat) { | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| float *out_data = reinterpret_cast<float *>(out_tensor->MutableData()); | |||||
| auto *out_data = reinterpret_cast<float *>(out_tensor->data_c()); | |||||
| if (out_data == nullptr) { | if (out_data == nullptr) { | ||||
| return RET_NULL_PTR; | return RET_NULL_PTR; | ||||
| } | } | ||||
| @@ -237,6 +206,26 @@ void ReduceCPUKernel::FreeTmpBuffer() { | |||||
| data_buffers_.clear(); | data_buffers_.clear(); | ||||
| } | } | ||||
| void ReduceCPUKernel::InitialKernelList() { | |||||
| ReduceKernelList func_list[] = {{ReduceMode_ReduceSum, ReduceSum, IntReduceSum, nullptr}, | |||||
| {ReduceMode_ReduceMean, ReduceMean, IntReduceMean, nullptr}, | |||||
| {ReduceMode_ReduceMax, ReduceMax, IntReduceMax, nullptr}, | |||||
| {ReduceMode_ReduceMin, ReduceMin, IntReduceMin, nullptr}, | |||||
| {ReduceMode_ReduceProd, ReduceProd, IntReduceProd, nullptr}, | |||||
| {ReduceMode_ReduceSumSquare, ReduceSum, IntReduceSum, nullptr}, | |||||
| {ReduceMode_ReduceASum, ReduceSum, IntReduceSum, nullptr}, | |||||
| {ReduceMode_ReduceAll, nullptr, nullptr, ReduceAll}}; | |||||
| int list_len = sizeof(func_list) / sizeof(ReduceKernelList); | |||||
| for (int i = 0; i < list_len; ++i) { | |||||
| if (mode_ == func_list[i].type_) { | |||||
| reducer_ = func_list[i].float_func_; | |||||
| int_reducer_ = func_list[i].int_func_; | |||||
| bool_reducer_ = func_list[i].bool_func_; | |||||
| break; | |||||
| } | |||||
| } | |||||
| } | |||||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_ReduceFusion, LiteKernelCreator<ReduceCPUKernel>) | REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_ReduceFusion, LiteKernelCreator<ReduceCPUKernel>) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_ReduceFusion, LiteKernelCreator<ReduceCPUKernel>) | REG_KERNEL(kCPU, kNumberTypeInt, PrimitiveType_ReduceFusion, LiteKernelCreator<ReduceCPUKernel>) | ||||
| REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_ReduceFusion, LiteKernelCreator<ReduceCPUKernel>) | REG_KERNEL(kCPU, kNumberTypeInt32, PrimitiveType_ReduceFusion, LiteKernelCreator<ReduceCPUKernel>) | ||||
| @@ -26,21 +26,27 @@ | |||||
| using mindspore::schema::ReduceMode; | using mindspore::schema::ReduceMode; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| class ReduceCPUKernel : public ReduceBaseCPUKernel { | |||||
| typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data, | |||||
| float *dst_data, const int tid, const int thread_num); | |||||
| typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data, | |||||
| int *dst_data, const int tid, const int thread_num); | |||||
| typedef int (*BoolReducer)(const int outer_size, const int inner_size, const int axis_size, const bool *src_data, | |||||
| bool *dst_data, const int tid, const int thread_num); | |||||
| typedef int (*Reducer)(const int outer_size, const int inner_size, const int axis_size, const float *src_data, | |||||
| float *dst_data, const int tid, const int thread_num); | |||||
| typedef int (*IntReducer)(const int outer_size, const int inner_size, const int axis_size, const int *src_data, | |||||
| int *dst_data, const int tid, const int thread_num); | |||||
| typedef int (*BoolReducer)(const int outer_size, const int inner_size, const int axis_size, const bool *src_data, | |||||
| bool *dst_data, const int tid, const int thread_num); | |||||
| struct ReduceKernelList { | |||||
| int type_; | |||||
| Reducer float_func_; | |||||
| IntReducer int_func_; | |||||
| BoolReducer bool_func_; | |||||
| }; | |||||
| class ReduceCPUKernel : public ReduceBaseCPUKernel { | |||||
| public: | public: | ||||
| ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | ReduceCPUKernel(OpParameter *param, const std::vector<lite::Tensor *> &inputs, | ||||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | ||||
| : ReduceBaseCPUKernel(param, inputs, outputs, ctx) { | : ReduceBaseCPUKernel(param, inputs, outputs, ctx) { | ||||
| reduce_param_ = reinterpret_cast<ReduceParameter *>(param); | reduce_param_ = reinterpret_cast<ReduceParameter *>(param); | ||||
| } | } | ||||
| ~ReduceCPUKernel() { | |||||
| ~ReduceCPUKernel() override { | |||||
| src_data_ = nullptr; | src_data_ = nullptr; | ||||
| dst_data_ = nullptr; | dst_data_ = nullptr; | ||||
| reducer_ = nullptr; | reducer_ = nullptr; | ||||
| @@ -52,6 +58,9 @@ class ReduceCPUKernel : public ReduceBaseCPUKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int CallReduceUnit(int task_id); | int CallReduceUnit(int task_id); | ||||
| protected: | |||||
| void InitialKernelList(); | |||||
| private: | private: | ||||
| ReduceParameter *reduce_param_; | ReduceParameter *reduce_param_; | ||||
| Reducer reducer_ = nullptr; | Reducer reducer_ = nullptr; | ||||