From 99a97d14cf1ce7a677b4a4c40f38d4aa6a3122ea Mon Sep 17 00:00:00 2001 From: tao_yunhao Date: Fri, 18 Sep 2020 14:30:45 +0800 Subject: [PATCH] optimize cpu op reduce --- mindspore/lite/nnacl/fp32/reduce.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/mindspore/lite/nnacl/fp32/reduce.c b/mindspore/lite/nnacl/fp32/reduce.c index ce78faf7a4..d95d81a037 100644 --- a/mindspore/lite/nnacl/fp32/reduce.c +++ b/mindspore/lite/nnacl/fp32/reduce.c @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include "nnacl/fp32/reduce.h" +#include #include "nnacl/errorcode.h" #include "nnacl/common_func.h" @@ -45,11 +45,27 @@ int ReduceSum(const int outer_size, const int inner_size, const int axis_size, c if (src_data == NULL || dst_data == NULL) { return NNACL_NULL_PTR; } - int i, j, k; + int i, j; +#ifdef ENABLE_NEON + int block_mod = inner_size % C4NUM; + int block_c4 = inner_size - block_mod; +#endif for (j = tid; j < outer_size; j += thread_num) { const float *outer_src = src_data + j * axis_size * inner_size; float *outer_dst = dst_data + j * inner_size; - for (k = 0; k < inner_size; k++) { + int k = 0; +#ifdef ENABLE_NEON + for (; k < block_c4; k += C4NUM) { + const float *inner_src = outer_src + k; + float *inner_dst = outer_dst + k; + float32x4_t tmp = {0, 0, 0, 0}; + for (i = 0; i < axis_size; i++) { + tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size)); + } + vst1q_f32(inner_dst, tmp); + } +#endif + for (; k < inner_size; k++) { const float *inner_src = outer_src + k; float *inner_dst = outer_dst + k; float tmp = 0.0f;