| @@ -14,8 +14,8 @@ | |||||
| * limitations under the License. | * limitations under the License. | ||||
| */ | */ | ||||
| #include <float.h> | |||||
| #include "nnacl/fp32/reduce.h" | #include "nnacl/fp32/reduce.h" | ||||
| #include <float.h> | |||||
| #include "nnacl/errorcode.h" | #include "nnacl/errorcode.h" | ||||
| #include "nnacl/common_func.h" | #include "nnacl/common_func.h" | ||||
| @@ -45,11 +45,27 @@ int ReduceSum(const int outer_size, const int inner_size, const int axis_size, c | |||||
| if (src_data == NULL || dst_data == NULL) { | if (src_data == NULL || dst_data == NULL) { | ||||
| return NNACL_NULL_PTR; | return NNACL_NULL_PTR; | ||||
| } | } | ||||
| int i, j, k; | |||||
| int i, j; | |||||
| #ifdef ENABLE_NEON | |||||
| int block_mod = inner_size % C4NUM; | |||||
| int block_c4 = inner_size - block_mod; | |||||
| #endif | |||||
| for (j = tid; j < outer_size; j += thread_num) { | for (j = tid; j < outer_size; j += thread_num) { | ||||
| const float *outer_src = src_data + j * axis_size * inner_size; | const float *outer_src = src_data + j * axis_size * inner_size; | ||||
| float *outer_dst = dst_data + j * inner_size; | float *outer_dst = dst_data + j * inner_size; | ||||
| for (k = 0; k < inner_size; k++) { | |||||
| int k = 0; | |||||
| #ifdef ENABLE_NEON | |||||
| for (; k < block_c4; k += C4NUM) { | |||||
| const float *inner_src = outer_src + k; | |||||
| float *inner_dst = outer_dst + k; | |||||
| float32x4_t tmp = {0, 0, 0, 0}; | |||||
| for (i = 0; i < axis_size; i++) { | |||||
| tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size)); | |||||
| } | |||||
| vst1q_f32(inner_dst, tmp); | |||||
| } | |||||
| #endif | |||||
| for (; k < inner_size; k++) { | |||||
| const float *inner_src = outer_src + k; | const float *inner_src = outer_src + k; | ||||
| float *inner_dst = outer_dst + k; | float *inner_dst = outer_dst + k; | ||||
| float tmp = 0.0f; | float tmp = 0.0f; | ||||