|
|
|
@@ -14,8 +14,8 @@ |
|
|
|
* limitations under the License. |
|
|
|
*/ |
|
|
|
|
|
|
|
#include <float.h> |
|
|
|
#include "nnacl/fp32/reduce.h" |
|
|
|
#include <float.h> |
|
|
|
#include "nnacl/errorcode.h" |
|
|
|
#include "nnacl/common_func.h" |
|
|
|
|
|
|
|
@@ -45,11 +45,27 @@ int ReduceSum(const int outer_size, const int inner_size, const int axis_size, c |
|
|
|
if (src_data == NULL || dst_data == NULL) { |
|
|
|
return NNACL_NULL_PTR; |
|
|
|
} |
|
|
|
int i, j, k; |
|
|
|
int i, j; |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
int block_mod = inner_size % C4NUM; |
|
|
|
int block_c4 = inner_size - block_mod; |
|
|
|
#endif |
|
|
|
for (j = tid; j < outer_size; j += thread_num) { |
|
|
|
const float *outer_src = src_data + j * axis_size * inner_size; |
|
|
|
float *outer_dst = dst_data + j * inner_size; |
|
|
|
for (k = 0; k < inner_size; k++) { |
|
|
|
int k = 0; |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
for (; k < block_c4; k += C4NUM) { |
|
|
|
const float *inner_src = outer_src + k; |
|
|
|
float *inner_dst = outer_dst + k; |
|
|
|
float32x4_t tmp = {0, 0, 0, 0}; |
|
|
|
for (i = 0; i < axis_size; i++) { |
|
|
|
tmp = vaddq_f32(tmp, vld1q_f32(inner_src + i * inner_size)); |
|
|
|
} |
|
|
|
vst1q_f32(inner_dst, tmp); |
|
|
|
} |
|
|
|
#endif |
|
|
|
for (; k < inner_size; k++) { |
|
|
|
const float *inner_src = outer_src + k; |
|
|
|
float *inner_dst = outer_dst + k; |
|
|
|
float tmp = 0.0f; |
|
|
|
|