|
|
@@ -1321,19 +1321,14 @@ int ElementOptSquaredDifferenceFp16(float16_t *input0, float16_t *input1, float1 |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { |
|
|
int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { |
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
int block_mod = element_size % C8NUM; |
|
|
int block_mod = element_size % C8NUM; |
|
|
int block_c8 = element_size - block_mod; |
|
|
int block_c8 = element_size - block_mod; |
|
|
for (int index = 0; index < block_c8; index += C8NUM) { |
|
|
for (int index = 0; index < block_c8; index += C8NUM) { |
|
|
#ifdef ENABLE_NEON |
|
|
|
|
|
float16x8_t vin0 = vld1q_f16(input0); |
|
|
float16x8_t vin0 = vld1q_f16(input0); |
|
|
float16x8_t vin1 = vld1q_f16(input1); |
|
|
float16x8_t vin1 = vld1q_f16(input1); |
|
|
float16x8_t vout = vmaxq_f16(vin0, vin1); |
|
|
float16x8_t vout = vmaxq_f16(vin0, vin1); |
|
|
vst1q_f16(output, vout); |
|
|
vst1q_f16(output, vout); |
|
|
#else |
|
|
|
|
|
for (int i = 0; i < C8NUM; ++i) { |
|
|
|
|
|
output[i] = MSMAX(input0[i], input1[i]); |
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
|
|
|
input0 += C8NUM; |
|
|
input0 += C8NUM; |
|
|
input1 += C8NUM; |
|
|
input1 += C8NUM; |
|
|
output += C8NUM; |
|
|
output += C8NUM; |
|
|
@@ -1341,6 +1336,11 @@ int ElementMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, |
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
output[index] = MSMAX(input0[index], input1[index]); |
|
|
output[index] = MSMAX(input0[index], input1[index]); |
|
|
} |
|
|
} |
|
|
|
|
|
#else |
|
|
|
|
|
for (int index = 0; index < element_size; ++index) { |
|
|
|
|
|
output[index] = MSMAX(input0[index], input1[index]); |
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
return NNACL_OK; |
|
|
return NNACL_OK; |
|
|
} |
|
|
} |
|
|
int ElementOptMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size, |
|
|
int ElementOptMaximumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size, |
|
|
@@ -1394,19 +1394,14 @@ int ElementOptMaximumFp16(float16_t *input0, float16_t *input1, float16_t *outpu |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { |
|
|
int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { |
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
int block_mod = element_size % C8NUM; |
|
|
int block_mod = element_size % C8NUM; |
|
|
int block_c8 = element_size - block_mod; |
|
|
int block_c8 = element_size - block_mod; |
|
|
for (int index = 0; index < block_c8; index += C8NUM) { |
|
|
for (int index = 0; index < block_c8; index += C8NUM) { |
|
|
#ifdef ENABLE_NEON |
|
|
|
|
|
float16x8_t vin0 = vld1q_f16(input0); |
|
|
float16x8_t vin0 = vld1q_f16(input0); |
|
|
float16x8_t vin1 = vld1q_f16(input1); |
|
|
float16x8_t vin1 = vld1q_f16(input1); |
|
|
float16x8_t vout = vminq_f16(vin0, vin1); |
|
|
float16x8_t vout = vminq_f16(vin0, vin1); |
|
|
vst1q_f16(output, vout); |
|
|
vst1q_f16(output, vout); |
|
|
#else |
|
|
|
|
|
for (int i = 0; i < C8NUM; ++i) { |
|
|
|
|
|
output[i] = MSMIN(input0[i], input1[i]); |
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
|
|
|
input0 += C8NUM; |
|
|
input0 += C8NUM; |
|
|
input1 += C8NUM; |
|
|
input1 += C8NUM; |
|
|
output += C8NUM; |
|
|
output += C8NUM; |
|
|
@@ -1414,6 +1409,11 @@ int ElementMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, |
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
output[index] = MSMIN(input0[index], input1[index]); |
|
|
output[index] = MSMIN(input0[index], input1[index]); |
|
|
} |
|
|
} |
|
|
|
|
|
#else |
|
|
|
|
|
for (int index = 0; index < element_size; ++index) { |
|
|
|
|
|
output[index] = MSMIN(input0[index], input1[index]); |
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
return NNACL_OK; |
|
|
return NNACL_OK; |
|
|
} |
|
|
} |
|
|
int ElementOptMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size, |
|
|
int ElementOptMinimumFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size, |
|
|
@@ -1783,23 +1783,18 @@ int ElementOptLessEqualFp16(float16_t *input0, float16_t *input1, float16_t *out |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { |
|
|
int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size) { |
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
int block_mod = element_size % C8NUM; |
|
|
int block_mod = element_size % C8NUM; |
|
|
int block_c8 = element_size - block_mod; |
|
|
int block_c8 = element_size - block_mod; |
|
|
#ifdef ENABLE_NEON |
|
|
|
|
|
|
|
|
|
|
|
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; |
|
|
float16x8_t vtrue = {1, 1, 1, 1, 1, 1, 1, 1}; |
|
|
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; |
|
|
float16x8_t vfalse = {0, 0, 0, 0, 0, 0, 0, 0}; |
|
|
#endif |
|
|
|
|
|
for (int index = 0; index < block_c8; index += C8NUM) { |
|
|
for (int index = 0; index < block_c8; index += C8NUM) { |
|
|
#ifdef ENABLE_NEON |
|
|
|
|
|
float16x8_t vin0 = vld1q_f16(input0); |
|
|
float16x8_t vin0 = vld1q_f16(input0); |
|
|
float16x8_t vin1 = vld1q_f16(input1); |
|
|
float16x8_t vin1 = vld1q_f16(input1); |
|
|
float16x8_t vout = vbslq_f16(vcgtq_f16(vin0, vin1), vtrue, vfalse); |
|
|
float16x8_t vout = vbslq_f16(vcgtq_f16(vin0, vin1), vtrue, vfalse); |
|
|
vst1q_f16(output, vout); |
|
|
vst1q_f16(output, vout); |
|
|
#else |
|
|
|
|
|
for (int i = 0; i < C8NUM; ++i) { |
|
|
|
|
|
output[i] = (float16_t)(input0[i] > input1[i]); |
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
input0 += C8NUM; |
|
|
input0 += C8NUM; |
|
|
input1 += C8NUM; |
|
|
input1 += C8NUM; |
|
|
output += C8NUM; |
|
|
output += C8NUM; |
|
|
@@ -1807,6 +1802,11 @@ int ElementGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, |
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
output[index] = (float16_t)(input0[index] > input1[index]); |
|
|
output[index] = (float16_t)(input0[index] > input1[index]); |
|
|
} |
|
|
} |
|
|
|
|
|
#else |
|
|
|
|
|
for (int index = 0; index < element_size; ++index) { |
|
|
|
|
|
output[index] = (float16_t)(input0[index] > input1[index]); |
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
return NNACL_OK; |
|
|
return NNACL_OK; |
|
|
} |
|
|
} |
|
|
int ElementOptGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size, |
|
|
int ElementOptGreaterFp16(float16_t *input0, float16_t *input1, float16_t *output, int element_size, |
|
|
|