|
|
|
@@ -21,7 +21,7 @@ int ElementMul(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vmulq_f32(vin0, vin1); |
|
|
|
@@ -43,6 +43,73 @@ int ElementMul(float *input0, float *input1, float *output, int element_size) { |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementMulRelu(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vmulq_f32(vin0, vin1); |
|
|
|
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
float res = input0[0] * input1[0]; |
|
|
|
output[0] = res > 0 ? res : 0; |
|
|
|
res = input0[1] * input1[1]; |
|
|
|
output[1] = res > 0 ? res : 0; |
|
|
|
res = input0[2] * input1[2]; |
|
|
|
output[2] = res > 0 ? res : 0; |
|
|
|
res = input0[3] * input1[3]; |
|
|
|
output[3] = res > 0 ? res : 0; |
|
|
|
#endif |
|
|
|
input0 += C4NUM; |
|
|
|
input1 += C4NUM; |
|
|
|
output += C4NUM; |
|
|
|
} |
|
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
|
float res = input0[index] * input1[index]; |
|
|
|
output[index] = res > 0 ? res : 0; |
|
|
|
} |
|
|
|
|
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementMulRelu6(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
float32x4_t bounds = {6, 6, 6, 6}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vminq_f32(vmaxq_f32(vmulq_f32(vin0, vin1), zeros), bounds); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = MSMIN(MSMAX(input0[0] * input1[0], 0), 6); |
|
|
|
output[1] = MSMIN(MSMAX(input0[1] * input1[1], 0), 6); |
|
|
|
output[2] = MSMIN(MSMAX(input0[2] * input1[2], 0), 6); |
|
|
|
output[3] = MSMIN(MSMAX(input0[3] * input1[3], 0), 6); |
|
|
|
#endif |
|
|
|
input0 += C4NUM; |
|
|
|
input1 += C4NUM; |
|
|
|
output += C4NUM; |
|
|
|
} |
|
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
|
output[index] = MSMIN(MSMAX(input0[index] * input1[index], 0), 6); |
|
|
|
} |
|
|
|
|
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int BroadcastMul(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size, |
|
|
|
ArithmeticParameter *param) { |
|
|
|
TileDimensions(input0, input1, tile_input0, tile_input1, param); |
|
|
|
@@ -54,7 +121,7 @@ int ElementAdd(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vaddq_f32(vin0, vin1); |
|
|
|
@@ -75,6 +142,72 @@ int ElementAdd(float *input0, float *input1, float *output, int element_size) { |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementAddRelu(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vaddq_f32(vin0, vin1); |
|
|
|
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
float res = input0[0] + input1[0]; |
|
|
|
output[0] = res > 0 ? res : 0; |
|
|
|
res = input0[1] + input1[1]; |
|
|
|
output[1] = res > 0 ? res : 0; |
|
|
|
res = input0[2] + input1[2]; |
|
|
|
output[2] = res > 0 ? res : 0; |
|
|
|
res = input0[3] + input1[3]; |
|
|
|
output[3] = res > 0 ? res : 0; |
|
|
|
#endif |
|
|
|
input0 += C4NUM; |
|
|
|
input1 += C4NUM; |
|
|
|
output += C4NUM; |
|
|
|
} |
|
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
|
float res = input0[index] + input1[index]; |
|
|
|
output[index] = res > 0 ? res : 0; |
|
|
|
} |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementAddRelu6(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
float32x4_t bounds = {6, 6, 6, 6}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vminq_f32(vmaxq_f32(vaddq_f32(vin0, vin1), zeros), bounds); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = MSMIN(MSMAX(input0[0] + input1[0], 0), 6); |
|
|
|
output[1] = MSMIN(MSMAX(input0[1] + input1[1], 0), 6); |
|
|
|
output[2] = MSMIN(MSMAX(input0[2] + input1[2], 0), 6); |
|
|
|
output[3] = MSMIN(MSMAX(input0[3] + input1[3], 0), 6); |
|
|
|
#endif |
|
|
|
input0 += C4NUM; |
|
|
|
input1 += C4NUM; |
|
|
|
output += C4NUM; |
|
|
|
} |
|
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
|
output[index] = MSMIN(MSMAX(input0[index] + input1[index], 0), 6); |
|
|
|
} |
|
|
|
|
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementAddInt8(int8_t *input0, int8_t *input1, int8_t *output, int element_size) { |
|
|
|
for (int i = 0; i < element_size; i++) { |
|
|
|
output[i] = input0[i] + input1[i]; |
|
|
|
@@ -99,7 +232,7 @@ int ElementSub(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vsubq_f32(vin0, vin1); |
|
|
|
@@ -120,6 +253,72 @@ int ElementSub(float *input0, float *input1, float *output, int element_size) { |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementSubRelu(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vsubq_f32(vin0, vin1); |
|
|
|
vout = vbslq_f32(vcgtq_f32(vout, zeros), vout, zeros); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
float res = input0[0] - input1[0]; |
|
|
|
output[0] = res > 0 ? res : 0; |
|
|
|
res = input0[1] - input1[1]; |
|
|
|
output[1] = res > 0 ? res : 0; |
|
|
|
res = input0[2] - input1[2]; |
|
|
|
output[2] = res > 0 ? res : 0; |
|
|
|
res = input0[3] - input1[3]; |
|
|
|
output[3] = res > 0 ? res : 0; |
|
|
|
#endif |
|
|
|
input0 += C4NUM; |
|
|
|
input1 += C4NUM; |
|
|
|
output += C4NUM; |
|
|
|
} |
|
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
|
float res = input0[index] - input1[index]; |
|
|
|
output[index] = res > 0 ? res : 0; |
|
|
|
} |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementSubRelu6(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
float32x4_t bounds = {6, 6, 6, 6}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vminq_f32(vmaxq_f32(vsubq_f32(vin0, vin1), zeros), bounds); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = MSMIN(MSMAX(input0[0] - input1[0], 0), 6); |
|
|
|
output[1] = MSMIN(MSMAX(input0[1] - input1[1], 0), 6); |
|
|
|
output[2] = MSMIN(MSMAX(input0[2] - input1[2], 0), 6); |
|
|
|
output[3] = MSMIN(MSMAX(input0[3] - input1[3], 0), 6); |
|
|
|
#endif |
|
|
|
input0 += C4NUM; |
|
|
|
input1 += C4NUM; |
|
|
|
output += C4NUM; |
|
|
|
} |
|
|
|
for (int index = 0; index < block_mod; ++index) { |
|
|
|
output[index] = MSMIN(MSMAX(input0[index] - input1[index], 0), 6); |
|
|
|
} |
|
|
|
|
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int BroadcastSub(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size, |
|
|
|
ArithmeticParameter *param) { |
|
|
|
TileDimensions(input0, input1, tile_input0, tile_input1, param); |
|
|
|
@@ -137,6 +336,27 @@ int ElementDiv(float *input0, float *input1, float *output, int element_size) { |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementDivRelu(float *input0, float *input1, float *output, int element_size) { |
|
|
|
for (int i = 0; i < element_size; i++) { |
|
|
|
if (input1[i] == 0) { |
|
|
|
return NNACL_ERRCODE_DIVISOR_ZERO; |
|
|
|
} |
|
|
|
float res = input0[i] / input1[i]; |
|
|
|
output[i] = res > 0 ? res : 0; |
|
|
|
} |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int ElementDivRelu6(float *input0, float *input1, float *output, int element_size) { |
|
|
|
for (int i = 0; i < element_size; i++) { |
|
|
|
if (input1[i] == 0) { |
|
|
|
return NNACL_ERRCODE_DIVISOR_ZERO; |
|
|
|
} |
|
|
|
output[i] = MSMIN(MSMAX(input0[i] / input1[i], 0), 6); |
|
|
|
} |
|
|
|
return NNACL_OK; |
|
|
|
} |
|
|
|
|
|
|
|
int BroadcastDiv(float *input0, float *input1, float *tile_input0, float *tile_input1, float *output, int element_size, |
|
|
|
ArithmeticParameter *param) { |
|
|
|
TileDimensions(input0, input1, tile_input0, tile_input1, param); |
|
|
|
@@ -179,11 +399,18 @@ int ElementLogicalAnd(float *input0, float *input1, float *output, int element_s |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
uint32x4_t mask = vmovq_n_u32((uint32_t(1u << 31) - 1)); |
|
|
|
uint32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vandq_f32(vin0, vin1); |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
uint32x4_t vin0 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input0)), mask); |
|
|
|
uint32x4_t vin1 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input1)), mask); |
|
|
|
float32x4_t vout = vbslq_f32(vceqq_u32(vandq_u32(vin0, vin1), zeros), vfalse, vtrue); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)((bool)(input0[0]) & (bool)(input1[0])); |
|
|
|
@@ -222,11 +449,18 @@ int ElementLogicalOr(float *input0, float *input1, float *output, int element_si |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
uint32x4_t mask = vmovq_n_u32((uint32_t(1u << 31) - 1)); |
|
|
|
uint32x4_t zeros = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vorrq_f32(vin0, vin1); |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
uint32x4_t vin0 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input0)), mask); |
|
|
|
uint32x4_t vin1 = vandq_u32(vreinterpretq_s32_f32(vld1q_f32(input1)), mask); |
|
|
|
float32x4_t vout = vbslq_f32(vceqq_u32(vorrq_u32(vin0, vin1), zeros), vfalse, vtrue); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)((bool)(input0[0]) | (bool)(input1[0])); |
|
|
|
@@ -255,7 +489,7 @@ int ElementMaximum(float *input0, float *input1, float *output, int element_size |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vmaxq_f32(vin0, vin1); |
|
|
|
@@ -287,7 +521,7 @@ int ElementMinimum(float *input0, float *input1, float *output, int element_size |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
|
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vminq_f32(vin0, vin1); |
|
|
|
@@ -317,15 +551,15 @@ int BroadcastMinimum(float *input0, float *input1, float *tile_input0, float *ti |
|
|
|
int ElementNotEqual(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vbslq_f32(vceqq_fp32(vin0, vin1), vfalse, vtrue); |
|
|
|
float32x4_t vout = vbslq_f32(vceqq_f32(vin0, vin1), vfalse, vtrue); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)(input0[0] != input1[0]); |
|
|
|
@@ -352,15 +586,15 @@ int BroadcastNotEqual(float *input0, float *input1, float *tile_input0, float *t |
|
|
|
int ElementEqual(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vbslq_f32(vceqq_fp32(vin0, vin1), vtrue, vfalse); |
|
|
|
float32x4_t vout = vbslq_f32(vceqq_f32(vin0, vin1), vtrue, vfalse); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)(input0[0] == input1[0]); |
|
|
|
@@ -387,15 +621,15 @@ int BroadcastEqual(float *input0, float *input1, float *tile_input0, float *tile |
|
|
|
int ElementLess(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vbslq_f32(vcltq_fp32(vin0, vin1), vtrue, vfalse); |
|
|
|
float32x4_t vout = vbslq_f32(vcltq_f32(vin0, vin1), vtrue, vfalse); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)(input0[0] < input1[0]); |
|
|
|
@@ -422,15 +656,15 @@ int BroadcastLess(float *input0, float *input1, float *tile_input0, float *tile_ |
|
|
|
int ElementLessEqual(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vbslq_f32(vcleq_fp32(vin0, vin1), vtrue, vfalse); |
|
|
|
float32x4_t vout = vbslq_f32(vcleq_f32(vin0, vin1), vtrue, vfalse); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)(input0[0] <= input1[0]); |
|
|
|
@@ -457,15 +691,15 @@ int BroadcastLessEqual(float *input0, float *input1, float *tile_input0, float * |
|
|
|
int ElementGreater(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vbslq_f32(vcgtq_fp32(vin0, vin1), vtrue, vfalse); |
|
|
|
float32x4_t vout = vbslq_f32(vcgtq_f32(vin0, vin1), vtrue, vfalse); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)(input0[0] > input1[0]); |
|
|
|
@@ -492,15 +726,15 @@ int BroadcastGreater(float *input0, float *input1, float *tile_input0, float *ti |
|
|
|
int ElementGreaterEqual(float *input0, float *input1, float *output, int element_size) { |
|
|
|
int block_mod = element_size % C4NUM; |
|
|
|
int block_c4 = element_size - block_mod; |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vtrue = {1, 1, 1, 1}; |
|
|
|
float32x4_t vfalse = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
for (int index = 0; index < block_c4; index += C4NUM) { |
|
|
|
#ifdef USE_NEON |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t vin0 = vld1q_f32(input0); |
|
|
|
float32x4_t vin1 = vld1q_f32(input1); |
|
|
|
float32x4_t vout = vbslq_f32(vcgeq_fp32(vin0, vin1), vtrue, vfalse); |
|
|
|
float32x4_t vout = vbslq_f32(vcgeq_f32(vin0, vin1), vtrue, vfalse); |
|
|
|
vst1q_f32(output, vout); |
|
|
|
#else |
|
|
|
output[0] = (float)(input0[0] >= input1[0]); |
|
|
|
@@ -523,4 +757,3 @@ int BroadcastGreaterEqual(float *input0, float *input1, float *tile_input0, floa |
|
|
|
TileDimensions(input0, input1, tile_input0, tile_input1, param); |
|
|
|
return ElementGreaterEqual(tile_input0, tile_input1, output, element_size); |
|
|
|
} |
|
|
|
|