|
|
|
@@ -31,99 +31,57 @@ int8_t MinInt8(int8_t a, int8_t b) { return b ^ ((a ^ b) & -(a < b)); } |
|
|
|
int8_t MaxInt8(int8_t a, int8_t b) { return a ^ ((a ^ b) & -(a < b)); } |
|
|
|
|
|
|
|
void ReluFp32(float *data, float *dst, int ele_num) { |
|
|
|
int four_block = UP_DIV(ele_num, C4NUM); |
|
|
|
for (int i = 0; i < four_block - 1; i++) { |
|
|
|
int index = i * C4NUM; |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t relu_data = vld1q_f32(data + index); |
|
|
|
float32x4_t zero_data = vdupq_n_f32(0); |
|
|
|
relu_data = vmaxq_f32(relu_data, zero_data); |
|
|
|
vst1q_f32(dst + index, relu_data); |
|
|
|
#else |
|
|
|
data[index] = data[index] < 0 ? 0 : data[index]; |
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1]; |
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2]; |
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3]; |
|
|
|
#endif |
|
|
|
} |
|
|
|
for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) { |
|
|
|
data[j] = data[j] < 0 ? 0 : data[j]; |
|
|
|
int index = 0; |
|
|
|
#ifdef ENABLE_AVX |
|
|
|
int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM; |
|
|
|
for (; index < c8_block; index += C8NUM) { |
|
|
|
MS_FLOAT32X8 relu_data = MS_LD256_F32(data + index); |
|
|
|
MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f); |
|
|
|
relu_data = MS_MAX256_F32(relu_data, zero_data); |
|
|
|
MS_ST256_F32(dst + index, relu_data); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void Relu6Fp32(float *data, float *dst, int ele_num) { |
|
|
|
int four_block = UP_DIV(ele_num, C4NUM); |
|
|
|
for (int i = 0; i < four_block - 1; i++) { |
|
|
|
int index = i * C4NUM; |
|
|
|
#ifdef ENABLE_NEON |
|
|
|
float32x4_t relu6_data = vld1q_f32(data + index); |
|
|
|
float32x4_t zero_data = vdupq_n_f32(0); |
|
|
|
float32x4_t six_data = vdupq_n_f32(6); |
|
|
|
relu6_data = vmaxq_f32(relu6_data, zero_data); |
|
|
|
relu6_data = vminq_f32(relu6_data, six_data); |
|
|
|
vst1q_f32(dst + index, relu6_data); |
|
|
|
#else |
|
|
|
data[index] = data[index] < 0 ? 0 : data[index]; |
|
|
|
data[index] = data[index] > 6 ? 6 : data[index]; |
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1]; |
|
|
|
data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1]; |
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2]; |
|
|
|
data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2]; |
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3]; |
|
|
|
data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3]; |
|
|
|
#endif |
|
|
|
#if defined(ENABLE_NEON) || defined(ENABLE_SSE) |
|
|
|
int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM; |
|
|
|
for (; index < c4_block; index += C4NUM) { |
|
|
|
MS_FLOAT32X4 relu_data = MS_LDQ_F32(data + index); |
|
|
|
MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f); |
|
|
|
relu_data = MS_MAXQ_F32(relu_data, zero_data); |
|
|
|
MS_STQ_F32(dst + index, relu_data); |
|
|
|
} |
|
|
|
for (int j = (four_block - 1) * C4NUM; j < ele_num; ++j) { |
|
|
|
data[j] = data[j] < 0 ? 0 : data[j]; |
|
|
|
data[j] = data[j] > 6 ? 6 : data[j]; |
|
|
|
#endif |
|
|
|
for (; index < ele_num; ++index) { |
|
|
|
data[index] = data[index] < 0.0f ? 0.0f : data[index]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void Relu6Fp32(float *data, float *dst, int ele_num) { |
|
|
|
int index = 0; |
|
|
|
#ifdef ENABLE_AVX |
|
|
|
#ifdef WIN32 |
|
|
|
void ReluFp32C8(float *data, float *dst, int ele_num) { |
|
|
|
int four_block = UP_DIV(ele_num, C8NUM); |
|
|
|
for (int i = 0; i < four_block - 1; i++) { |
|
|
|
int index = i * C8NUM; |
|
|
|
data[index] = data[index] < 0 ? 0 : data[index]; |
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1]; |
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2]; |
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3]; |
|
|
|
data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4]; |
|
|
|
data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5]; |
|
|
|
data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6]; |
|
|
|
data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7]; |
|
|
|
} |
|
|
|
for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) { |
|
|
|
data[j] = data[j] < 0 ? 0 : data[j]; |
|
|
|
int c8_block = DOWN_DIV(ele_num, C8NUM) * C8NUM; |
|
|
|
for (; index < c8_block; index += C8NUM) { |
|
|
|
MS_FLOAT32X8 relu6_data = MS_LD256_F32(data + index); |
|
|
|
MS_FLOAT32X8 zero_data = MS_MOV256_F32(0.0f); |
|
|
|
MS_FLOAT32X8 six_data = MS_MOV256_F32(6.0f); |
|
|
|
relu6_data = MS_MAX256_F32(relu6_data, zero_data); |
|
|
|
relu6_data = MS_MIN256_F32(relu6_data, six_data); |
|
|
|
MS_ST256_F32(dst + index, relu6_data); |
|
|
|
} |
|
|
|
} |
|
|
|
#endif |
|
|
|
|
|
|
|
void Relu6Fp32C8(float *data, float *dst, int ele_num) { |
|
|
|
int four_block = UP_DIV(ele_num, C8NUM); |
|
|
|
for (int i = 0; i < four_block - 1; i++) { |
|
|
|
int index = i * C8NUM; |
|
|
|
data[index] = data[index] < 0 ? 0 : data[index]; |
|
|
|
data[index] = data[index] > 6 ? 6 : data[index]; |
|
|
|
data[index + 1] = data[index + 1] < 0 ? 0 : data[index + 1]; |
|
|
|
data[index + 1] = data[index + 1] > 6 ? 6 : data[index + 1]; |
|
|
|
data[index + 2] = data[index + 2] < 0 ? 0 : data[index + 2]; |
|
|
|
data[index + 2] = data[index + 2] > 6 ? 6 : data[index + 2]; |
|
|
|
data[index + 3] = data[index + 3] < 0 ? 0 : data[index + 3]; |
|
|
|
data[index + 3] = data[index + 3] > 6 ? 6 : data[index + 3]; |
|
|
|
data[index + 4] = data[index + 4] < 0 ? 0 : data[index + 4]; |
|
|
|
data[index + 4] = data[index + 4] > 6 ? 6 : data[index + 4]; |
|
|
|
data[index + 5] = data[index + 5] < 0 ? 0 : data[index + 5]; |
|
|
|
data[index + 5] = data[index + 5] > 6 ? 6 : data[index + 5]; |
|
|
|
data[index + 6] = data[index + 6] < 0 ? 0 : data[index + 6]; |
|
|
|
data[index + 6] = data[index + 6] > 6 ? 6 : data[index + 6]; |
|
|
|
data[index + 7] = data[index + 7] < 0 ? 0 : data[index + 7]; |
|
|
|
data[index + 7] = data[index + 7] > 6 ? 6 : data[index + 7]; |
|
|
|
#if defined(ENABLE_NEON) || defined(ENABLE_SSE) |
|
|
|
int c4_block = DOWN_DIV(ele_num, C4NUM) * C4NUM; |
|
|
|
for (; index < c4_block; index += C4NUM) { |
|
|
|
MS_FLOAT32X4 relu6_data = MS_LDQ_F32(data + index); |
|
|
|
MS_FLOAT32X4 zero_data = MS_MOVQ_F32(0.0f); |
|
|
|
MS_FLOAT32X4 six_data = MS_MOVQ_F32(6.0f); |
|
|
|
relu6_data = MS_MAXQ_F32(relu6_data, zero_data); |
|
|
|
relu6_data = MS_MINQ_F32(relu6_data, six_data); |
|
|
|
MS_STQ_F32(dst + index, relu6_data); |
|
|
|
} |
|
|
|
for (int j = (four_block - 1) * C8NUM; j < ele_num; ++j) { |
|
|
|
data[j] = data[j] < 0 ? 0 : data[j]; |
|
|
|
data[j] = data[j] > 6 ? 6 : data[j]; |
|
|
|
#endif |
|
|
|
for (; index < ele_num; ++index) { |
|
|
|
data[index] = data[index] < 0.0f ? 0.0f : data[index]; |
|
|
|
data[index] = data[index] > 6.0f ? 6.0f : data[index]; |
|
|
|
} |
|
|
|
} |
|
|
|
#endif |
|
|
|
#endif |