diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index e1cc70b70..b5e47bdbe 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -847,4 +847,13 @@ static NCNN_FORCEINLINE __m512 atan2512_ps(__m512 y, __m512 x) return _mm512_mask_mov_ps(special_result, normal_mode, normal_result); } +static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x) +{ + // -0.0f has only the sign bit set, so it serves as the sign-bit mask. + const __m512 magic_negative_zero = _mm512_set1_ps(-0.0f); + + // return (~magic_negative_zero & x), done in the integer domain: _mm512_andnot_ps needs AVX512DQ, but callers such as unaryop_x86.cpp guard on __AVX512F__ only. + return _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(magic_negative_zero), _mm512_castps_si512(x))); +} + #endif // AVX512_MATHFUN_H diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index 19cb40f6d..65c34efc2 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -1078,4 +1078,13 @@ static NCNN_FORCEINLINE __m256 atan2256_ps(__m256 y, __m256 x) _mm256_andnot_ps(normal_mode, special_result)); } +static NCNN_FORCEINLINE __m256 abs256_ps(__m256 x) +{ + // -0.0f has only the sign bit set, so it serves as the sign-bit mask. + const __m256 magic_negative_zero = _mm256_set1_ps(-0.0f); + + // return (~magic_negative_zero & x); bitwise AND-NOT clears the sign bit. + return _mm256_andnot_ps(magic_negative_zero, x); +} + #endif // AVX_MATHFUN_H diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index fb4b22192..b7cecfb81 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -1148,4 +1148,13 @@ static NCNN_FORCEINLINE __m128 atan2_ps(__m128 y, __m128 x) _mm_andnot_ps(normal_mode, special_result)); } +static NCNN_FORCEINLINE __m128 abs_ps(__m128 inputs) +{ + // -0.0f has only the sign bit set, so it serves as the sign-bit mask. 
+ const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); + + // return (~magic_negative_zero & inputs); bitwise AND-NOT clears the sign bit. + return _mm_andnot_ps(magic_negative_zero, inputs); +} + #endif // SSE_MATHFUN_H diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index 551304808..9ba8c6c7a 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -106,17 +106,17 @@ struct unary_op_abs #if __SSE2__ __m128 func_pack4(const __m128& x) const { - return abs_sse(x); + return abs_ps(x); } #if __AVX__ __m256 func_pack8(const __m256& x) const { - return abs_avx(x); + return abs256_ps(x); } #if __AVX512F__ __m512 func_pack16(const __m512& x) const { - return abs_avx512(x); + return abs512_ps(x); } #endif // __AVX512F__ #endif // __AVX__ diff --git a/src/layer/x86/x86_activation.h b/src/layer/x86/x86_activation.h index 4299d4d46..b02b8ee9a 100644 --- a/src/layer/x86/x86_activation.h +++ b/src/layer/x86/x86_activation.h @@ -56,15 +56,6 @@ static NCNN_FORCEINLINE __m128 hardswish_sse(__m128 inputs, __m128 a, __m128 b) return _mm_mul_ps(b, inputs); } -static NCNN_FORCEINLINE __m128 abs_sse(__m128 inputs) -{ - // Use negative zero as the sign bit mask. 
- const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); - - // return (!magic_negative_zero && x); - return _mm_andnot_ps(magic_negative_zero, inputs); -} - static NCNN_FORCEINLINE __m128 lrelu_sse(__m128 inputs, float slope) { __m128 pos = _mm_max_ps(_mm_setzero_ps(), inputs); @@ -169,11 +160,6 @@ static NCNN_FORCEINLINE __m256 hardswish_avx(__m256 inputs, __m256 a, __m256 b) return _mm256_mul_ps(b, inputs); } -static NCNN_FORCEINLINE __m256 abs_avx(__m256 inputs) -{ - return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), inputs), inputs); -} - static NCNN_FORCEINLINE __m256 lrelu_avx(__m256 inputs, float slope) { __m256 pos = _mm256_max_ps(_mm256_setzero_ps(), inputs); @@ -273,11 +259,6 @@ static NCNN_FORCEINLINE __m512 hardswish_avx512(__m512 inputs, __m512 a, __m512 return _mm512_mul_ps(b, inputs); } -static NCNN_FORCEINLINE __m512 abs_avx512(__m512 inputs) -{ - return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(inputs), _mm512_set1_epi32(0x7fffffff))); -} - static NCNN_FORCEINLINE __m512 lrelu_avx512(__m512 inputs, float slope) { __mmask16 _is_negative = _mm512_cmp_ps_mask(inputs, _mm512_setzero_ps(), _CMP_LT_OQ);