|
|
|
@@ -56,15 +56,6 @@ static NCNN_FORCEINLINE __m128 hardswish_sse(__m128 inputs, __m128 a, __m128 b) |
|
|
|
return _mm_mul_ps(b, inputs); |
|
|
|
} |
|
|
|
|
|
|
|
// Lane-wise absolute value of four packed floats.
// Works by clearing the sign bit (bit 31) of every lane.
static NCNN_FORCEINLINE __m128 abs_sse(__m128 inputs)
{
    // -0.0f has only the sign bit set, so broadcasting it gives a
    // per-lane sign-bit mask.
    const __m128 sign_bit_mask = _mm_set_ps1(-0.0f);

    // andnot computes (~sign_bit_mask) & inputs bitwise: every bit of the
    // input is kept except the sign bit.
    __m128 magnitude = _mm_andnot_ps(sign_bit_mask, inputs);
    return magnitude;
}
|
|
|
|
|
|
|
static NCNN_FORCEINLINE __m128 lrelu_sse(__m128 inputs, float slope) |
|
|
|
{ |
|
|
|
__m128 pos = _mm_max_ps(_mm_setzero_ps(), inputs); |
|
|
|
@@ -169,11 +160,6 @@ static NCNN_FORCEINLINE __m256 hardswish_avx(__m256 inputs, __m256 a, __m256 b) |
|
|
|
return _mm256_mul_ps(b, inputs); |
|
|
|
} |
|
|
|
|
|
|
|
// Lane-wise absolute value of eight packed floats.
// Works by clearing the sign bit (bit 31) of every lane.
static NCNN_FORCEINLINE __m256 abs_avx(__m256 inputs)
{
    // -0.0f has only the sign bit set, so broadcasting it gives a
    // per-lane sign-bit mask.
    const __m256 sign_bit_mask = _mm256_set1_ps(-0.0f);

    // andnot computes (~sign_bit_mask) & inputs bitwise. Masking is used
    // instead of max(0 - x, x) because the max instruction returns its
    // second operand when both operands are zero or when either is NaN,
    // which would leave -0.0f (and the sign of a NaN) unchanged.
    return _mm256_andnot_ps(sign_bit_mask, inputs);
}
|
|
|
|
|
|
|
static NCNN_FORCEINLINE __m256 lrelu_avx(__m256 inputs, float slope) |
|
|
|
{ |
|
|
|
__m256 pos = _mm256_max_ps(_mm256_setzero_ps(), inputs); |
|
|
|
@@ -273,11 +259,6 @@ static NCNN_FORCEINLINE __m512 hardswish_avx512(__m512 inputs, __m512 a, __m512 |
|
|
|
return _mm512_mul_ps(b, inputs); |
|
|
|
} |
|
|
|
|
|
|
|
// Lane-wise absolute value of sixteen packed floats.
// Works by clearing the sign bit (bit 31) of every lane.
static NCNN_FORCEINLINE __m512 abs_avx512(__m512 inputs)
{
    // 0x7fffffff keeps every bit of a 32-bit lane except the sign bit.
    const __m512i keep_magnitude_mask = _mm512_set1_epi32(0x7fffffff);

    // Reinterpret the float lanes as 32-bit integers so the mask can be
    // applied with an integer AND, then reinterpret the result as floats.
    const __m512i raw_bits = _mm512_castps_si512(inputs);
    const __m512i magnitude_bits = _mm512_and_epi32(raw_bits, keep_magnitude_mask);
    return _mm512_castsi512_ps(magnitude_bits);
}
|
|
|
|
|
|
|
static NCNN_FORCEINLINE __m512 lrelu_avx512(__m512 inputs, float slope) |
|
|
|
{ |
|
|
|
__mmask16 _is_negative = _mm512_cmp_ps_mask(inputs, _mm512_setzero_ps(), _CMP_LT_OQ); |
|
|
|
|