Browse Source

move x86 abs_mathfun to x86_mathfun (#4659)

* move x86 abs_mathfun to x86_mathfun

* Improve the implementation of AVX and AVX512F abs.

* Unified naming rules

---------

Co-authored-by: MouriNaruto <Mouri_Naruto@Outlook.com>
tags/20230517
Yoh GitHub 3 years ago
parent
commit
bd15e32517
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 30 additions and 22 deletions
  1. +9
    -0
      src/layer/x86/avx512_mathfun.h
  2. +9
    -0
      src/layer/x86/avx_mathfun.h
  3. +9
    -0
      src/layer/x86/sse_mathfun.h
  4. +3
    -3
      src/layer/x86/unaryop_x86.cpp
  5. +0
    -19
      src/layer/x86/x86_activation.h

+ 9
- 0
src/layer/x86/avx512_mathfun.h View File

@@ -847,4 +847,13 @@ static NCNN_FORCEINLINE __m512 atan2512_ps(__m512 y, __m512 x)
return _mm512_mask_mov_ps(special_result, normal_mode, normal_result);
}

static NCNN_FORCEINLINE __m512 abs512_ps(__m512 x)
{
    // Element-wise |x| for the packed floats in x, computed by clearing each lane's sign bit.
    // -0.0f has only the sign bit set, so broadcasting it yields the sign-bit mask.
    const __m512 sign_bit_mask = _mm512_set1_ps(-0.0f);

    // return (~sign_bit_mask & x);
    // The integer and-not is used on purpose: _mm512_andnot_ps requires AVX512DQ,
    // while _mm512_andnot_epi32 and the casts only require AVX512F. The casts are
    // bit reinterpretations, so the result is bit-identical.
    return _mm512_castsi512_ps(_mm512_andnot_epi32(_mm512_castps_si512(sign_bit_mask), _mm512_castps_si512(x)));
}

#endif // AVX512_MATHFUN_H

+ 9
- 0
src/layer/x86/avx_mathfun.h View File

@@ -1078,4 +1078,13 @@ static NCNN_FORCEINLINE __m256 atan2256_ps(__m256 y, __m256 x)
_mm256_andnot_ps(normal_mode, special_result));
}

static NCNN_FORCEINLINE __m256 abs256_ps(__m256 x)
{
    // Element-wise |x| for the packed floats in x.
    // -0.0f has only the sign bit set; a bitwise and-not with it (~mask & x)
    // clears the sign bit of every lane and leaves all other bits untouched.
    return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), x);
}

#endif // AVX_MATHFUN_H

+ 9
- 0
src/layer/x86/sse_mathfun.h View File

@@ -1148,4 +1148,13 @@ static NCNN_FORCEINLINE __m128 atan2_ps(__m128 y, __m128 x)
_mm_andnot_ps(normal_mode, special_result));
}

static NCNN_FORCEINLINE __m128 abs_ps(__m128 inputs)
{
    // Broadcast -0.0f: the only bit set in each lane is the sign bit.
    const __m128 sign_bit_mask = _mm_set1_ps(-0.0f);

    // (~sign_bit_mask & inputs) clears each lane's sign bit, giving |inputs|.
    const __m128 magnitude = _mm_andnot_ps(sign_bit_mask, inputs);
    return magnitude;
}

#endif // SSE_MATHFUN_H

+ 3
- 3
src/layer/x86/unaryop_x86.cpp View File

@@ -106,17 +106,17 @@ struct unary_op_abs
#if __SSE2__
// Absolute value of a __m128 operand.
// NOTE(review): this diff view lists the removed call (abs_sse) directly above
// its replacement (abs_ps); presumably only one of the two returns exists in
// the real file. As written, the second return is unreachable.
__m128 func_pack4(const __m128& x) const
{
return abs_sse(x);
return abs_ps(x);
}
#if __AVX__
// Absolute value of a __m256 operand.
// NOTE(review): this diff view lists the removed call (abs_avx) directly above
// its replacement (abs256_ps); presumably only one of the two returns exists in
// the real file. As written, the second return is unreachable.
__m256 func_pack8(const __m256& x) const
{
return abs_avx(x);
return abs256_ps(x);
}
#if __AVX512F__
// Absolute value of a __m512 operand.
// NOTE(review): this diff view lists the removed call (abs_avx512) directly above
// its replacement (abs512_ps); presumably only one of the two returns exists in
// the real file. As written, the second return is unreachable.
__m512 func_pack16(const __m512& x) const
{
return abs_avx512(x);
return abs512_ps(x);
}
#endif // __AVX512F__
#endif // __AVX__


+ 0
- 19
src/layer/x86/x86_activation.h View File

@@ -56,15 +56,6 @@ static NCNN_FORCEINLINE __m128 hardswish_sse(__m128 inputs, __m128 a, __m128 b)
return _mm_mul_ps(b, inputs);
}

static NCNN_FORCEINLINE __m128 abs_sse(__m128 inputs)
{
    // |inputs| per lane: -0.0f carries only the sign bit, so a bitwise and-not
    // with it (~mask & inputs) clears that bit and keeps everything else.
    return _mm_andnot_ps(_mm_set_ps1(-0.0f), inputs);
}

static NCNN_FORCEINLINE __m128 lrelu_sse(__m128 inputs, float slope)
{
__m128 pos = _mm_max_ps(_mm_setzero_ps(), inputs);
@@ -169,11 +160,6 @@ static NCNN_FORCEINLINE __m256 hardswish_avx(__m256 inputs, __m256 a, __m256 b)
return _mm256_mul_ps(b, inputs);
}

static NCNN_FORCEINLINE __m256 abs_avx(__m256 inputs)
{
    // Element-wise |inputs| for the packed floats in inputs.
    // The sign bit is cleared directly instead of computing max(0 - inputs, inputs):
    // the max returns its second operand when both operands are zero (or either is NaN),
    // so that form left -0.0f as -0.0f and kept the sign of a NaN.
    const __m256 sign_bit_mask = _mm256_set1_ps(-0.0f);

    // (~sign_bit_mask & inputs)
    return _mm256_andnot_ps(sign_bit_mask, inputs);
}

static NCNN_FORCEINLINE __m256 lrelu_avx(__m256 inputs, float slope)
{
__m256 pos = _mm256_max_ps(_mm256_setzero_ps(), inputs);
@@ -273,11 +259,6 @@ static NCNN_FORCEINLINE __m512 hardswish_avx512(__m512 inputs, __m512 a, __m512
return _mm512_mul_ps(b, inputs);
}

static NCNN_FORCEINLINE __m512 abs_avx512(__m512 inputs)
{
    // 0x7fffffff keeps every bit of a 32-bit lane except the top (sign) bit.
    const __m512i keep_magnitude = _mm512_set1_epi32(0x7fffffff);

    // Reinterpret the floats as 32-bit integers, mask off the sign bit, and reinterpret back.
    const __m512i raw_bits = _mm512_castps_si512(inputs);
    return _mm512_castsi512_ps(_mm512_and_epi32(raw_bits, keep_magnitude));
}

static NCNN_FORCEINLINE __m512 lrelu_avx512(__m512 inputs, float slope)
{
__mmask16 _is_negative = _mm512_cmp_ps_mask(inputs, _mm512_setzero_ps(), _CMP_LT_OQ);


Loading…
Cancel
Save