Browse Source

move floor and ceil sse_function from unaryOp to sse_mathfun (#4566)

tags/20230517
Yoh GitHub 3 years ago
parent
commit
7573faae52
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 86 additions and 76 deletions
  1. +84
    -0
      src/layer/x86/sse_mathfun.h
  2. +2
    -76
      src/layer/x86/unaryop_x86.cpp

+ 84
- 0
src/layer/x86/sse_mathfun.h View File

@@ -752,4 +752,88 @@ static NCNN_FORCEINLINE __m128 atan2_ps(__m128 a, __m128 b)
return _mm_loadu_ps(tmpx);
}

// Round each of the four packed floats in x up to the nearest integral value.
//
// With SSE4.1 this is a single _mm_ceil_ps. Otherwise it is emulated with
// SSE/SSE2 operations: truncate the magnitude toward zero, restore the sign,
// then add one to positive lanes that lost a fractional part.
//
// Lanes that are NaN, infinite, or whose magnitude is at least 2^23 are
// returned unchanged: such finite values are already integral, and the
// float -> int32 conversion used below cannot represent NaN or huge values.
static NCNN_FORCEINLINE __m128 ceil_ps(__m128 x)
{
#if __SSE4_1__
    return _mm_ceil_ps(x);
#else
    // Negative zero has only the sign bit set, so it doubles as a sign mask.
    const __m128 magic_negative_zero = _mm_set_ps1(-0.0f);

    // 2^23: from this magnitude upwards a float has no fractional part.
    const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f);

    // absolute = abs(x);
    __m128 absolute = _mm_andnot_ps(magic_negative_zero, x);

    // negative_mask = sign bit of x;
    __m128 negative_mask = _mm_and_ps(magic_negative_zero, x);

    // no_fraction = !(absolute < magic_smallest_no_fraction);
    // The negated comparison is also true for NaN lanes, so NaN is passed
    // through instead of going through the integer conversion.
    __m128 no_fraction = _mm_cmpnlt_ps(absolute, magic_smallest_no_fraction);

    // truncated = static_cast<float>(static_cast<int32_t>(absolute));
    __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute));

    // truncated_with_sign = truncated with the sign bit of x put back;
    __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask);

    // positive_fix = ((x > -0.0f) && (x > truncated_with_sign)) ? -1.0f : 0.0f;
    __m128 positive_fix = _mm_and_ps(
                              _mm_and_ps(
                                  _mm_cmpgt_ps(x, magic_negative_zero),
                                  _mm_cmpgt_ps(x, truncated_with_sign)),
                              _mm_set_ps1(-1.0f));

    // fixed_result = truncated_with_sign - positive_fix;
    // Subtracting -1.0f bumps the lanes that were truncated downwards.
    __m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix);

    // return no_fraction ? x : fixed_result;
    return _mm_or_ps(
               _mm_and_ps(x, no_fraction),
               _mm_andnot_ps(no_fraction, fixed_result));
#endif // __SSE4_1__
}

// Round each of the four packed floats in x down to the nearest integral value.
//
// With SSE4.1 this is a single _mm_floor_ps. Otherwise it is emulated with
// SSE/SSE2 operations: truncate the magnitude toward zero, restore the sign,
// then subtract one from lanes where truncation moved the value upwards
// (negative inputs with a fractional part).
//
// Lanes that are NaN, infinite, or whose magnitude is at least 2^23 are
// returned unchanged: such finite values are already integral, and the
// float -> int32 conversion used below cannot represent NaN or huge values.
static NCNN_FORCEINLINE __m128 floor_ps(__m128 x)
{
#if __SSE4_1__
    return _mm_floor_ps(x);
#else
    // Negative zero has only the sign bit set, so it doubles as a sign mask.
    const __m128 magic_negative_zero = _mm_set_ps1(-0.0f);

    // 2^23: from this magnitude upwards a float has no fractional part.
    const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f);

    // absolute = abs(x);
    __m128 absolute = _mm_andnot_ps(magic_negative_zero, x);

    // negative_mask = sign bit of x;
    __m128 negative_mask = _mm_and_ps(magic_negative_zero, x);

    // no_fraction = !(absolute < magic_smallest_no_fraction);
    // The negated comparison is also true for NaN lanes, so NaN is passed
    // through instead of going through the integer conversion.
    __m128 no_fraction = _mm_cmpnlt_ps(absolute, magic_smallest_no_fraction);

    // truncated = static_cast<float>(static_cast<int32_t>(absolute));
    __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute));

    // truncated_with_sign = truncated with the sign bit of x put back;
    __m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask);

    // negative_fix = (x < truncated_with_sign) ? 1.0f : 0.0f;
    __m128 negative_fix = _mm_and_ps(
                              _mm_cmplt_ps(x, truncated_with_sign),
                              _mm_set_ps1(1.0f));

    // fixed_result = truncated_with_sign - negative_fix;
    __m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix);

    // return no_fraction ? x : fixed_result;
    return _mm_or_ps(
               _mm_and_ps(x, no_fraction),
               _mm_andnot_ps(no_fraction, fixed_result));
#endif // __SSE4_1__
}

#endif // SSE_MATHFUN_H

+ 2
- 76
src/layer/x86/unaryop_x86.cpp View File

@@ -158,43 +158,7 @@ struct unary_op_floor
#if __SSE2__
// Applies floor to the four packed floats in x and returns the packed result.
// NOTE(review): this span looks like a rendered diff with the +/- markers
// stripped -- the inline body below appears to be the removed implementation
// and the final `return floor_ps(x);` the added replacement. Confirm against
// the repository before treating this text as compilable source.
__m128 func_pack4(const __m128& x) const
{
#if __SSE4_1__
return _mm_floor_ps(x);
#endif // __SSE4_1__

// Use negative zero as the sign bit mask.
const __m128 magic_negative_zero = _mm_set_ps1(-0.0f);

// The smallest float magnitude that has no fractional part. (2^23)
const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f);

// absolute = abs(x);
__m128 absolute = _mm_andnot_ps(magic_negative_zero, x);

// negative_mask = magic_negative_zero & x;  (sign bit of x)
__m128 negative_mask = _mm_and_ps(magic_negative_zero, x);

// no_fraction = (magic_smallest_no_fraction < absolute);
__m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute);

// truncated = static_cast<float>(static_cast<int32_t>(absolute));
__m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute));

// truncated_with_sign = (truncated | negative_mask);
__m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask);

// negative_fix = ((x < truncated_with_sign) ? 1.0f : 0.0f);
__m128 negative_fix = _mm_and_ps(
_mm_cmplt_ps(x, truncated_with_sign),
_mm_set_ps1(1.0f));

// fixed_result = truncated_with_sign - negative_fix;
__m128 fixed_result = _mm_sub_ps(truncated_with_sign, negative_fix);

// return ((x & no_fraction) | (~no_fraction & fixed_result));
return _mm_or_ps(
_mm_and_ps(x, no_fraction),
_mm_andnot_ps(no_fraction, fixed_result));
// As written, the statement above always returns, so this call is never
// reached; it delegates the same operation to the shared floor_ps helper.
return floor_ps(x);
}
#if __AVX__
__m256 func_pack8(const __m256& x) const
@@ -220,45 +184,7 @@ struct unary_op_ceil
#if __SSE2__
// Applies ceil to the four packed floats in x and returns the packed result.
// NOTE(review): this span looks like a rendered diff with the +/- markers
// stripped -- the inline body below appears to be the removed implementation
// and the final `return ceil_ps(x);` the added replacement. Confirm against
// the repository before treating this text as compilable source.
__m128 func_pack4(const __m128& x) const
{
#if __SSE4_1__
return _mm_ceil_ps(x);
#endif // __SSE4_1__

// Use negative zero as the sign bit mask.
const __m128 magic_negative_zero = _mm_set_ps1(-0.0f);

// The smallest float magnitude that has no fractional part. (2^23)
const __m128 magic_smallest_no_fraction = _mm_set_ps1(8388608.0f);

// absolute = abs(x);
__m128 absolute = _mm_andnot_ps(magic_negative_zero, x);

// negative_mask = magic_negative_zero & x;  (sign bit of x)
__m128 negative_mask = _mm_and_ps(magic_negative_zero, x);

// no_fraction = (magic_smallest_no_fraction < absolute);
__m128 no_fraction = _mm_cmplt_ps(magic_smallest_no_fraction, absolute);

// truncated = static_cast<float>(static_cast<int32_t>(absolute));
__m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(absolute));

// truncated_with_sign = (truncated | negative_mask);
__m128 truncated_with_sign = _mm_or_ps(truncated, negative_mask);

// positive_fix = ((x > -0.0f) && (x > truncated_with_sign) ? -1.0f : 0.0f);
__m128 positive_fix = _mm_and_ps(
_mm_and_ps(
_mm_cmpgt_ps(x, magic_negative_zero),
_mm_cmpgt_ps(x, truncated_with_sign)),
_mm_set_ps1(-1.0f));

// fixed_result = truncated_with_sign - positive_fix;
// (subtracting -1.0f adds one to the lanes selected above)
__m128 fixed_result = _mm_sub_ps(truncated_with_sign, positive_fix);

// return ((x & no_fraction) | (~no_fraction & fixed_result));
return _mm_or_ps(
_mm_and_ps(x, no_fraction),
_mm_andnot_ps(no_fraction, fixed_result));
// As written, the statement above always returns, so this call is never
// reached; it delegates the same operation to the shared ceil_ps helper.
return ceil_ps(x);
}
#if __AVX__
__m256 func_pack8(const __m256& x) const


Loading…
Cancel
Save