|
|
|
@@ -836,4 +836,89 @@ static NCNN_FORCEINLINE __m128 floor_ps(__m128 x) |
|
|
|
_mm_andnot_ps(no_fraction, fixed_result)); |
|
|
|
} |
|
|
|
|
|
|
|
static NCNN_FORCEINLINE __m128 asin_ps(__m128 x) |
|
|
|
{ |
|
|
|
const __m128 magic_negative_zero = _mm_set_ps1(-0.0f); |
|
|
|
const __m128 magic_half_one = _mm_set_ps1(0.5f); |
|
|
|
const __m128 magic_one = _mm_set_ps1(1.0f); |
|
|
|
const __m128 magic_a4 = _mm_set_ps1(0.023994016f); |
|
|
|
const __m128 magic_a5 = _mm_set_ps1(0.042417344f); |
|
|
|
const __m128 magic_a2 = _mm_set_ps1(0.07494697f); |
|
|
|
const __m128 magic_a3 = _mm_set_ps1(0.045520633f); |
|
|
|
const __m128 magic_a0 = _mm_set_ps1(1.0f); |
|
|
|
const __m128 magic_a1 = _mm_set_ps1(0.166667819f); |
|
|
|
const __m128 magic_half_pi = _mm_set_ps1(1.5707964f); |
|
|
|
const __m128 magic_three = _mm_set_ps1(3.0f); |
|
|
|
|
|
|
|
// negative_mask = magic_negative_zero && x; |
|
|
|
__m128 negative_mask = _mm_and_ps(magic_negative_zero, x); |
|
|
|
|
|
|
|
// absolute = abs(x); |
|
|
|
__m128 absolute = _mm_andnot_ps(magic_negative_zero, x); |
|
|
|
|
|
|
|
// Reference: https://en.wikipedia.org/wiki/Small-angle_approximation |
|
|
|
|
|
|
|
// is_small_input = (absolute <= 0.5f); |
|
|
|
__m128 is_small_input = _mm_cmple_ps(absolute, magic_half_one); |
|
|
|
|
|
|
|
// is_big_input = (!is_small_input); |
|
|
|
__m128 is_big_input = _mm_andnot_ps(is_small_input, magic_one); |
|
|
|
|
|
|
|
// big_input_approx = sqrt(0.5f * (1 - absolute)); |
|
|
|
__m128 big_input_approx = _mm_sqrt_ps(_mm_mul_ps( |
|
|
|
magic_half_one, |
|
|
|
_mm_sub_ps(magic_one, absolute))); |
|
|
|
|
|
|
|
// input_approx = (is_small_input ? absolute : big_input_approx); |
|
|
|
__m128 input_approx = _mm_or_ps( |
|
|
|
_mm_and_ps(is_small_input, absolute), |
|
|
|
_mm_andnot_ps(is_small_input, big_input_approx)); |
|
|
|
|
|
|
|
// square_of_input_approx = input_approx * input_approx; |
|
|
|
__m128 square_of_input_approx = _mm_mul_ps(input_approx, input_approx); |
|
|
|
|
|
|
|
// fourth_power_of_input_approx = |
|
|
|
// square_of_input_approx * square_of_input_approx; |
|
|
|
__m128 fourth_power_of_input_approx = _mm_mul_ps( |
|
|
|
square_of_input_approx, square_of_input_approx); |
|
|
|
|
|
|
|
// TODO: Need more explanations. |
|
|
|
// x1 = ((magic_a4 * fourth_power_of_input_approx) + magic_a2); |
|
|
|
// x2 = ((magic_a5 * fourth_power_of_input_approx) + magic_a3); |
|
|
|
// x3 = ((x1 * fourth_power_of_input_approx) + magic_a0); |
|
|
|
// x4 = ((fourth_power_of_input_approx * x2) + magic_a1); |
|
|
|
// output_approx = (x3 + (square_of_input_approx * x4)); |
|
|
|
__m128 output_approx = _mm_add_ps( |
|
|
|
_mm_add_ps( |
|
|
|
_mm_mul_ps( |
|
|
|
_mm_add_ps( |
|
|
|
_mm_mul_ps(magic_a4, fourth_power_of_input_approx), |
|
|
|
magic_a2), |
|
|
|
fourth_power_of_input_approx), |
|
|
|
magic_a0), |
|
|
|
_mm_mul_ps( |
|
|
|
square_of_input_approx, |
|
|
|
_mm_add_ps( |
|
|
|
_mm_mul_ps( |
|
|
|
fourth_power_of_input_approx, |
|
|
|
_mm_add_ps( |
|
|
|
_mm_mul_ps(magic_a5, fourth_power_of_input_approx), |
|
|
|
magic_a3)), |
|
|
|
magic_a1))); |
|
|
|
|
|
|
|
// TODO: Need more explanations. |
|
|
|
// x1 = ((0.5 * PI) * is_big_input); |
|
|
|
// x2 = (output_approx * input_approx); |
|
|
|
// x3 = (1.0f - (3.0f * is_big_input)); |
|
|
|
// final_approx = (x1 + (x2 * x3)); |
|
|
|
__m128 final_approx = _mm_add_ps( |
|
|
|
_mm_mul_ps(magic_half_pi, is_big_input), |
|
|
|
_mm_mul_ps( |
|
|
|
_mm_mul_ps(output_approx, input_approx), |
|
|
|
_mm_sub_ps(magic_one, _mm_mul_ps(magic_three, is_big_input)))); |
|
|
|
|
|
|
|
// return (final_approx || negative_mask); |
|
|
|
return _mm_or_ps(final_approx, negative_mask); |
|
|
|
} |
|
|
|
|
|
|
|
#endif // SSE_MATHFUN_H |