From c34e30590253bd8701cb67edd991bd2e17e82267 Mon Sep 17 00:00:00 2001
From: jasonZhang
Date: Sat, 7 May 2022 12:11:01 +0800
Subject: [PATCH] Add SSE&AVX optimized for tan (#3765)

---
// Reviewer notes (free-form text in this section is not part of the diff):
//
// * Adds three vector tangent helpers, one per ISA header:
//     tan512_ps (avx512_mathfun.h), tan256_ps (avx_mathfun.h), tan_ps (sse_mathfun.h).
//   Each one calls the matching sincos helper, compares the cosine lanes
//   against zero, adds eps = 1E-8f only to the lanes that compared equal,
//   and returns sin / cos. Lanes whose cosine is non-zero are divided unchanged.
//
// * unaryop_x86.cpp: the three unary_op_tan::operator() overloads no longer
//   spill the vector to a temporary float array and call scalar tan() per
//   element; they forward to the helpers above.
//
// * NOTE(review): the AVX and AVX-512 helpers compare with _CMP_EQ_OS, while
//   the SSE helper uses _mm_cmpeq_ps. These presumably differ only in
//   whether a NaN operand signals; consider _CMP_EQ_OQ for consistency.
//   TODO confirm against the intrinsics reference before changing.
//
 src/layer/x86/avx512_mathfun.h | 11 +++++++++++
 src/layer/x86/avx_mathfun.h    | 12 ++++++++++++
 src/layer/x86/sse_mathfun.h    | 12 ++++++++++++
 src/layer/x86/unaryop_x86.cpp  | 29 +++--------------------------
 4 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h
index d7c756a92..2892e3d2b 100644
--- a/src/layer/x86/avx512_mathfun.h
+++ b/src/layer/x86/avx512_mathfun.h
@@ -446,6 +446,17 @@ static NCNN_FORCEINLINE void sincos512_ps(__m512 x, __m512* s, __m512* c)
     *c = _mm512_xor_ps(xmm2, sign_bit_cos);
 }
 
+static NCNN_FORCEINLINE __m512 tan512_ps(__m512 x)
+{
+    __m512 ysin, ycos;
+    __m512 eps = _mm512_set1_ps(1E-8f);
+    sincos512_ps(x, &ysin, &ycos);
+    __mmask16 mask = _mm512_cmp_ps_mask(ycos, _mm512_setzero_ps(), _CMP_EQ_OS);
+    ycos = _mm512_mask_add_ps(ycos, mask, ycos, eps);
+    __m512 ytan = _mm512_div_ps(ysin, ycos);
+    return ytan;
+}
+
 static NCNN_FORCEINLINE __m512 pow512_ps(__m512 a, __m512 b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
index 6dd6794e9..db2869134 100644
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -691,6 +691,18 @@ static NCNN_FORCEINLINE void sincos256_ps(__m256 x, __m256* s, __m256* c)
     *c = _mm256_xor_ps(xmm2, sign_bit_cos);
 }
 
+static NCNN_FORCEINLINE __m256 tan256_ps(__m256 x)
+{
+    __m256 ysin, ycos;
+    __m256 eps = _mm256_set1_ps(1E-8f);
+    sincos256_ps(x, &ysin, &ycos);
+    __m256 mask = _mm256_cmp_ps(ycos, _mm256_setzero_ps(), _CMP_EQ_OS);
+    __m256 _tmp = _mm256_and_ps(eps, mask);
+    ycos = _mm256_add_ps(ycos, _tmp);
+    __m256 ytan = _mm256_div_ps(ysin, ycos);
+    return ytan;
+}
+
 static NCNN_FORCEINLINE __m256 pow256_ps(__m256 a, __m256 b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index 377cdf42c..764e33e79 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -679,6 +679,18 @@ static NCNN_FORCEINLINE void sincos_ps(v4sf x, v4sf* s, v4sf* c)
     *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }
 
+static NCNN_FORCEINLINE __m128 tan_ps(__m128 x)
+{
+    __m128 ysin, ycos;
+    __m128 eps = _mm_set1_ps(1E-8f);
+    sincos_ps(x, &ysin, &ycos);
+    __m128 mask = _mm_cmpeq_ps(ycos, _mm_setzero_ps());
+    __m128 _tmp = _mm_and_ps(eps, mask);
+    ycos = _mm_add_ps(ycos, _tmp);
+    __m128 ytan = _mm_div_ps(ysin, ycos);
+    return ytan;
+}
+
 static NCNN_FORCEINLINE __m128 pow_ps(__m128 a, __m128 b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp
index 78161b701..2dd431f7e 100644
--- a/src/layer/x86/unaryop_x86.cpp
+++ b/src/layer/x86/unaryop_x86.cpp
@@ -471,40 +471,17 @@ struct unary_op_tan
 #if __SSE2__
     __m128 operator()(const __m128& x) const
     {
-        //TODO sse optimize
-        float tmp[4];
-        _mm_storeu_ps(tmp, x);
-        tmp[0] = tan(tmp[0]);
-        tmp[1] = tan(tmp[1]);
-        tmp[2] = tan(tmp[2]);
-        tmp[3] = tan(tmp[3]);
-        return _mm_loadu_ps(tmp);
+        return tan_ps(x);
     }
 #if __AVX__
     __m256 operator()(const __m256& x) const
     {
-        //TODO avx optimize
-        float tmp[8];
-        _mm256_storeu_ps(tmp, x);
-        tmp[0] = tan(tmp[0]);
-        tmp[1] = tan(tmp[1]);
-        tmp[2] = tan(tmp[2]);
-        tmp[3] = tan(tmp[3]);
-        tmp[4] = tan(tmp[4]);
-        tmp[5] = tan(tmp[5]);
-        tmp[6] = tan(tmp[6]);
-        tmp[7] = tan(tmp[7]);
-        return _mm256_loadu_ps(tmp);
+        return tan256_ps(x);
     }
 #if __AVX512F__
     __m512 operator()(const __m512& x) const
     {
-        //TODO avx512 optimize
-        float tmp[16];
-        _mm512_storeu_ps(tmp, x);
-        for (int i = 0; i < 16; i++)
-            tmp[i] = tan(tmp[i]);
-        return _mm512_loadu_ps(tmp);
+        return tan512_ps(x);
     }
 #endif // __AVX512F__
 #endif // __AVX__