From c34e30590253bd8701cb67edd991bd2e17e82267 Mon Sep 17 00:00:00 2001
From: jasonZhang <zqhy_0929@163.com>
Date: Sat, 7 May 2022 12:11:01 +0800
Subject: [PATCH] Add SSE/AVX/AVX512 optimized tan (#3765)

---
 src/layer/x86/avx512_mathfun.h | 11 +++++++++++
 src/layer/x86/avx_mathfun.h    | 12 ++++++++++++
 src/layer/x86/sse_mathfun.h    | 12 ++++++++++++
 src/layer/x86/unaryop_x86.cpp  | 29 +++--------------------------
 4 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h
index d7c756a92..2892e3d2b 100644
--- a/src/layer/x86/avx512_mathfun.h
+++ b/src/layer/x86/avx512_mathfun.h
@@ -446,6 +446,17 @@ static NCNN_FORCEINLINE void sincos512_ps(__m512 x, __m512* s, __m512* c)
     *c = _mm512_xor_ps(xmm2, sign_bit_cos);
 }
 
+static NCNN_FORCEINLINE __m512 tan512_ps(__m512 x)
+{
+    // tan(x) = sin(x) / cos(x); lanes where cos(x) == 0 divide by 1e-8 instead.
+    __m512 ysin, ycos;
+    sincos512_ps(x, &ysin, &ycos);
+    // _CMP_EQ_OQ is the quiet equality predicate; _CMP_EQ_OS would signal on quiet NaN lanes.
+    __mmask16 zero_mask = _mm512_cmp_ps_mask(ycos, _mm512_setzero_ps(), _CMP_EQ_OQ);
+    ycos = _mm512_mask_add_ps(ycos, zero_mask, ycos, _mm512_set1_ps(1E-8f));
+    return _mm512_div_ps(ysin, ycos);
+}
+
 static NCNN_FORCEINLINE __m512 pow512_ps(__m512 a, __m512 b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h
index 6dd6794e9..db2869134 100644
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -691,6 +691,18 @@ static NCNN_FORCEINLINE void sincos256_ps(__m256 x, __m256* s, __m256* c)
     *c = _mm256_xor_ps(xmm2, sign_bit_cos);
 }
 
+static NCNN_FORCEINLINE __m256 tan256_ps(__m256 x)
+{
+    // tan(x) = sin(x) / cos(x); lanes where cos(x) == 0 divide by 1e-8 instead.
+    __m256 ysin, ycos;
+    sincos256_ps(x, &ysin, &ycos);
+    // _CMP_EQ_OQ is the quiet equality predicate; _CMP_EQ_OS would signal on quiet NaN lanes.
+    __m256 zero_mask = _mm256_cmp_ps(ycos, _mm256_setzero_ps(), _CMP_EQ_OQ);
+    // The all-ones mask keeps eps only in zero-cos lanes; every other lane adds +0.0f.
+    ycos = _mm256_add_ps(ycos, _mm256_and_ps(_mm256_set1_ps(1E-8f), zero_mask));
+    return _mm256_div_ps(ysin, ycos);
+}
+
 static NCNN_FORCEINLINE __m256 pow256_ps(__m256 a, __m256 b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h
index 377cdf42c..764e33e79 100644
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -679,6 +679,18 @@ static NCNN_FORCEINLINE void sincos_ps(v4sf x, v4sf* s, v4sf* c)
     *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }
 
+static NCNN_FORCEINLINE __m128 tan_ps(__m128 x)
+{
+    // tan(x) = sin(x) / cos(x); lanes where cos(x) == 0 divide by 1e-8 instead.
+    __m128 sin_x, cos_x;
+    sincos_ps(x, &sin_x, &cos_x);
+    // Per-lane all-ones where cos(x) compares equal to zero, all-zeros elsewhere.
+    __m128 is_zero = _mm_cmpeq_ps(cos_x, _mm_setzero_ps());
+    // The mask keeps eps only in zero-cos lanes; every other lane adds +0.0f.
+    __m128 denom = _mm_add_ps(cos_x, _mm_and_ps(is_zero, _mm_set1_ps(1E-8f)));
+    return _mm_div_ps(sin_x, denom);
+}
+
 static NCNN_FORCEINLINE __m128 pow_ps(__m128 a, __m128 b)
 {
     // pow(x, m) = exp(m * log(x))
diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp
index 78161b701..2dd431f7e 100644
--- a/src/layer/x86/unaryop_x86.cpp
+++ b/src/layer/x86/unaryop_x86.cpp
@@ -471,40 +471,17 @@ struct unary_op_tan
 #if __SSE2__
     __m128 operator()(const __m128& x) const
     {
-        //TODO sse optimize
-        float tmp[4];
-        _mm_storeu_ps(tmp, x);
-        tmp[0] = tan(tmp[0]);
-        tmp[1] = tan(tmp[1]);
-        tmp[2] = tan(tmp[2]);
-        tmp[3] = tan(tmp[3]);
-        return _mm_loadu_ps(tmp);
+        return tan_ps(x); // 4-lane SIMD tan from sse_mathfun.h
     }
 #if __AVX__
     __m256 operator()(const __m256& x) const
     {
-        //TODO avx optimize
-        float tmp[8];
-        _mm256_storeu_ps(tmp, x);
-        tmp[0] = tan(tmp[0]);
-        tmp[1] = tan(tmp[1]);
-        tmp[2] = tan(tmp[2]);
-        tmp[3] = tan(tmp[3]);
-        tmp[4] = tan(tmp[4]);
-        tmp[5] = tan(tmp[5]);
-        tmp[6] = tan(tmp[6]);
-        tmp[7] = tan(tmp[7]);
-        return _mm256_loadu_ps(tmp);
+        return tan256_ps(x); // 8-lane SIMD tan from avx_mathfun.h
     }
 #if __AVX512F__
     __m512 operator()(const __m512& x) const
     {
-        //TODO avx512 optimize
-        float tmp[16];
-        _mm512_storeu_ps(tmp, x);
-        for (int i = 0; i < 16; i++)
-            tmp[i] = tan(tmp[i]);
-        return _mm512_loadu_ps(tmp);
+        return tan512_ps(x); // 16-lane SIMD tan from avx512_mathfun.h
     }
 #endif // __AVX512F__
 #endif // __AVX__