Add SSE & AVX optimized implementations for tan (#3765)

4 years ago · c34e305902
--- a/src/layer/x86/avx512_mathfun.h
+++ b/src/layer/x86/avx512_mathfun.h
@@ -446,6 +446,17 @@ static NCNN_FORCEINLINE void sincos512_ps(__m512 x, __m512* s, __m512* c)
    *c = _mm512_xor_ps(xmm2, sign_bit_cos);
 }

 static NCNN_FORCEINLINE __m512 tan512_ps(__m512 x)
 {
    // Returns the per-lane quotient of the s and c outputs of sincos512_ps(x).
    __m512 sin_v, cos_v;
    sincos512_ps(x, &sin_v, &cos_v);

    // Lanes whose c output compares equal to zero get 1E-8f added to the
    // denominator; every other lane keeps its value unchanged.
    const __mmask16 zero_lanes = _mm512_cmp_ps_mask(cos_v, _mm512_setzero_ps(), _CMP_EQ_OS);
    const __m512 denom = _mm512_mask_add_ps(cos_v, zero_lanes, cos_v, _mm512_set1_ps(1E-8f));

    return _mm512_div_ps(sin_v, denom);
 }

 static NCNN_FORCEINLINE __m512 pow512_ps(__m512 a, __m512 b)
 {
    // pow(x, m) = exp(m * log(x))
--- a/src/layer/x86/avx_mathfun.h
+++ b/src/layer/x86/avx_mathfun.h
@@ -691,6 +691,18 @@ static NCNN_FORCEINLINE void sincos256_ps(__m256 x, __m256* s, __m256* c)
    *c = _mm256_xor_ps(xmm2, sign_bit_cos);
 }

 static NCNN_FORCEINLINE __m256 tan256_ps(__m256 x)
 {
    // Returns the per-lane quotient of the s and c outputs of sincos256_ps(x).
    __m256 sin_v, cos_v;
    sincos256_ps(x, &sin_v, &cos_v);

    // All-ones bit pattern in lanes where the c output equals zero, all-zeros elsewhere.
    const __m256 is_zero = _mm256_cmp_ps(cos_v, _mm256_setzero_ps(), _CMP_EQ_OS);

    // AND-ing with the mask yields 1E-8f for the zero lanes and 0.0f for the
    // rest, so only the zero lanes have their denominator nudged.
    const __m256 nudge = _mm256_and_ps(_mm256_set1_ps(1E-8f), is_zero);
    const __m256 denom = _mm256_add_ps(cos_v, nudge);

    return _mm256_div_ps(sin_v, denom);
 }

 static NCNN_FORCEINLINE __m256 pow256_ps(__m256 a, __m256 b)
 {
    // pow(x, m) = exp(m * log(x))
--- a/src/layer/x86/sse_mathfun.h
+++ b/src/layer/x86/sse_mathfun.h
@@ -679,6 +679,18 @@ static NCNN_FORCEINLINE void sincos_ps(v4sf x, v4sf* s, v4sf* c)
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }

 static NCNN_FORCEINLINE __m128 tan_ps(__m128 x)
 {
    // Returns the per-lane quotient of the s and c outputs of sincos_ps(x).
    __m128 sin_v, cos_v;
    sincos_ps(x, &sin_v, &cos_v);

    // All-ones bit pattern in lanes where the c output equals zero, all-zeros elsewhere.
    const __m128 is_zero = _mm_cmpeq_ps(cos_v, _mm_setzero_ps());

    // AND-ing with the mask yields 1E-8f for the zero lanes and 0.0f for the
    // rest, so only the zero lanes have their denominator nudged.
    const __m128 nudge = _mm_and_ps(_mm_set1_ps(1E-8f), is_zero);
    const __m128 denom = _mm_add_ps(cos_v, nudge);

    return _mm_div_ps(sin_v, denom);
 }

 static NCNN_FORCEINLINE __m128 pow_ps(__m128 a, __m128 b)
 {
    // pow(x, m) = exp(m * log(x))
--- a/src/layer/x86/unaryop_x86.cpp
+++ b/src/layer/x86/unaryop_x86.cpp
@@ -471,40 +471,17 @@ struct unary_op_tan
 #if __SSE2__
    // Applies tan to the four packed floats in x using the vectorized tan_ps
    // helper instead of a store / scalar tan() / reload round trip.
    __m128 operator()(const __m128& x) const
    {
        return tan_ps(x);
    }
 #if __AVX__
    // Applies tan to the eight packed floats in x using the vectorized
    // tan256_ps helper instead of a store / scalar tan() / reload round trip.
    __m256 operator()(const __m256& x) const
    {
        return tan256_ps(x);
    }
 #if __AVX512F__
    // Applies tan to the sixteen packed floats in x using the vectorized
    // tan512_ps helper instead of a store / scalar tan() / reload round trip.
    __m512 operator()(const __m512& x) const
    {
        return tan512_ps(x);
    }
 #endif // __AVX512F__
 #endif // __AVX__