Browse Source

Add SSE&AVX optimized for tan (#3765)

tags/20220701
jasonZhang GitHub 4 years ago
parent
commit
c34e305902
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 38 additions and 26 deletions
  1. +11
    -0
      src/layer/x86/avx512_mathfun.h
  2. +12
    -0
      src/layer/x86/avx_mathfun.h
  3. +12
    -0
      src/layer/x86/sse_mathfun.h
  4. +3
    -26
      src/layer/x86/unaryop_x86.cpp

+ 11
- 0
src/layer/x86/avx512_mathfun.h View File

@@ -446,6 +446,17 @@ static NCNN_FORCEINLINE void sincos512_ps(__m512 x, __m512* s, __m512* c)
*c = _mm512_xor_ps(xmm2, sign_bit_cos);
}

static NCNN_FORCEINLINE __m512 tan512_ps(__m512 x)
{
    // Packed tangent over 16 floats, evaluated as sin(x) / cos(x); both terms
    // are produced by a single sincos512_ps call.
    __m512 sin_v, cos_v;
    sincos512_ps(x, &sin_v, &cos_v);

    // Lanes whose cosine compares equal to zero get 1E-8f added to the
    // denominator before the division; every other lane keeps cos_v as is.
    const __mmask16 zero_lanes = _mm512_cmp_ps_mask(cos_v, _mm512_setzero_ps(), _CMP_EQ_OS);
    const __m512 denom = _mm512_mask_add_ps(cos_v, zero_lanes, cos_v, _mm512_set1_ps(1E-8f));

    return _mm512_div_ps(sin_v, denom);
}

static NCNN_FORCEINLINE __m512 pow512_ps(__m512 a, __m512 b)
{
// pow(x, m) = exp(m * log(x))


+ 12
- 0
src/layer/x86/avx_mathfun.h View File

@@ -691,6 +691,18 @@ static NCNN_FORCEINLINE void sincos256_ps(__m256 x, __m256* s, __m256* c)
*c = _mm256_xor_ps(xmm2, sign_bit_cos);
}

static NCNN_FORCEINLINE __m256 tan256_ps(__m256 x)
{
    // Packed tangent over 8 floats, evaluated as sin(x) / cos(x); both terms
    // are produced by a single sincos256_ps call.
    __m256 sin_v, cos_v;
    sincos256_ps(x, &sin_v, &cos_v);

    // The compare yields an all-ones lane where the cosine equals zero, so the
    // AND keeps 1E-8f in exactly those lanes and 0 everywhere else.
    const __m256 zero_lanes = _mm256_cmp_ps(cos_v, _mm256_setzero_ps(), _CMP_EQ_OS);
    const __m256 bump = _mm256_and_ps(_mm256_set1_ps(1E-8f), zero_lanes);

    // Divide by the (possibly nudged) cosine.
    return _mm256_div_ps(sin_v, _mm256_add_ps(cos_v, bump));
}

static NCNN_FORCEINLINE __m256 pow256_ps(__m256 a, __m256 b)
{
// pow(x, m) = exp(m * log(x))


+ 12
- 0
src/layer/x86/sse_mathfun.h View File

@@ -679,6 +679,18 @@ static NCNN_FORCEINLINE void sincos_ps(v4sf x, v4sf* s, v4sf* c)
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}

static NCNN_FORCEINLINE __m128 tan_ps(__m128 x)
{
    // Packed tangent over 4 floats, evaluated as sin(x) / cos(x); both terms
    // are produced by a single sincos_ps call.
    __m128 sin_v, cos_v;
    sincos_ps(x, &sin_v, &cos_v);

    // The compare yields an all-ones lane where the cosine equals zero, so the
    // AND keeps 1E-8f in exactly those lanes and 0 everywhere else.
    const __m128 zero_lanes = _mm_cmpeq_ps(cos_v, _mm_setzero_ps());
    const __m128 bump = _mm_and_ps(_mm_set1_ps(1E-8f), zero_lanes);

    // Divide by the (possibly nudged) cosine.
    return _mm_div_ps(sin_v, _mm_add_ps(cos_v, bump));
}

static NCNN_FORCEINLINE __m128 pow_ps(__m128 a, __m128 b)
{
// pow(x, m) = exp(m * log(x))


+ 3
- 26
src/layer/x86/unaryop_x86.cpp View File

@@ -471,40 +471,17 @@ struct unary_op_tan
#if __SSE2__
// SSE overload of the tan functor: returns the tangent of each of the 4
// packed floats in x.
// NOTE(review): this listing is a diff view whose summary reports +3 / -26 for
// this file; the scalar store/tan/load sequence is presumably the removed side
// and `return tan_ps(x);` the added side -- confirm against the committed file.
__m128 operator()(const __m128& x) const
{
//TODO sse optimize
// Scalar path: spill the 4 lanes to a stack array, call tan() on each element,
// then reload the array as a vector.
float tmp[4];
_mm_storeu_ps(tmp, x);
tmp[0] = tan(tmp[0]);
tmp[1] = tan(tmp[1]);
tmp[2] = tan(tmp[2]);
tmp[3] = tan(tmp[3]);
return _mm_loadu_ps(tmp);
// Vector path: delegates to tan_ps(). As listed here it follows the return
// above, so it is only reached once the scalar lines are gone.
return tan_ps(x);
}
#if __AVX__
// AVX overload of the tan functor: returns the tangent of each of the 8
// packed floats in x.
// NOTE(review): this listing is a diff view whose summary reports +3 / -26 for
// this file; the scalar store/tan/load sequence is presumably the removed side
// and `return tan256_ps(x);` the added side -- confirm against the committed file.
__m256 operator()(const __m256& x) const
{
//TODO avx optimize
// Scalar path: spill the 8 lanes to a stack array, call tan() on each element,
// then reload the array as a vector.
float tmp[8];
_mm256_storeu_ps(tmp, x);
tmp[0] = tan(tmp[0]);
tmp[1] = tan(tmp[1]);
tmp[2] = tan(tmp[2]);
tmp[3] = tan(tmp[3]);
tmp[4] = tan(tmp[4]);
tmp[5] = tan(tmp[5]);
tmp[6] = tan(tmp[6]);
tmp[7] = tan(tmp[7]);
return _mm256_loadu_ps(tmp);
// Vector path: delegates to tan256_ps(). As listed here it follows the return
// above, so it is only reached once the scalar lines are gone.
return tan256_ps(x);
}
#if __AVX512F__
// AVX-512 overload of the tan functor: returns the tangent of each of the 16
// packed floats in x.
// NOTE(review): this listing is a diff view whose summary reports +3 / -26 for
// this file; the scalar store/tan/load sequence is presumably the removed side
// and `return tan512_ps(x);` the added side -- confirm against the committed file.
__m512 operator()(const __m512& x) const
{
//TODO avx512 optimize
// Scalar path: spill the 16 lanes to a stack array, call tan() on each element
// in a loop, then reload the array as a vector.
float tmp[16];
_mm512_storeu_ps(tmp, x);
for (int i = 0; i < 16; i++)
tmp[i] = tan(tmp[i]);
return _mm512_loadu_ps(tmp);
// Vector path: delegates to tan512_ps(). As listed here it follows the return
// above, so it is only reached once the scalar lines are gone.
return tan512_ps(x);
}
#endif // __AVX512F__
#endif // __AVX__


Loading…
Cancel
Save