Browse Source

Implementaion of dasum, sasum with AVX2 & AVX512 intrinsic

tags/v0.3.11^2
Gengxin Xie 5 years ago
parent
commit
cb3c190a3a
7 changed files with 327 additions and 0 deletions
  1. +2
    -0
      kernel/x86_64/KERNEL.HASWELL
  2. +96
    -0
      kernel/x86_64/dasum.c
  3. +35
    -0
      kernel/x86_64/dasum_microk_haswell-2.c
  4. +27
    -0
      kernel/x86_64/dasum_microk_skylakex-2.c
  5. +104
    -0
      kernel/x86_64/sasum.c
  6. +36
    -0
      kernel/x86_64/sasum_microk_haswell-2.c
  7. +27
    -0
      kernel/x86_64/sasum_microk_skylakex-2.c

+ 2
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c

SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c

+ 96
- 0
kernel/x86_64/dasum.c View File

@@ -0,0 +1,96 @@
#include "common.h"
#include <math.h>

#define ABS fabs

#if defined(SKYLAKEX)
#include "dasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "dasum_microk_haswell-2.c"
#endif

#ifndef HAVE_KERNEL_16
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{

BLASLONG i=0;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;

while ( i< n )
{

temp0 = ABS(x[0]);
temp1 = ABS(x[1]);
temp2 = ABS(x[2]);
temp3 = ABS(x[3]);
temp4 = ABS(x[4]);
temp5 = ABS(x[5]);
temp6 = ABS(x[6]);
temp7 = ABS(x[7]);

sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;

sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;

x+=8;
i+=8;

}

return sum0+sum1+sum2+sum3;
}

#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
BLASLONG n1;

if (n <= 0 || inc_x <= 0) return(sumf);

if ( inc_x == 1 )
{

n1 = n & -16;
if ( n1 > 0 )
{

sumf = dasum_kernel_16(n1, x);
i=n1;
}

while(i < n)
{
sumf += ABS(x[i]);
i++;
}

}
else
{

n *= inc_x;
while(i < n)
{
sumf += ABS(x[i]);
i += inc_x;
}

}
return(sumf);
}


+ 35
- 0
kernel/x86_64/dasum_microk_haswell-2.c View File

@@ -0,0 +1,35 @@
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))

#define HAVE_KERNEL_16 1

#include <immintrin.h>

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
__m256d accum_0, accum_1, accum_2, accum_3;

accum_0 = _mm256_setzero_pd();
accum_1 = _mm256_setzero_pd();
accum_2 = _mm256_setzero_pd();
accum_3 = _mm256_setzero_pd();

__m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
for (; i < n; i += 16) {
accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask);
accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask);
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

__m128d half_accum0;
half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));

half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);

return half_accum0[0];

}
#endif

+ 27
- 0
kernel/x86_64/dasum_microk_skylakex-2.c View File

@@ -0,0 +1,27 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))

#if defined(__AVX512CD__)
#define HAVE_KERNEL_16 1

#include <immintrin.h>

static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;

__m512d accum_0, accum_1;

accum_0 = _mm512_setzero_pd();
accum_1 = _mm512_setzero_pd();

for (; i < n; i += 16) {
accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0]));
accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8]));
}

accum_0 += accum_1;
return _mm512_reduce_add_pd(accum_0);
}
#endif
#endif

+ 104
- 0
kernel/x86_64/sasum.c View File

@@ -0,0 +1,104 @@
#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#error supports float only

#else

#define ABS fabsf

#endif

#if defined(SKYLAKEX)
#include "sasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "sasum_microk_haswell-2.c"
#endif

#ifndef HAVE_KERNEL_32

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{

BLASLONG i=0;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;

while ( i< n )
{

temp0 = ABS(x[0]);
temp1 = ABS(x[1]);
temp2 = ABS(x[2]);
temp3 = ABS(x[3]);
temp4 = ABS(x[4]);
temp5 = ABS(x[5]);
temp6 = ABS(x[6]);
temp7 = ABS(x[7]);

sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;

sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;

x+=8;
i+=8;

}

return sum0+sum1+sum2+sum3;
}

#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
BLASLONG n1;

if (n <= 0 || inc_x <= 0) return(sumf);

if ( inc_x == 1 )
{

n1 = n & -32;
if ( n1 > 0 )
{

sumf = sasum_kernel_32(n1, x);
i=n1;
}

while(i < n)
{
sumf += ABS(x[i]);
i++;
}

}
else
{

n *= inc_x;
while(i < n)
{
sumf += ABS(x[i]);
i += inc_x;
}

}
return(sumf);
}

+ 36
- 0
kernel/x86_64/sasum_microk_haswell-2.c View File

@@ -0,0 +1,36 @@
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))

#define HAVE_KERNEL_32 1

#include <immintrin.h>

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
__m256 accum_0, accum_1, accum_2, accum_3;

accum_0 = _mm256_setzero_ps();
accum_1 = _mm256_setzero_ps();
accum_2 = _mm256_setzero_ps();
accum_3 = _mm256_setzero_ps();

__m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
for (; i < n; i += 32) {
accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask);
accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask);
}

accum_0 = accum_0 + accum_1 + accum_2 + accum_3;

__m128 half_accum0;
half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));

half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);

return half_accum0[0];

}
#endif

+ 27
- 0
kernel/x86_64/sasum_microk_skylakex-2.c View File

@@ -0,0 +1,27 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))

#if defined(__AVX512CD__)
#define HAVE_KERNEL_32 1

#include <immintrin.h>

static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;

__m512 accum_0, accum_1;

accum_0 = _mm512_setzero_ps();
accum_1 = _mm512_setzero_ps();

for (; i < n; i += 32) {
accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0]));
accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16]));
}

accum_0 += accum_1;
return _mm512_reduce_add_ps(accum_0);
}
#endif
#endif

Loading…
Cancel
Save