| @@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| @@ -0,0 +1,96 @@ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #define ABS fabs | |||
| #if defined(SKYLAKEX) | |||
| #include "dasum_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "dasum_microk_haswell-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| while ( i< n ) | |||
| { | |||
| temp0 = ABS(x[0]); | |||
| temp1 = ABS(x[1]); | |||
| temp2 = ABS(x[2]); | |||
| temp3 = ABS(x[3]); | |||
| temp4 = ABS(x[4]); | |||
| temp5 = ABS(x[5]); | |||
| temp6 = ABS(x[6]); | |||
| temp7 = ABS(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=8; | |||
| } | |||
| return sum0+sum1+sum2+sum3; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG n1; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sumf = dasum_kernel_16(n1, x); | |||
| i=n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| n *= inc_x; | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i += inc_x; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,35 @@ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #define HAVE_KERNEL_16 1 | |||
| #include <immintrin.h> | |||
| static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m256d accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm256_setzero_pd(); | |||
| accum_1 = _mm256_setzero_pd(); | |||
| accum_2 = _mm256_setzero_pd(); | |||
| accum_3 = _mm256_setzero_pd(); | |||
| __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); | |||
| for (; i < n; i += 16) { | |||
| accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask); | |||
| accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); | |||
| accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| __m128d half_accum0; | |||
| half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); | |||
| half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); | |||
| return half_accum0[0]; | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,27 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #if defined(__AVX512CD__) | |||
| #define HAVE_KERNEL_16 1 | |||
| #include <immintrin.h> | |||
| static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m512d accum_0, accum_1; | |||
| accum_0 = _mm512_setzero_pd(); | |||
| accum_1 = _mm512_setzero_pd(); | |||
| for (; i < n; i += 16) { | |||
| accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0])); | |||
| accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8])); | |||
| } | |||
| accum_0 += accum_1; | |||
| return _mm512_reduce_add_pd(accum_0); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -0,0 +1,104 @@ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #error supports float only | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #if defined(SKYLAKEX) | |||
| #include "sasum_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "sasum_microk_haswell-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| while ( i< n ) | |||
| { | |||
| temp0 = ABS(x[0]); | |||
| temp1 = ABS(x[1]); | |||
| temp2 = ABS(x[2]); | |||
| temp3 = ABS(x[3]); | |||
| temp4 = ABS(x[4]); | |||
| temp5 = ABS(x[5]); | |||
| temp6 = ABS(x[6]); | |||
| temp7 = ABS(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=8; | |||
| } | |||
| return sum0+sum1+sum2+sum3; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG n1; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sumf = sasum_kernel_32(n1, x); | |||
| i=n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| n *= inc_x; | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i += inc_x; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,36 @@ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #define HAVE_KERNEL_32 1 | |||
| #include <immintrin.h> | |||
| static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m256 accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm256_setzero_ps(); | |||
| accum_1 = _mm256_setzero_ps(); | |||
| accum_2 = _mm256_setzero_ps(); | |||
| accum_3 = _mm256_setzero_ps(); | |||
| __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); | |||
| for (; i < n; i += 32) { | |||
| accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); | |||
| accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask); | |||
| accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| __m128 half_accum0; | |||
| half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); | |||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||
| return half_accum0[0]; | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,27 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #if defined(__AVX512CD__) | |||
| #define HAVE_KERNEL_32 1 | |||
| #include <immintrin.h> | |||
| static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| __m512 accum_0, accum_1; | |||
| accum_0 = _mm512_setzero_ps(); | |||
| accum_1 = _mm512_setzero_ps(); | |||
| for (; i < n; i += 32) { | |||
| accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0])); | |||
| accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16])); | |||
| } | |||
| accum_0 += accum_1; | |||
| return _mm512_reduce_add_ps(accum_0); | |||
| } | |||
| #endif | |||
| #endif | |||