| @@ -66,6 +66,11 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), VORTEX) | |||
| CCOMMON_OPT += -march=armv8.3-a | |||
| FCOMMON_OPT += -march=armv8.3-a | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| @@ -98,6 +98,7 @@ THUNDERX | |||
| THUNDERX2T99 | |||
| TSV110 | |||
| THUNDERX3T110 | |||
| VORTEX | |||
| 9.System Z: | |||
| ZARCH_GENERIC | |||
| @@ -128,8 +128,13 @@ int main(int argc, char *argv[]){ | |||
| int to = 200; | |||
| int step = 1; | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| struct timeval start, stop; | |||
| double time1,timeg; | |||
| #else | |||
| struct timespec start = { 0, 0 }, stop = { 0, 0 }; | |||
| double time1, timeg; | |||
| #endif | |||
| argc--;argv++; | |||
| @@ -160,26 +165,30 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #else | |||
| clock_gettime(CLOCK_REALTIME, &start); | |||
| #endif | |||
| result = ASUM (&m, x, &inc_x); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| clock_gettime(CLOCK_REALTIME, &stop); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| #else | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; | |||
| #endif | |||
| timeg += time1; | |||
| } | |||
| if (loops >1) | |||
| timeg /= loops; | |||
| #ifdef COMPLEX | |||
| @@ -26,6 +26,11 @@ | |||
| *****************************************************************************/ | |||
| #include <string.h> | |||
| #ifdef OS_DARWIN | |||
| #include <sys/sysctl.h> | |||
| int32_t value; | |||
| size_t length=sizeof(value); | |||
| #endif | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_ARMV8 1 | |||
| @@ -45,6 +50,8 @@ | |||
| #define CPU_TSV110 9 | |||
| // Ampere | |||
| #define CPU_EMAG8180 10 | |||
| // Apple | |||
| #define CPU_VORTEX 13 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -59,7 +66,8 @@ static char *cpuname[] = { | |||
| "TSV110", | |||
| "EMAG8180", | |||
| "NEOVERSEN1", | |||
| "THUNDERX3T110" | |||
| "THUNDERX3T110", | |||
| "VORTEX" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -75,7 +83,8 @@ static char *cpuname_lower[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "thunderx3t110" | |||
| "thunderx3t110", | |||
| "vortex" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -198,6 +207,10 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef DARWIN | |||
| sysctlbyname("hw.cpufamily",&value,&length,NULL,0); | |||
| if (value ==131287967) return CPU_VORTEX; | |||
| #endif | |||
| return CPU_ARMV8; | |||
| #endif | |||
| @@ -247,7 +260,10 @@ int n=0; | |||
| printf("#define NUM_CORES %d\n",n); | |||
| #endif | |||
| #ifdef DARWIN | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| #endif | |||
| } | |||
| @@ -398,6 +414,19 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef DARWIN | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_SIZE %d \n",value); | |||
| sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %d \n",value); | |||
| sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); | |||
| printf("#define L1_DATA_SIZE %d \n",value); | |||
| sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); | |||
| printf("#define L2_DATA_SIZE %d \n",value); | |||
| break; | |||
| #endif | |||
| } | |||
| get_cpucount(); | |||
| } | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||
| @@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| @@ -0,0 +1,82 @@ | |||
| #include "common.h" | |||
| #ifndef ABS_K | |||
| #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) | |||
| #endif | |||
| #if defined(SKYLAKEX) | |||
| #include "dasum_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "dasum_microk_haswell-2.c" | |||
| #endif | |||
| #ifndef HAVE_DASUM_KERNEL | |||
| static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG n_8 = n & -8; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| FLOAT sum4 = 0.0; | |||
| while (i < n_8) { | |||
| temp0 = ABS_K(x[0]); | |||
| temp1 = ABS_K(x[1]); | |||
| temp2 = ABS_K(x[2]); | |||
| temp3 = ABS_K(x[3]); | |||
| temp4 = ABS_K(x[4]); | |||
| temp5 = ABS_K(x[5]); | |||
| temp6 = ABS_K(x[6]); | |||
| temp7 = ABS_K(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=8; | |||
| } | |||
| while (i < n) { | |||
| sum4 += ABS_K(x1[i]); | |||
| i++; | |||
| } | |||
| return sum0+sum1+sum2+sum3+sum4; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) { | |||
| sumf = dasum_kernel(n, x); | |||
| } | |||
| else { | |||
| n *= inc_x; | |||
| while(i < n) { | |||
| sumf += ABS_K(x[i]); | |||
| i += inc_x; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,86 @@ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) | |||
| #define HAVE_DASUM_KERNEL | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| #ifndef ABS_K | |||
| #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) | |||
| #endif | |||
| static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT sumf = 0.0; | |||
| if (n >= 256) { | |||
| BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3; | |||
| for (i = 0; i < align_256; i++) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| n -= align_256; | |||
| x1 += align_256; | |||
| } | |||
| BLASLONG tail_index_SSE = n&(~7); | |||
| BLASLONG tail_index_AVX2 = n&(~255); | |||
| if (n >= 256) { | |||
| __m256d accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm256_setzero_pd(); | |||
| accum_1 = _mm256_setzero_pd(); | |||
| accum_2 = _mm256_setzero_pd(); | |||
| accum_3 = _mm256_setzero_pd(); | |||
| __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); | |||
| for (i = 0; i < tail_index_AVX2; i += 16) { | |||
| accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); | |||
| accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); | |||
| accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| __m128d half_accum0; | |||
| half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); | |||
| half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); | |||
| sumf += half_accum0[0]; | |||
| } | |||
| if (n >= 8) { | |||
| __m128d accum_20, accum_21, accum_22, accum_23; | |||
| accum_20 = _mm_setzero_pd(); | |||
| accum_21 = _mm_setzero_pd(); | |||
| accum_22 = _mm_setzero_pd(); | |||
| accum_23 = _mm_setzero_pd(); | |||
| __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); | |||
| for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); | |||
| accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); | |||
| } | |||
| accum_20 = accum_20 + accum_21 + accum_22 + accum_23; | |||
| __m128d half_accum20; | |||
| half_accum20 = _mm_hadd_pd(accum_20, accum_20); | |||
| sumf += half_accum20[0]; | |||
| } | |||
| for (i = tail_index_SSE; i < n; ++i) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| return sumf; | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,80 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_DASUM_KERNEL 1 | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| #ifndef ABS_K | |||
| #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) | |||
| #endif | |||
| static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT sumf = 0.0; | |||
| if (n >= 256) { | |||
| BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; | |||
| for (i = 0; i < align_512; i++) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| n -= align_512; | |||
| x1 += align_512; | |||
| } | |||
| BLASLONG tail_index_SSE = n&(~7); | |||
| BLASLONG tail_index_AVX512 = n&(~255); | |||
| // | |||
| if ( n >= 256 ) { | |||
| __m512d accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm512_setzero_pd(); | |||
| accum_1 = _mm512_setzero_pd(); | |||
| accum_2 = _mm512_setzero_pd(); | |||
| accum_3 = _mm512_setzero_pd(); | |||
| for (i = 0; i < tail_index_AVX512; i += 32) { | |||
| accum_0 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 0])); | |||
| accum_1 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 8])); | |||
| accum_2 += _mm512_abs_pd(_mm512_load_pd(&x1[i +16])); | |||
| accum_3 += _mm512_abs_pd(_mm512_load_pd(&x1[i +24])); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| sumf += _mm512_reduce_add_pd(accum_0); | |||
| } | |||
| if (n >= 8) { | |||
| __m128d accum_20, accum_21, accum_22, accum_23; | |||
| accum_20 = _mm_setzero_pd(); | |||
| accum_21 = _mm_setzero_pd(); | |||
| accum_22 = _mm_setzero_pd(); | |||
| accum_23 = _mm_setzero_pd(); | |||
| __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); | |||
| for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); | |||
| accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); | |||
| } | |||
| accum_20 = accum_20 + accum_21 + accum_22 + accum_23; | |||
| __m128d half_accum20; | |||
| half_accum20 = _mm_hadd_pd(accum_20, accum_20); | |||
| sumf += half_accum20[0]; | |||
| } | |||
| for (i = tail_index_SSE; i < n; ++i) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| return sumf; | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,90 @@ | |||
| #include "common.h" | |||
| #if defined(DOUBLE) | |||
| #error supports float only | |||
| #else | |||
| #ifndef ABS_K | |||
| #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) | |||
| #endif | |||
| #endif | |||
| #if defined(SKYLAKEX) | |||
| #include "sasum_microk_skylakex-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "sasum_microk_haswell-2.c" | |||
| #endif | |||
| #ifndef HAVE_SASUM_KERNEL | |||
| static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG n_8 = n & -8; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| FLOAT sum4 = 0.0; | |||
| while (i < n_8) { | |||
| temp0 = ABS_K(x[0]); | |||
| temp1 = ABS_K(x[1]); | |||
| temp2 = ABS_K(x[2]); | |||
| temp3 = ABS_K(x[3]); | |||
| temp4 = ABS_K(x[4]); | |||
| temp5 = ABS_K(x[5]); | |||
| temp6 = ABS_K(x[6]); | |||
| temp7 = ABS_K(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=8; | |||
| } | |||
| while (i < n) { | |||
| sum4 += ABS_K(x1[i]); | |||
| i++; | |||
| } | |||
| return sum0+sum1+sum2+sum3+sum4; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) { | |||
| sumf = sasum_kernel(n, x); | |||
| } | |||
| else { | |||
| n *= inc_x; | |||
| while(i < n) { | |||
| sumf += ABS_K(x[i]); | |||
| i += inc_x; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,82 @@ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) | |||
| #define HAVE_SASUM_KERNEL 1 | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| #ifndef ABS_K | |||
| #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) | |||
| #endif | |||
| static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT sumf = 0.0; | |||
| if (n >= 256) { | |||
| BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7; | |||
| for (i = 0; i < align_256; i++) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| n -= align_256; | |||
| x1 += align_256; | |||
| } | |||
| BLASLONG tail_index_SSE = n&(~7); | |||
| BLASLONG tail_index_AVX2 = n&(~255); | |||
| if (n >= 256) { | |||
| __m256 accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm256_setzero_ps(); | |||
| accum_1 = _mm256_setzero_ps(); | |||
| accum_2 = _mm256_setzero_ps(); | |||
| accum_3 = _mm256_setzero_ps(); | |||
| __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); | |||
| for (i = 0; i < tail_index_AVX2; i += 32) { | |||
| accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); | |||
| accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); | |||
| accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| __m128 half_accum0; | |||
| half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); | |||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||
| half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); | |||
| sumf += half_accum0[0]; | |||
| } | |||
| if (n >= 8) { | |||
| __m128 accum_20, accum_21; | |||
| accum_20 = _mm_setzero_ps(); | |||
| accum_21 = _mm_setzero_ps(); | |||
| __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); | |||
| for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| } | |||
| accum_20 += accum_21; | |||
| accum_20 = _mm_hadd_ps(accum_20, accum_20); | |||
| accum_20 = _mm_hadd_ps(accum_20, accum_20); | |||
| sumf += accum_20[0]; | |||
| } | |||
| for (i = tail_index_SSE; i < n; ++i) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| return sumf; | |||
| } | |||
| #endif | |||
| @@ -0,0 +1,73 @@ | |||
| /* need a new enough GCC for avx512 support */ | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) | |||
| #define HAVE_SASUM_KERNEL 1 | |||
| #ifndef ABS_K | |||
| #define ABS_K(a) ((a) > 0 ? (a) : (-(a))) | |||
| #endif | |||
| #include <immintrin.h> | |||
| #include <stdint.h> | |||
| static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT sumf = 0.0; | |||
| if (n >= 256) { | |||
| BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; | |||
| for (i = 0; i < align_512; i++) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| n -= align_512; | |||
| x1 += align_512; | |||
| } | |||
| BLASLONG tail_index_SSE = n&(~7); | |||
| BLASLONG tail_index_AVX512 = n&(~255); | |||
| if (n >= 256) { | |||
| __m512 accum_0, accum_1, accum_2, accum_3; | |||
| accum_0 = _mm512_setzero_ps(); | |||
| accum_1 = _mm512_setzero_ps(); | |||
| accum_2 = _mm512_setzero_ps(); | |||
| accum_3 = _mm512_setzero_ps(); | |||
| for (i = 0; i < tail_index_AVX512; i += 64) { | |||
| accum_0 += _mm512_abs_ps(_mm512_load_ps(&x1[i + 0])); | |||
| accum_1 += _mm512_abs_ps(_mm512_load_ps(&x1[i +16])); | |||
| accum_2 += _mm512_abs_ps(_mm512_load_ps(&x1[i +32])); | |||
| accum_3 += _mm512_abs_ps(_mm512_load_ps(&x1[i +48])); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| sumf += _mm512_reduce_add_ps(accum_0); | |||
| } | |||
| if (n >= 8) { | |||
| __m128 accum_20, accum_21; | |||
| accum_20 = _mm_setzero_ps(); | |||
| accum_21 = _mm_setzero_ps(); | |||
| __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); | |||
| for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| } | |||
| accum_20 += accum_21; | |||
| accum_20 = _mm_hadd_ps(accum_20, accum_20); | |||
| accum_20 = _mm_hadd_ps(accum_20, accum_20); | |||
| sumf += accum_20[0]; | |||
| } | |||
| for (i = tail_index_SSE; i < n; i++) { | |||
| sumf += ABS_K(x1[i]); | |||
| } | |||
| return sumf; | |||
| } | |||
| #endif | |||
| @@ -680,8 +680,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the DSBTRD. | |||
| * | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -753,8 +753,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the DSBTRD. | |||
| * | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||