| @@ -1,10 +1,21 @@ | |||||
| # COMPILER_PREFIX = mingw32- | # COMPILER_PREFIX = mingw32- | ||||
| ifndef DYNAMIC_ARCH | |||||
| ADD_CPUFLAGS = 1 | |||||
| else | |||||
| ifdef TARGET_CORE | |||||
| ADD_CPUFLAGS = 1 | |||||
| endif | |||||
| endif | |||||
| ifdef ADD_CPUFLAGS | |||||
| ifdef HAVE_SSE | ifdef HAVE_SSE | ||||
| CCOMMON_OPT += -msse | CCOMMON_OPT += -msse | ||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -msse | FCOMMON_OPT += -msse | ||||
| endif | endif | ||||
| endif | |||||
| endif | |||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| ARFLAGS = -m x86 | ARFLAGS = -m x86 | ||||
| @@ -8,6 +8,16 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifndef DYNAMIC_ARCH | |||||
| ADD_CPUFLAGS = 1 | |||||
| else | |||||
| ifdef TARGET_CORE | |||||
| ADD_CPUFLAGS = 1 | |||||
| endif | |||||
| endif | |||||
| ifdef ADD_CPUFLAGS | |||||
| ifdef HAVE_SSE3 | ifdef HAVE_SSE3 | ||||
| CCOMMON_OPT += -msse3 | CCOMMON_OPT += -msse3 | ||||
| ifneq ($(F_COMPILER), NAG) | ifneq ($(F_COMPILER), NAG) | ||||
| @@ -44,7 +54,6 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(CORE), SKYLAKEX) | ifeq ($(CORE), SKYLAKEX) | ||||
| ifndef DYNAMIC_ARCH | |||||
| ifndef NO_AVX512 | ifndef NO_AVX512 | ||||
| CCOMMON_OPT += -march=skylake-avx512 | CCOMMON_OPT += -march=skylake-avx512 | ||||
| ifneq ($(F_COMPILER), NAG) | ifneq ($(F_COMPILER), NAG) | ||||
| @@ -62,10 +71,8 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), COOPERLAKE) | ifeq ($(CORE), COOPERLAKE) | ||||
| ifndef DYNAMIC_ARCH | |||||
| ifndef NO_AVX512 | ifndef NO_AVX512 | ||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| # cooperlake support was added in 10.1 | # cooperlake support was added in 10.1 | ||||
| @@ -88,7 +95,6 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifdef HAVE_AVX2 | ifdef HAVE_AVX2 | ||||
| ifndef NO_AVX2 | ifndef NO_AVX2 | ||||
| @@ -120,6 +126,7 @@ endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(OSNAME), Interix) | ifeq ($(OSNAME), Interix) | ||||
| @@ -299,6 +299,10 @@ if (NO_AVX2) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") | set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") | ||||
| endif () | endif () | ||||
| if (NO_AVX512) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||||
| endif () | |||||
| if (USE_THREAD) | if (USE_THREAD) | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| # NO_AFFINITY = 1 | # NO_AFFINITY = 1 | ||||
| @@ -126,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||||
| #endif | #endif | ||||
| #define get_cpu_ftr(id, var) ({ \ | #define get_cpu_ftr(id, var) ({ \ | ||||
| __asm__ ("mrs %0, "#id : "=r" (var)); \ | |||||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||||
| }) | }) | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -186,7 +186,7 @@ ZSWAPKERNEL = zswap.c | |||||
| SGEMVNKERNEL = sgemv_n.c | SGEMVNKERNEL = sgemv_n.c | ||||
| DGEMVNKERNEL = dgemv_n_power10.c | DGEMVNKERNEL = dgemv_n_power10.c | ||||
| CGEMVNKERNEL = cgemv_n.c | CGEMVNKERNEL = cgemv_n.c | ||||
| ZGEMVNKERNEL = zgemv_n_4.c | |||||
| ZGEMVNKERNEL = zgemv_n_power10.c | |||||
| # | # | ||||
| SGEMVTKERNEL = sgemv_t.c | SGEMVTKERNEL = sgemv_t.c | ||||
| DGEMVTKERNEL = dgemv_t_power10.c | DGEMVTKERNEL = dgemv_t_power10.c | ||||
| @@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; | __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; | ||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __vector_pair rowB, rowB1; | __vector_pair rowB, rowB1; | ||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[0])); | |||||
| rowB1 = *((__vector_pair *)((void *)&BO[4])); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | ||||
| @@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 3]; | rowA = (vec_t *) & AO[l << 3]; | ||||
| rb = (vec_t *) & BO[l << 3]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[l << 3])); | |||||
| rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | ||||
| @@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB, rowB1; | __vector_pair rowB, rowB1; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[0])); | |||||
| rowB1 = *((__vector_pair *)((void *)&BO[4])); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); | ||||
| @@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 2]; | rowA = (vec_t *) & AO[l << 2]; | ||||
| rb = (vec_t *) & BO[l << 3]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[l << 3])); | |||||
| rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); | ||||
| @@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB, rowB1; | __vector_pair rowB, rowB1; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[0])); | |||||
| rowB1 = *((__vector_pair *)((void *)&BO[4])); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 1]; | rowA = (vec_t *) & AO[l << 1]; | ||||
| rb = (vec_t *) & BO[l << 3]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[l << 3])); | |||||
| rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); | ||||
| } | } | ||||
| @@ -403,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[0])); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | ||||
| __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); | ||||
| @@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 3]; | rowA = (vec_t *) & AO[l << 3]; | ||||
| rb = (vec_t *) & BO[l << 2]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[l << 2])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | ||||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | ||||
| @@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[0])); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 2]; | rowA = (vec_t *) & AO[l << 2]; | ||||
| rb = (vec_t *) & BO[l << 2]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[l << 2])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | ||||
| } | } | ||||
| @@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||||
| BLASLONG l = 0; | BLASLONG l = 0; | ||||
| vec_t *rowA = (vec_t *) & AO[0]; | vec_t *rowA = (vec_t *) & AO[0]; | ||||
| __vector_pair rowB; | __vector_pair rowB; | ||||
| vec_t *rb = (vec_t *) & BO[0]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[0])); | |||||
| __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); | ||||
| for (l = 1; l < temp; l++) | for (l = 1; l < temp; l++) | ||||
| { | { | ||||
| rowA = (vec_t *) & AO[l << 1]; | rowA = (vec_t *) & AO[l << 1]; | ||||
| rb = (vec_t *) & BO[l << 2]; | |||||
| __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); | |||||
| rowB = *((__vector_pair *)((void *)&BO[l << 2])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | ||||
| } | } | ||||
| SAVE_ACC (&acc0, 0); | SAVE_ACC (&acc0, 0); | ||||
| @@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #elif HAVE_KERNEL_4x4_VEC | #elif HAVE_KERNEL_4x4_VEC | ||||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||||
| typedef __vector unsigned char vec_t; | |||||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | |||||
| BLASLONG i; | |||||
| FLOAT *a0, *a1, *a2, *a3; | |||||
| a0 = ap; | |||||
| a1 = ap + lda; | |||||
| a2 = a1 + lda; | |||||
| a3 = a2 + lda; | |||||
| __vector_quad acc0, acc1, acc2, acc3;; | |||||
| __vector_quad acc4, acc5, acc6, acc7; | |||||
| v4sf_t result[4]; | |||||
| __vector_pair *Va0, *Va1, *Va2, *Va3; | |||||
| i = 0; | |||||
| n = n << 1; | |||||
| __builtin_mma_xxsetaccz (&acc0); | |||||
| __builtin_mma_xxsetaccz (&acc1); | |||||
| __builtin_mma_xxsetaccz (&acc2); | |||||
| __builtin_mma_xxsetaccz (&acc3); | |||||
| __builtin_mma_xxsetaccz (&acc4); | |||||
| __builtin_mma_xxsetaccz (&acc5); | |||||
| __builtin_mma_xxsetaccz (&acc6); | |||||
| __builtin_mma_xxsetaccz (&acc7); | |||||
| while (i < n) { | |||||
| vec_t *rx = (vec_t *) & x[i]; | |||||
| Va0 = ((__vector_pair*)((void*)&a0[i])); | |||||
| Va1 = ((__vector_pair*)((void*)&a1[i])); | |||||
| Va2 = ((__vector_pair*)((void*)&a2[i])); | |||||
| Va3 = ((__vector_pair*)((void*)&a3[i])); | |||||
| __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]); | |||||
| __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]); | |||||
| __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]); | |||||
| __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]); | |||||
| __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]); | |||||
| __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]); | |||||
| __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]); | |||||
| __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]); | |||||
| __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]); | |||||
| __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]); | |||||
| i += 8; | |||||
| } | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||||
| register FLOAT temp_r0 = result[0][0] - result[1][1]; | |||||
| register FLOAT temp_i0 = result[0][1] + result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc4); | |||||
| temp_r0 += result[2][0] - result[3][1]; | |||||
| temp_i0 += result[2][1] + result[3][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc1); | |||||
| register FLOAT temp_r1 = result[0][0] - result[1][1]; | |||||
| register FLOAT temp_i1 = result[0][1] + result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc5); | |||||
| temp_r1 += result[2][0] - result[3][1]; | |||||
| temp_i1 += result[2][1] + result[3][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc2); | |||||
| register FLOAT temp_r2 = result[0][0] - result[1][1]; | |||||
| register FLOAT temp_i2 = result[0][1] + result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc6); | |||||
| temp_r2 += result[2][0] - result[3][1]; | |||||
| temp_i2 += result[2][1] + result[3][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc3); | |||||
| register FLOAT temp_r3 = result[0][0] - result[1][1]; | |||||
| register FLOAT temp_i3 = result[0][1] + result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc7); | |||||
| temp_r3 += result[2][0] - result[3][1]; | |||||
| temp_i3 += result[2][1] + result[3][0]; | |||||
| #else | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||||
| register FLOAT temp_r0 = result[0][0] + result[1][1]; | |||||
| register FLOAT temp_i0 = result[0][1] - result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc4); | |||||
| temp_r0 += result[2][0] + result[3][1]; | |||||
| temp_i0 += result[2][1] - result[3][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc1); | |||||
| register FLOAT temp_r1 = result[0][0] + result[1][1]; | |||||
| register FLOAT temp_i1 = result[0][1] - result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc5); | |||||
| temp_r1 += result[2][0] + result[3][1]; | |||||
| temp_i1 += result[2][1] - result[3][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc2); | |||||
| register FLOAT temp_r2 = result[0][0] + result[1][1]; | |||||
| register FLOAT temp_i2 = result[0][1] - result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc6); | |||||
| temp_r2 += result[2][0] + result[3][1]; | |||||
| temp_i2 += result[2][1] - result[3][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc3); | |||||
| register FLOAT temp_r3 = result[0][0] + result[1][1]; | |||||
| register FLOAT temp_i3 = result[0][1] - result[1][0]; | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc7); | |||||
| temp_r3 += result[2][0] + result[3][1]; | |||||
| temp_i3 += result[2][1] - result[3][0]; | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||||
| y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||||
| y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||||
| y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; | |||||
| y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; | |||||
| #else | |||||
| y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||||
| y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||||
| y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||||
| y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||||
| y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||||
| y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||||
| y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; | |||||
| y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; | |||||
| #endif | |||||
| } | |||||
| #else | |||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *a0, *a1, *a2, *a3; | FLOAT *a0, *a1, *a2, *a3; | ||||
| @@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||||
| #endif | #endif | ||||
| } | } | ||||
| #endif | |||||
| #else | #else | ||||
| static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { | ||||
| @@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f | |||||
| int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; | int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; | ||||
| BLASLONG n_count = n; | BLASLONG n_count = n; | ||||
| float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; | float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; | ||||
| #if defined(__clang__) | |||||
| for(;n_count>23;n_count-=24) COMPUTE(24) | |||||
| #else | |||||
| for(;n_count>23;n_count-=24) COMPUTE_n24 | for(;n_count>23;n_count-=24) COMPUTE_n24 | ||||
| #endif | |||||
| for(;n_count>19;n_count-=20) COMPUTE(20) | for(;n_count>19;n_count-=20) COMPUTE(20) | ||||
| for(;n_count>15;n_count-=16) COMPUTE(16) | for(;n_count>15;n_count-=16) COMPUTE(16) | ||||
| for(;n_count>11;n_count-=12) COMPUTE(12) | for(;n_count>11;n_count-=12) COMPUTE(12) | ||||
| @@ -319,14 +319,14 @@ | |||||
| REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, | REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, | ||||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | ||||
| COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | ||||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | |||||
| $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, | |||||
| $ U12, X, ABI12, Y | $ U12, X, ABI12, Y | ||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| COMPLEX CLADIV | COMPLEX CLADIV | ||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| REAL CLANHS, SLAMCH | REAL CLANHS, SLAMCH | ||||
| EXTERNAL CLADIV, LLSAME, CLANHS, SLAMCH | |||||
| EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH | |||||
| * .. | * .. | ||||
| * .. External Subroutines .. | * .. External Subroutines .. | ||||
| EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | ||||
| @@ -351,6 +351,7 @@ | |||||
| ILSCHR = .TRUE. | ILSCHR = .TRUE. | ||||
| ISCHUR = 2 | ISCHUR = 2 | ||||
| ELSE | ELSE | ||||
| ILSCHR = .TRUE. | |||||
| ISCHUR = 0 | ISCHUR = 0 | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -364,6 +365,7 @@ | |||||
| ILQ = .TRUE. | ILQ = .TRUE. | ||||
| ICOMPQ = 3 | ICOMPQ = 3 | ||||
| ELSE | ELSE | ||||
| ILQ = .TRUE. | |||||
| ICOMPQ = 0 | ICOMPQ = 0 | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -377,6 +379,7 @@ | |||||
| ILZ = .TRUE. | ILZ = .TRUE. | ||||
| ICOMPZ = 3 | ICOMPZ = 3 | ||||
| ELSE | ELSE | ||||
| ILZ = .TRUE. | |||||
| ICOMPZ = 0 | ICOMPZ = 0 | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -139,7 +139,7 @@ | |||||
| * ===================================================================== | * ===================================================================== | ||||
| * | * | ||||
| * .. Parameters .. | * .. Parameters .. | ||||
| DOUBLE PRECISION ZERO, HALF, ONE | |||||
| DOUBLE PRECISION ZERO, HALF, ONE, TWO | |||||
| PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, | PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, | ||||
| $ TWO = 2.0D0 ) | $ TWO = 2.0D0 ) | ||||
| DOUBLE PRECISION MULTPL | DOUBLE PRECISION MULTPL | ||||
| @@ -139,7 +139,7 @@ | |||||
| * ===================================================================== | * ===================================================================== | ||||
| * | * | ||||
| * .. Parameters .. | * .. Parameters .. | ||||
| REAL ZERO, HALF, ONE | |||||
| REAL ZERO, HALF, ONE, TWO | |||||
| PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, | PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, | ||||
| $ TWO = 2.0E+0 ) | $ TWO = 2.0E+0 ) | ||||
| REAL MULTPL | REAL MULTPL | ||||
| @@ -319,7 +319,7 @@ | |||||
| DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, | DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, | ||||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | ||||
| COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | ||||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | |||||
| $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, | |||||
| $ U12, X, ABI12, Y | $ U12, X, ABI12, Y | ||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| @@ -352,6 +352,7 @@ | |||||
| ILSCHR = .TRUE. | ILSCHR = .TRUE. | ||||
| ISCHUR = 2 | ISCHUR = 2 | ||||
| ELSE | ELSE | ||||
| ILSCHR = .TRUE. | |||||
| ISCHUR = 0 | ISCHUR = 0 | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -365,6 +366,7 @@ | |||||
| ILQ = .TRUE. | ILQ = .TRUE. | ||||
| ICOMPQ = 3 | ICOMPQ = 3 | ||||
| ELSE | ELSE | ||||
| ILQ = .TRUE. | |||||
| ICOMPQ = 0 | ICOMPQ = 0 | ||||
| END IF | END IF | ||||
| * | * | ||||
| @@ -378,6 +380,7 @@ | |||||
| ILZ = .TRUE. | ILZ = .TRUE. | ||||
| ICOMPZ = 3 | ICOMPZ = 3 | ||||
| ELSE | ELSE | ||||
| ILZ = .TRUE. | |||||
| ICOMPZ = 0 | ICOMPZ = 0 | ||||
| END IF | END IF | ||||
| * | * | ||||