From e3368cbf1881cdbe65ccc20803f5596bab2b4c08 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 16 Feb 2020 22:58:00 +0800 Subject: [PATCH 001/593] AVX512 STRMM kernel --- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 536 +++++++++++++++---- 1 file changed, 418 insertions(+), 118 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 6ca822b91..ec7570179 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -1,8 +1,152 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ -/* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 to assist prefetch, r11 = m_counter, r12 = k << 4(const), r13 = k_todo, r14 = b_head_pos(const), r15 = %1 + 3r12 */ #include "common.h" #include +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + #define BACKWARDS 1 +#else + #define BACKWARDS 0 +#endif +#define REC_POINTER_1(ptr) "salq $2,%%r13; subq %%r13,"#ptr"; sarq $2,%%r13;" /* REC_POINTER_n: ptr -= 4*n*r13; the shift pair scales r13 in place and then shifts it back */ +#define REC_POINTER_2(ptr) "salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;" +#define REC_POINTER_4(ptr) "salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;" +#define REC_POINTER_8(ptr) "salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;" +#define REC_POINTER_16(ptr) "salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;" +#define INC_POINTER_1(ptr) "sarq $2,%%r12; addq %%r12,"#ptr"; salq $2,%%r12;" /* INC_POINTER_n: ptr += (n/4)*r12; r12 is shifted back afterwards where it was shifted */ +#define INC_POINTER_2(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12;" +#define INC_POINTER_4(ptr) "addq %%r12,"#ptr";" +#define INC_POINTER_8(ptr) "leaq ("#ptr",%%r12,2),"#ptr";" +#define INC_POINTER_16(ptr) "leaq ("#ptr",%%r12,4),"#ptr";" /* ptr += 4*r12 via scaled-index lea */ 
+#define SET_POINTER(ptr,dim) REC_POINTER_##dim(ptr) INC_POINTER_##dim(ptr) /* applies REC_POINTER_dim and then INC_POINTER_dim to ptr */ +#define SET_PB_1 SET_POINTER(%1,1) +#define SET_PB_2 SET_POINTER(%1,2) +#define SET_PB_4 SET_POINTER(%1,4) +#define SET_PB_8 SET_POINTER(%1,4) +#define SET_PB_12 SET_POINTER(%1,4) +#define SET_PB_16 SET_POINTER(%1,4) +#define SET_PB_20 SET_POINTER(%1,4) +#define SET_PB_24 SET_POINTER(%1,4) /* every ndim of 8 and above reuses the dim-4 adjustment of %1 */ +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define START_SET_PAPB(mdim,ndim) SET_POINTER(%0,mdim) "movq %%r14,%1;" SET_PB_##ndim "leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;" + #define END_SET_PA(mdim) "" + #else + #define START_SET_PAPB(mdim,ndim) "movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;" + #define END_SET_PA(mdim) SET_POINTER(%0,mdim) + #endif +#else + #define START_SET_PAPB(mdim,ndim) "movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;" + #define END_SET_PA(mdim) "" +#endif +#define RECOVER_PA(mdim) REC_POINTER_##mdim(%0) /* moves %0 back by the r13-scaled amount only; no INC_POINTER step */ + +#if defined(TRMMKERNEL) && !defined(LEFT) + #if BACKWARDS == 1 + #define KERNEL_HEAD_C_n8(mdim) \ + KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $4,%4; addq $64,%%r15;" + #define KERNEL_HEAD_C_n12(mdim) KERNEL_HEAD_C_n8(mdim)\ + KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 "subq $4,%4; addq $64,%%r15;" + #define KERNEL_HEAD_C_n16(mdim) KERNEL_HEAD_C_n12(mdim)\ + KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 "subq $4,%4; addq $64,%%r15;" + #define KERNEL_HEAD_C_n20(mdim) KERNEL_HEAD_C_n16(mdim)\ + KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 "subq $4,%4;" + #define KERNEL_HEAD_C_n24(mdim) KERNEL_HEAD_C_n20(mdim)\ + KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 "subq $4,%4;" + #define KERNEL_HEAD_R_n4(mdim) "subq $12,%4; addq $64,%%r15; addq $"#mdim"*48,%0;" /* drops 12 from the k counter and advances %0 by mdim*48 bytes without issuing any FMA */ + #define KERNEL_HEAD_R_n8(mdim) KERNEL_HEAD_R_n4(mdim)\ + kernel_k1m##mdim##n4(%%r15) 
kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) "subq $4,%4;" + #define KERNEL_HEAD_R_n12(mdim) KERNEL_HEAD_R_n8(mdim)\ + kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) "subq $4,%4;" + #define KERNEL_TAIL_C_n8(mdim) "" + #define KERNEL_TAIL_C_n12(mdim) "" + #define KERNEL_TAIL_C_n16(mdim) "" + #define KERNEL_TAIL_C_n20(mdim) "" + #define KERNEL_TAIL_C_n24(mdim) "" + #define KERNEL_TAIL_R_n4(mdim) "" + #define KERNEL_TAIL_R_n8(mdim) "" + #define KERNEL_TAIL_R_n12(mdim) "" /* all KERNEL_TAIL_* expand to nothing in the BACKWARDS == 1 branch */ + #else + #define KERNEL_HEAD_C_n8(mdim) "" + #define KERNEL_HEAD_C_n12(mdim) "" + #define KERNEL_HEAD_C_n16(mdim) "" + #define KERNEL_HEAD_C_n20(mdim) "" + #define KERNEL_HEAD_C_n24(mdim) "" + #define KERNEL_HEAD_R_n4(mdim) "" + #define KERNEL_HEAD_R_n8(mdim) "" + #define KERNEL_HEAD_R_n12(mdim) "" /* all KERNEL_HEAD_* expand to nothing in this #else branch; the work is done by KERNEL_TAIL_* instead */ + #define end_kernel_k4_ncx1(k_0,k_1,k_2,k_3,n1,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) + #define end_kernel_k4_ncx2(k_0,k_1,k_2,k_3,n1,n2,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) + #define end_kernel_k4_ncx3(k_0,k_1,k_2,k_3,n1,n2,n3,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0) end_acc_nc##n3##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1) end_acc_nc##n3##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) 
end_acc_nc##n2##_k1m##mdim(k_2) end_acc_nc##n3##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) end_acc_nc##n3##_k1m##mdim(k_3) + #define end_kernel_k4_ncx4(k_0,k_1,k_2,k_3,n1,n2,n3,n4,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0) end_acc_nc##n3##_k1m##mdim(k_0) end_acc_nc##n4##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1) end_acc_nc##n3##_k1m##mdim(k_1) end_acc_nc##n4##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2) end_acc_nc##n3##_k1m##mdim(k_2) end_acc_nc##n4##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) end_acc_nc##n3##_k1m##mdim(k_3) end_acc_nc##n4##_k1m##mdim(k_3) + #define end_kernel_k4_ncx5(k_0,k_1,k_2,k_3,n1,n2,n3,n4,n5,mdim) \ + end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0)\ + end_acc_nc##n3##_k1m##mdim(k_0) end_acc_nc##n4##_k1m##mdim(k_0) end_acc_nc##n5##_k1m##mdim(k_0)\ + end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1)\ + end_acc_nc##n3##_k1m##mdim(k_1) end_acc_nc##n4##_k1m##mdim(k_1) end_acc_nc##n5##_k1m##mdim(k_1)\ + end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2)\ + end_acc_nc##n3##_k1m##mdim(k_2) end_acc_nc##n4##_k1m##mdim(k_2) end_acc_nc##n5##_k1m##mdim(k_2)\ + end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3)\ + end_acc_nc##n3##_k1m##mdim(k_3) end_acc_nc##n4##_k1m##mdim(k_3) end_acc_nc##n5##_k1m##mdim(k_3) + #define KERNEL_TAIL_C_n8(mdim) end_kernel_k4_ncx1(0,1,2,3,2,mdim) /* four extra k steps (offsets 0..3) accumulated into column group 2 only */ + #define KERNEL_TAIL_C_n12(mdim) \ + end_kernel_k4_ncx2(0,1,2,3,2,3,mdim) end_kernel_k4_ncx1(4,5,6,7,3,mdim) + #define KERNEL_TAIL_C_n16(mdim) \ + end_kernel_k4_ncx3(0,1,2,3,2,3,4,mdim) 
end_kernel_k4_ncx2(4,5,6,7,3,4,mdim) end_kernel_k4_ncx1(8,9,10,11,4,mdim) + #define KERNEL_TAIL_C_n20(mdim) \ + end_kernel_k4_ncx4(0,1,2,3,2,3,4,5,mdim) end_kernel_k4_ncx3(4,5,6,7,3,4,5,mdim)\ + end_kernel_k4_ncx2(8,9,10,11,4,5,mdim) end_kernel_k4_ncx1(12,13,14,15,5,mdim) + #define KERNEL_TAIL_C_n24(mdim) \ + end_kernel_k4_ncx5(0,1,2,3,2,3,4,5,6,mdim) end_kernel_k4_ncx4(4,5,6,7,3,4,5,6,mdim) end_kernel_k4_ncx3(8,9,10,11,4,5,6,mdim)\ + end_kernel_k4_ncx2(12,13,14,15,5,6,mdim) end_kernel_k4_ncx1(16,17,18,19,6,mdim) + #define KERNEL_TAIL_R_n4(mdim) \ + end_kernel_k4_ncx1(0,1,2,3,4,mdim) end_kernel_k4_ncx1(4,5,6,7,4,mdim) end_kernel_k4_ncx1(8,9,10,11,4,mdim) + #define KERNEL_TAIL_R_n8(mdim) \ + end_kernel_k4_ncx2(0,1,2,3,4,5,mdim) end_kernel_k4_ncx2(4,5,6,7,4,5,mdim) end_kernel_k4_ncx2(8,9,10,11,4,5,mdim) end_kernel_k4_ncx1(12,13,14,15,5,mdim) + #define KERNEL_TAIL_R_n12(mdim) \ + end_kernel_k4_ncx3(0,1,2,3,4,5,6,mdim) end_kernel_k4_ncx3(4,5,6,7,4,5,6,mdim) end_kernel_k4_ncx3(8,9,10,11,4,5,6,mdim)\ + end_kernel_k4_ncx2(12,13,14,15,5,6,mdim) end_kernel_k4_ncx1(16,17,18,19,6,mdim) + #endif +#else + #define KERNEL_HEAD_C_n8(mdim) "" + #define KERNEL_HEAD_C_n12(mdim) "" + #define KERNEL_HEAD_C_n16(mdim) "" + #define KERNEL_HEAD_C_n20(mdim) "" + #define KERNEL_HEAD_C_n24(mdim) "" + #define KERNEL_HEAD_R_n4(mdim) "" + #define KERNEL_HEAD_R_n8(mdim) "" + #define KERNEL_HEAD_R_n12(mdim) "" + #define KERNEL_TAIL_C_n8(mdim) "" + #define KERNEL_TAIL_C_n12(mdim) "" + #define KERNEL_TAIL_C_n16(mdim) "" + #define KERNEL_TAIL_C_n20(mdim) "" + #define KERNEL_TAIL_C_n24(mdim) "" + #define KERNEL_TAIL_R_n4(mdim) "" + #define KERNEL_TAIL_R_n8(mdim) "" + #define KERNEL_TAIL_R_n12(mdim) "" /* head and tail hooks are all empty unless TRMMKERNEL is defined without LEFT */ +#endif +#define KERNEL_HEAD_C_n1(mdim) "" +#define KERNEL_HEAD_C_n2(mdim) "" +#define KERNEL_HEAD_C_n4(mdim) "" +#define KERNEL_TAIL_C_n1(mdim) "" +#define KERNEL_TAIL_C_n2(mdim) "" +#define KERNEL_TAIL_C_n4(mdim) "" /* n = 1, 2 and 4 never need head or tail code; defined unconditionally */ /* m = 16 */ /* zmm8-zmm31 for accumulators, zmm1-zmm7 for temporary use, zmm0 for 
alpha */ #define KERNEL_k1m16n1 \ @@ -15,9 +159,10 @@ #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;" #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" -#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \ - "vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ - "vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" +#define unit_gen_kernel_k1m16n4(c1,c2,c3,c4,k_no,...) \ + "vbroadcastsd "#k_no"*16 ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ + "vbroadcastsd "#k_no"*16+8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" +#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) unit_gen_kernel_k1m16n4(c1,c2,c3,c4,0,__VA_ARGS__) /* k_no = 0 gives displacements 0 and 8, as in the removed definition */ #define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) #define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;" #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) @@ -28,6 +173,12 @@ #define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;" #define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) #define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;" +#define end_load_a_k1m16(k_no) "vmovsldup "#k_no"*64(%0),%%zmm4; vmovshdup "#k_no"*64(%0),%%zmm5;" /* reads A at byte offset k_no*64 from %0 and leaves %0 unchanged */ +#define end_acc_nc2_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,k_no,%%r15) +#define end_acc_nc5_k1m16(k_no) 
unit_gen_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,k_no,%%r15,%%r12,2) /* end_acc_ncN_k1m16 targets the same accumulators as the N-th column group of KERNEL_h_k1m16n24 */ #define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" #define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;" #define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" @@ -38,11 +189,19 @@ #define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23) #define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27) #define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31) -#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" -#define unit_save_m16n2(c1,c2) \ +#ifdef TRMMKERNEL + #define SAVE_h_m16n1 "vmulps %%zmm8,%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" /* TRMM: stores zmm0*accumulator; the previous contents of C are not read */ + #define unit_save_m16n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ + "vmulps %%zmm4,%%zmm0,%%zmm4; vmulps %%zmm5,%%zmm0,%%zmm5;"\ + "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#else + #define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" /* GEMM: computes zmm0*accumulator + C and writes it back to C */ + #define unit_save_m16n2(c1,c2) \ "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#endif #define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9) #define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11) #define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15) @@ -52,8 +211,9 @@ #define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31) #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" #define COMPUTE_m16(ndim) \ - 
INIT_m16n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; xorq %%r10,%%r10;"\ + INIT_m16n##ndim START_SET_PAPB(16,ndim)\ + "movq %%r13,%4; movq %2,%5; xorq %%r10,%%r10;"\ + KERNEL_HEAD_C_n##ndim(16)\ "cmpq $16,%4; jb "#ndim"016162f;"\ #ndim"016161:\n\t"\ "cmpq $126,%%r10; movq $126,%%r10; cmoveq %3,%%r10;"\ @@ -72,28 +232,41 @@ KERNEL_k1m16n##ndim\ "leaq (%5,%3,2),%5; decq %4; jnz "#ndim"016163b;"\ #ndim"016164:\n\t"\ + KERNEL_TAIL_C_n##ndim(16)\ "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ - SAVE_m16(ndim) + SAVE_m16(ndim) END_SET_PA(16) /* END_SET_PA is non-empty only when TRMMKERNEL is defined and BACKWARDS is 0 */ /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ -#define KERNEL_k1m8n1(b_addr) \ +#define kernel_k1m8n1(b_addr) \ "vmovups (%0),%%ymm1; addq $32,%0;"\ "vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ "addq $4,"#b_addr";" -#define KERNEL_h_k1m8n2(b_addr) \ +#define kernel_h_k1m8n2(b_addr) \ "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ "vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" -#define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";" -#define KERNEL_h_k1m8n4(b_addr) \ - KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" -#define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";" -#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) 
\ - "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ - "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" -#define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1) -#define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";" -#define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2) -#define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";" +#define kernel_k1m8n2(b_addr) kernel_h_k1m8n2(b_addr) "addq $8,"#b_addr";" /* lower-case kernel_* macros take the B address register as a parameter and advance it here by 8 bytes */ +#define kernel_h_k1m8n4(b_addr) \ + kernel_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define kernel_k1m8n4(b_addr) kernel_h_k1m8n4(b_addr) "addq $16,"#b_addr";" /* one k step for 4 columns: the _h_ body plus a 16-byte advance of b_addr */ +#define unit_gen_kernel_k1m8n4(c1,c2,c3,c4,k_no,...) \ + "vbroadcastsd "#k_no"*16 ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#k_no"*16+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) 
unit_gen_kernel_k1m8n4(c1,c2,c3,c4,0,__VA_ARGS__) +#define kernel_h_k1m8n8(b_addr) kernel_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1) +#define kernel_k1m8n8(b_addr) kernel_h_k1m8n8(b_addr) "addq $16,"#b_addr";" +#define kernel_h_k1m8n12(b_addr) kernel_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2) +#define kernel_k1m8n12(b_addr) kernel_h_k1m8n12(b_addr) "addq $16,"#b_addr";" +#define KERNEL_k1m8n1 kernel_k1m8n1(%1) +#define KERNEL_k1m8n2 kernel_k1m8n2(%1) +#define KERNEL_k1m8n4 kernel_k1m8n4(%1) +#define KERNEL_k1m8n8 kernel_k1m8n8(%1) +#define KERNEL_k1m8n12 kernel_k1m8n12(%1) /* upper-case, argument-free wrappers bind the B address to %1 so KERNEL_HEAD_C_* can paste them by name */ +#define end_load_a_k1m8(k_no) "vmovsldup "#k_no"*32(%0),%%ymm1; vmovshdup "#k_no"*32(%0),%%ymm2;" /* reads A at byte offset k_no*32 from %0 and leaves %0 unchanged */ +#define end_acc_nc2_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,k_no,%%r15) +#define end_acc_nc5_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,k_no,%%r15,%%r12,2) /* nc4..nc6 address B through r15 and reuse the ymm4-ymm15 accumulators of nc1..nc3 */ #define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" #define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" #define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" @@ -101,12 +274,21 @@ "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" #define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) #define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" -#define unit_save_m8n2(c1,c2) \ +#ifdef TRMMKERNEL + #define SAVE_L_m8n1 "vmulps %%ymm4,%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" /* TRMM: stores ymm0*accumulator; the previous contents of C are not read */ + #define unit_save_m8n2(c1,c2) \ "vunpcklps 
"#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\ - "vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5), %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\ - "vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\ + "vunpcklpd %%ymm3,%%ymm2,%%ymm1; vmulps %%ymm1,%%ymm0,%%ymm1; vmovups %%ymm1,(%5);"\ + "vunpckhpd %%ymm3,%%ymm2,%%ymm1; vmulps %%ymm1,%%ymm0,%%ymm1; vmovups %%ymm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" +#else + #define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" /* GEMM: computes ymm0*accumulator + C and writes it back to C */ + #define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\ + "vunpcklpd %%ymm3,%%ymm2,%%ymm1; vfmadd213ps (%5), %%ymm0,%%ymm1; vmovups %%ymm1,(%5);"\ + "vunpckhpd %%ymm3,%%ymm2,%%ymm1; vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1; vmovups %%ymm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#endif #define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) #define SAVE_L_m8n4 SAVE_L_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) #define SAVE_L_m8n8 SAVE_L_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) @@ -115,53 +297,68 @@ #define SAVE_R_m8n8 SAVE_R_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) #define SAVE_R_m8n12 SAVE_R_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) #define COMPUTE_L_m8(ndim,sim) \ - INIT_m8n##ndim\ - "movq %%r13,%4; movq %%r14,%1;"\ - #ndim""#sim"882:\n\t"\ + INIT_m8n##ndim START_SET_PAPB(8,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(8)\ "testq %4,%4; jz "#ndim""#sim"883f;"\ - KERNEL_k1m8n##ndim(%1)\ - "decq %4; jmp "#ndim""#sim"882b;"\ + #ndim""#sim"882:\n\t"\ + kernel_k1m8n##ndim(%1)\ + "decq %4; jnz "#ndim""#sim"882b;"\ #ndim""#sim"883:\n\t"\ + KERNEL_TAIL_C_n##ndim(8)\ SAVE_L_m8n##ndim "addq $32,%2;" /* COMPUTE_L_m8 ends without END_SET_PA; a caller that needs it must append it */ #define COMPUTE_R_m8(ndim,sim) \ - "subq %%r12,%0; subq %%r12,%0;"\ - INIT_m8n##ndim\ - "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ - #ndim""#sim"882:\n\t"\ + INIT_m8n##ndim RECOVER_PA(8)\ + "movq %%r13,%4;"\ + 
KERNEL_HEAD_R_n##ndim(8)\ "testq %4,%4; jz "#ndim""#sim"883f;"\ - KERNEL_k1m8n##ndim(%%r15)\ - "decq %4; jmp "#ndim""#sim"882b;"\ + #ndim""#sim"882:\n\t"\ + kernel_k1m8n##ndim(%%r15)\ + "decq %4; jnz "#ndim""#sim"882b;"\ #ndim""#sim"883:\n\t"\ - SAVE_R_m8n##ndim -#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) -#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) -#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) -#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) -#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) + KERNEL_TAIL_R_n##ndim(8)\ + SAVE_R_m8n##ndim END_SET_PA(8) +#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) END_SET_PA(8) /* n <= 12 uses COMPUTE_L_m8 alone, so END_SET_PA is appended here */ +#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) END_SET_PA(8) +#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) END_SET_PA(8) +#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) END_SET_PA(8) +#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) END_SET_PA(8) /* for n >= 16 the trailing COMPUTE_R_m8 already ends with END_SET_PA(8) */ #define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933) #define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933) #define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933) #define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim /* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ -#define KERNEL_k1m4n1(b_addr) \ +#define kernel_k1m4n1(b_addr) \ "vmovups (%0),%%xmm1; addq $16,%0;"\ "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ "addq $4,"#b_addr";" -#define KERNEL_h_k1m4n2(b_addr) \ +#define kernel_h_k1m4n2(b_addr) \ "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ "vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" -#define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";" -#define KERNEL_h_k1m4n4(b_addr) \ - KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" -#define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";" -#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) 
\ - "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ - "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" -#define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1) -#define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";" -#define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2) -#define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";" +#define kernel_k1m4n2(b_addr) kernel_h_k1m4n2(b_addr) "addq $8,"#b_addr";" /* lower-case kernel_* macros take the B address register as a parameter and advance it here by 8 bytes */ +#define kernel_h_k1m4n4(b_addr) \ + kernel_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define kernel_k1m4n4(b_addr) kernel_h_k1m4n4(b_addr) "addq $16,"#b_addr";" /* one k step for 4 columns: the _h_ body plus a 16-byte advance of b_addr */ +#define unit_gen_kernel_k1m4n4(c1,c2,c3,c4,k_no,...) \ + "vmovddup "#k_no"*16 ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup "#k_no"*16+8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) 
unit_gen_kernel_k1m4n4(c1,c2,c3,c4,0,__VA_ARGS__) +#define kernel_h_k1m4n8(b_addr) kernel_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1) +#define kernel_k1m4n8(b_addr) kernel_h_k1m4n8(b_addr) "addq $16,"#b_addr";" +#define kernel_h_k1m4n12(b_addr) kernel_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2) +#define kernel_k1m4n12(b_addr) kernel_h_k1m4n12(b_addr) "addq $16,"#b_addr";" +#define KERNEL_k1m4n1 kernel_k1m4n1(%1) +#define KERNEL_k1m4n2 kernel_k1m4n2(%1) +#define KERNEL_k1m4n4 kernel_k1m4n4(%1) +#define KERNEL_k1m4n8 kernel_k1m4n8(%1) +#define KERNEL_k1m4n12 kernel_k1m4n12(%1) /* upper-case, argument-free wrappers bind the B address to %1 so KERNEL_HEAD_C_* can paste them by name */ +#define end_load_a_k1m4(k_no) "vmovsldup "#k_no"*16(%0),%%xmm1; vmovshdup "#k_no"*16(%0),%%xmm2;" /* reads A at byte offset k_no*16 from %0 and leaves %0 unchanged */ +#define end_acc_nc2_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm4,%%xmm5,%%xmm6,%%xmm7,k_no,%%r15) +#define end_acc_nc5_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,k_no,%%r15,%%r12,2) /* nc4..nc6 address B through r15 and reuse the xmm4-xmm15 accumulators of nc1..nc3 */ #define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" #define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" @@ -169,12 +366,21 @@ "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" #define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) #define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) -#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" -#define unit_save_m4n2(c1,c2) \ +#ifdef TRMMKERNEL + #define SAVE_L_m4n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" /* TRMM: stores xmm0*accumulator; the previous contents of C are not read */ + #define unit_save_m4n2(c1,c2) \ + "vunpcklps 
"#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\ + "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vmulps %%xmm1,%%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\ + "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vmulps %%xmm1,%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#else + #define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" /* GEMM: computes xmm0*accumulator + C and writes it back to C */ + #define unit_save_m4n2(c1,c2) \ "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\ "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5), %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\ "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" +#endif #define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) #define SAVE_L_m4n4 SAVE_L_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) #define SAVE_L_m4n8 SAVE_L_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) @@ -183,29 +389,32 @@ #define SAVE_R_m4n8 SAVE_R_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) #define SAVE_R_m4n12 SAVE_R_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) #define COMPUTE_L_m4(ndim,sim) \ - INIT_m4n##ndim\ - "movq %%r13,%4; movq %%r14,%1;"\ - #ndim""#sim"442:\n\t"\ + INIT_m4n##ndim START_SET_PAPB(4,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(4)\ "testq %4,%4; jz "#ndim""#sim"443f;"\ - KERNEL_k1m4n##ndim(%1)\ - "decq %4; jmp "#ndim""#sim"442b;"\ + #ndim""#sim"442:\n\t"\ + kernel_k1m4n##ndim(%1)\ + "decq %4; jnz "#ndim""#sim"442b;"\ #ndim""#sim"443:\n\t"\ + KERNEL_TAIL_C_n##ndim(4)\ SAVE_L_m4n##ndim "addq $16,%2;" /* COMPUTE_L_m4 ends without END_SET_PA; a caller that needs it must append it */ #define COMPUTE_R_m4(ndim,sim) \ - "subq %%r12,%0;"\ - INIT_m4n##ndim\ - "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\ - #ndim""#sim"442:\n\t"\ + INIT_m4n##ndim RECOVER_PA(4)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_R_n##ndim(4)\ "testq %4,%4; jz "#ndim""#sim"443f;"\ - KERNEL_k1m4n##ndim(%%r15)\ - "decq %4; jmp "#ndim""#sim"442b;"\ + #ndim""#sim"442:\n\t"\ + kernel_k1m4n##ndim(%%r15)\ + "decq %4; jnz "#ndim""#sim"442b;"\ 
#ndim""#sim"443:\n\t"\ - SAVE_R_m4n##ndim -#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) -#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) -#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) -#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) -#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) + KERNEL_TAIL_R_n##ndim(4)\ + SAVE_R_m4n##ndim END_SET_PA(4) +#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) END_SET_PA(4) +#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) END_SET_PA(4) +#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) END_SET_PA(4) +#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) END_SET_PA(4) +#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) END_SET_PA(4) #define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955) #define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955) #define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955) @@ -217,40 +426,60 @@ "vmovsd (%0),%%xmm1; addq $8,%0;"\ "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ "addq $4,%1;" -#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define KERNEL_k1m2n2 \ "vmovsd (%0),%%xmm1; addq $8,%0;"\ "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ "addq $8,%1;" -#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#ifdef TRMMKERNEL + #define SAVE_h_m2n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" + #define SAVE_h_m2n2 SAVE_h_m2n1 "vmulps %%xmm5,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#else + #define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" + #define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#endif #define INIT_m2n4 INIT_m2n2 #define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" #define INIT_m2n12 INIT_m2n8 
"vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" #define INIT_m2n16 INIT_m2n12 "vpxor %%xmm10,%%xmm10,%%xmm10; vpxor %%xmm11,%%xmm11,%%xmm11;" #define INIT_m2n20 INIT_m2n16 "vpxor %%xmm12,%%xmm12,%%xmm12; vpxor %%xmm13,%%xmm13,%%xmm13;" #define INIT_m2n24 INIT_m2n20 "vpxor %%xmm14,%%xmm14,%%xmm14; vpxor %%xmm15,%%xmm15,%%xmm15;" +#define unit_gen_kernel_k1m2n4(c1,c2,k_no,...) \ + "vmovups "#k_no"*16("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";" #define KERNEL_h_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;"\ - "vmovups (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;" unit_gen_kernel_k1m2n4(%%xmm4,%%xmm5,0,%1) #define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" -#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 "vmovups (%1,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 unit_gen_kernel_k1m2n4(%%xmm6,%%xmm7,0,%1,%%r12,1) #define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%1;" -#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 \ - "vmovups (%1,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm8; vfmadd231ps %%xmm2,%%xmm3,%%xmm9; addq $16,%1;" -#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 "vmovups (%%r15),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm10; vfmadd231ps %%xmm2,%%xmm3,%%xmm11;" +#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 unit_gen_kernel_k1m2n4(%%xmm8,%%xmm9,0,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 unit_gen_kernel_k1m2n4(%%xmm10,%%xmm11,0,%%r15) #define KERNEL_k1m2n16 KERNEL_h_k1m2n16 "addq $16,%%r15;" -#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 "vmovups (%%r15,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm12; vfmadd231ps %%xmm2,%%xmm3,%%xmm13;" +#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 unit_gen_kernel_k1m2n4(%%xmm12,%%xmm13,0,%%r15,%%r12,1) #define KERNEL_k1m2n20 KERNEL_h_k1m2n20 "addq $16,%%r15;" 
-#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 "vmovups (%%r15,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm14; vfmadd231ps %%xmm2,%%xmm3,%%xmm15;" +#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 unit_gen_kernel_k1m2n4(%%xmm14,%%xmm15,0,%%r15,%%r12,2) #define KERNEL_k1m2n24 KERNEL_h_k1m2n24 "addq $16,%%r15;" -#define unit_save_m2n4(c1,c2) \ +#define end_load_a_k1m2(k_no) "vbroadcastss "#k_no"*8(%0),%%xmm1; vbroadcastss "#k_no"*8+4(%0),%%xmm2;" +#define end_acc_nc2_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm6,%%xmm7,k_no,%1,%%r12,1) +#define end_acc_nc3_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm8,%%xmm9,k_no,%1,%%r12,2) +#define end_acc_nc4_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm10,%%xmm11,k_no,%%r15) +#define end_acc_nc5_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm12,%%xmm13,k_no,%%r15,%%r12,1) +#define end_acc_nc6_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm14,%%xmm15,k_no,%%r15,%%r12,2) +#ifdef TRMMKERNEL + #define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmulps %%xmm1,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmulps %%xmm2,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#else + #define unit_save_m2n4(c1,c2) \ "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ "leaq (%5,%3,2),%5;"\ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ "leaq (%5,%3,2),%5;" +#endif #define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) #define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) #define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) @@ -259,13 +488,15 @@ #define SAVE_h_m2n24 SAVE_h_m2n20 unit_save_m2n4(%%xmm14,%%xmm15) #define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $8,%2;" #define COMPUTE_m2(ndim) \ - 
INIT_m2n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + INIT_m2n##ndim START_SET_PAPB(2,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(2)\ "testq %4,%4; jz "#ndim"002022f;"\ #ndim"002021:\n\t"\ KERNEL_k1m2n##ndim "decq %4; jnz "#ndim"002021b;"\ #ndim"002022:\n\t"\ - SAVE_m2(ndim) + KERNEL_TAIL_C_n##ndim(2)\ + SAVE_m2(ndim) END_SET_PA(2) /* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" @@ -273,15 +504,25 @@ "vmovss (%1),%%xmm3; addq $4,%1;"\ "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ "addq $4,%0;" -#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#ifdef TRMMKERNEL + #define SAVE_h_m1n1 "vmulss %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#else + #define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#endif #define INIT_m1n2 INIT_m1n1 #define KERNEL_k1m1n2 \ "vmovsd (%1),%%xmm3; addq $8,%1;"\ "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ "addq $4,%0;" -#define SAVE_h_m1n2 \ +#ifdef TRMMKERNEL + #define SAVE_h_m1n2 \ + "vmulps %%xmm4,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#else + #define SAVE_h_m1n2 \ "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#endif #define INIT_m1n4 INIT_m1n2 #define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" @@ -300,12 +541,25 @@ #define KERNEL_k1m1n20 KERNEL_h_k1m1n20 "addq $16,%%r15;" #define KERNEL_h_k1m1n24 KERNEL_h_k1m1n20 "vfmadd231ps (%%r15,%%r12,2),%%xmm1,%%xmm9;" #define KERNEL_k1m1n24 KERNEL_h_k1m1n24 "addq $16,%%r15;" -#define unit_save_m1n4(c1) \ +#define end_load_a_k1m1(k_no) "vbroadcastss "#k_no"*4(%0),%%xmm1;" +#define end_acc_nc2_k1m1(k_no) "vfmadd231ps "#k_no"*16(%1,%%r12,1),%%xmm1,%%xmm5;" +#define 
end_acc_nc3_k1m1(k_no) "vfmadd231ps "#k_no"*16(%1,%%r12,2),%%xmm1,%%xmm6;" +#define end_acc_nc4_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15),%%xmm1,%%xmm7;" +#define end_acc_nc5_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15,%%r12,1),%%xmm1,%%xmm8;" +#define end_acc_nc6_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15,%%r12,2),%%xmm1,%%xmm9;" +#ifdef TRMMKERNEL + #define unit_save_m1n4(c1) \ + "vmulps "#c1",%%xmm0,"#c1"; vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#else + #define unit_save_m1n4(c1) \ "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#endif #define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) #define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%xmm5) #define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%xmm6) @@ -314,58 +568,102 @@ #define SAVE_h_m1n24 SAVE_h_m1n20 unit_save_m1n4(%%xmm9) #define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $4,%2;" #define COMPUTE_m1(ndim) \ - INIT_m1n##ndim\ - "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + INIT_m1n##ndim START_SET_PAPB(1,ndim)\ + "movq %%r13,%4;"\ + KERNEL_HEAD_C_n##ndim(1)\ "testq %4,%4; jz "#ndim"001012f;"\ #ndim"001011:\n\t"\ KERNEL_k1m1n##ndim "decq %4; jnz "#ndim"001011b;"\ #ndim"001012:\n\t"\ - SAVE_m1(ndim) + KERNEL_TAIL_C_n##ndim(1)\ + SAVE_m1(ndim) END_SET_PA(1) -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ -/* %6 = "+r"(next_b), %7 
= "m"(ALPHA), %8 = "m"(M) */ -/* r11 = m_counter, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ +/* %7 = "m"(ALPHA), %8 = "m"(M), %9 = "m"(K), %10 = "m"(off) */ +#ifdef TRMMKERNEL + #if BACKWARDS == 1 + #define OFFSET_TO_K "movq %9,%%r13; subq %10,%%r13;" + #else + #define OFFSET_TO_K "movq %10,%%r13;" + #endif +#else + #define OFFSET_TO_K "movq %9,%%r13;" +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + #if BACKWARDS == 1 + #define START_UPDATE_OFFSET(ndim) {} + #define END_UPDATE_OFFSET(ndim) {off += (ndim);} + #else + #define START_UPDATE_OFFSET(ndim) {off += (ndim)>4 ? 4:(ndim);} + #define END_UPDATE_OFFSET(ndim) {off += (ndim)>4 ? ((ndim)-4):0;} + #endif +#else + #define START_UPDATE_OFFSET(ndim) {} + #define END_UPDATE_OFFSET(ndim) {} +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + #if BACKWARDS == 1 + #define START_UPDATE_K(mdim) "" + #define END_UPDATE_K(mdim) "subq $"#mdim",%%r13;" + #else + #define START_UPDATE_K(mdim) "addq $"#mdim",%%r13;" + #define END_UPDATE_K(mdim) "" + #endif +#else + #define START_UPDATE_K(mdim) "" + #define END_UPDATE_K(mdim) "" +#endif #define COMPUTE(ndim) {\ - next_b = b_pointer + ndim * K;\ - __asm__ __volatile__(\ + next_b = b_pointer + ndim * K; START_UPDATE_OFFSET(ndim)\ + __asm__ __volatile__(\ "vbroadcastss %7,%%zmm0;"\ - "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\ + OFFSET_TO_K "movq %9,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\ "cmpq $16,%%r11;jb 33101"#ndim"f;"\ "33109"#ndim":\n\t"\ - COMPUTE_m16(ndim)\ + START_UPDATE_K(16) COMPUTE_m16(ndim) END_UPDATE_K(16)\ "subq $16,%%r11;cmpq $16,%%r11;jnb 33109"#ndim"b;"\ "33101"#ndim":\n\t"\ "cmpq $8,%%r11;jb 33102"#ndim"f;"\ - COMPUTE_m8(ndim)\ + START_UPDATE_K(8) COMPUTE_m8(ndim) END_UPDATE_K(8)\ "subq $8,%%r11;"\ "33102"#ndim":\n\t"\ "cmpq $4,%%r11;jb 33103"#ndim"f;"\ - COMPUTE_m4(ndim)\ + START_UPDATE_K(4) COMPUTE_m4(ndim) END_UPDATE_K(4)\ "subq $4,%%r11;"\ "33103"#ndim":\n\t"\ "cmpq 
$2,%%r11;jb 33104"#ndim"f;"\ - COMPUTE_m2(ndim)\ + START_UPDATE_K(2) COMPUTE_m2(ndim) END_UPDATE_K(2)\ "subq $2,%%r11;"\ "33104"#ndim":\n\t"\ "testq %%r11,%%r11;jz 33105"#ndim"f;"\ - COMPUTE_m1(ndim)\ + START_UPDATE_K(1) COMPUTE_m1(ndim) END_UPDATE_K(1)\ "33105"#ndim":\n\t"\ - "movq %%r13,%4; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b):"m"(ALPHA),"m"(M)\ - :"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ - "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ - "cc","memory");\ - a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M;\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_counter),"+r"(ctemp),"+r"(next_b)\ + :"m"(ALPHA),"m"(M),"m"(K),"m"(off):"r10","r11","r12","r13","r14","r15","cc","memory",\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ + "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M; END_UPDATE_OFFSET(ndim)\ } int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC +#ifdef TRMMKERNEL +,BLASLONG offset +#endif +) { - if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + if(m==0||n==0) return 0; int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; - int64_t M = (int64_t)m, K = (int64_t)k; + int64_t M = 
(int64_t)m, K = (int64_t)k, k_counter = K, off = 0; BLASLONG n_count = n; float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; +#ifdef TRMMKERNEL + #ifdef LEFT + off = offset; + #else + off = -offset; + #endif +#endif for(;n_count>23;n_count-=24) COMPUTE(24) for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) @@ -376,5 +674,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f if(n_count>0) COMPUTE(1) return 0; } -#include -#include "sgemm_direct_skylakex.c" +#ifndef TRMMKERNEL + #include + #include "sgemm_direct_skylakex.c" +#endif From f566787e6e15fb7c6fc563c1c3b5b66b865aeb77 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 16 Feb 2020 22:58:44 +0800 Subject: [PATCH 002/593] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index dcd201649..9b3c83e42 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,7 +1,7 @@ include $(KERNELDIR)/KERNEL.HASWELL SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c -STRMMKERNEL = sgemm_kernel_16x4_haswell.S +STRMMKERNEL = sgemm_kernel_16x4_skylakex_2.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c From b0558c11b9b1eccdd84dbdee225dc41efb07a390 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 16 Feb 2020 23:01:31 +0800 Subject: [PATCH 003/593] Update param.h --- param.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/param.h b/param.h index e6ab93aa5..e479314d9 100644 --- a/param.h +++ b/param.h @@ -1722,16 +1722,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 -#define ZGEMM3M_DEFAULT_UNROLL_N 8 -#define ZGEMM3M_DEFAULT_UNROLL_M 2 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 -#define CGEMM3M_DEFAULT_P 448 -#define ZGEMM3M_DEFAULT_P 224 +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 -#define ZGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 #define ZGEMM3M_DEFAULT_R 12288 From f6fcbd7906acb6bbfffad49086863ffb0ba014da Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:37:45 +0800 Subject: [PATCH 004/593] Fix performance bug when LDC is a multiple of 1024 --- sgemm_kernel_8x4_haswell_2.c | 424 +++++++++++++++++++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 sgemm_kernel_8x4_haswell_2.c diff --git a/sgemm_kernel_8x4_haswell_2.c b/sgemm_kernel_8x4_haswell_2.c new file mode 100644 index 000000000..5ab3e6d1f --- /dev/null +++ b/sgemm_kernel_8x4_haswell_2.c @@ -0,0 +1,424 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 
KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) \ + "vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define KERNEL_k2m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\ + "addq $32,%1;" +#define KERNEL_L_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define KERNEL_L_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; 
vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_L_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_L_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps 
%%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_R_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps 
%%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define KERNEL_R_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 
16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n4 INIT_m8n8 +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define INIT_m8n6 INIT_m8n12 +#define INIT_m16n6 INIT_m8n12 +#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 "movq %2,%5;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define unit_save_m16n2(c1,c2,c3,c4) \ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\ + "vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_L_m16n6 "movq %2,%5;"\ + 
unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_L_m8n6 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) +#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) 
\ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ + "vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\ + "vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" 
+#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd 
%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq 
(%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +#define COMPUTE_SIMPLE(mdim,ndim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\ + "7"#mdim"7"#ndim"1:\n\t"\ + KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\ + "7"#mdim"7"#ndim"2:\n\t"\ + SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;" +#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1) +#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2) +#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8) +#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12) +#define COMPUTE_m8n4 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\ + "cmpq $8,%4; jb 78740f;"\ + "78749:\n\t"\ + KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\ + "subq $8,%4; cmpq $8,%4; jnb 78749b;"\ + "78740:\n\t"\ + "testq %4,%4; jz 78742f;"\ + "78741:\n\t"\ + KERNEL_k1m8n4 "decq %4; jnz 78741b;"\ + "78742:\n\t"\ + SAVE_m8n4 "addq $32,%2;" +#define COMPUTE_L_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\ + "7116761:\n\t"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\ + KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\ + "movq %2,%5; negq %4; leaq 
14(%%r13,%4,1),%4;"\ + "7116762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\ + "7116763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\ + "7116764:\n\t"\ + SAVE_L_m16n6 "addq $32,%2;" +#define COMPUTE_R_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\ + "7216761:\n\t"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\ + KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\ + "movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7216762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\ + "7216763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\ + "7216764:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;" +#define COMPUTE_H_m8n6 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\ + "718769:\n\t"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\ + "subq $8,%4; cmpq $8,%4; jnb 718769b;"\ + "718760:\n\t"\ + "testq %4,%4; jz 718762f;"\ + "718761:\n\t"\ + KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\ + "718762:\n\t"\ + SAVE_L_m8n6 "negq %%r12; leaq (%0,%%r12,8),%0; negq %%r12;" +#define COMPUTE_T_m8n6(side,sim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 72"#sim"8760f;"\ + "72"#sim"8769:\n\t"\ + KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 
KERNEL_##side##_k2m8n6\ + "subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\ + "72"#sim"8760:\n\t"\ + "testq %4,%4; jz 72"#sim"8762f;"\ + "72"#sim"8761:\n\t"\ + KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\ + "72"#sim"8762:\n\t"\ + SAVE_##side##_m8n6 "addq $32,%2;" +#define COMPUTE_NORMAL(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%%r11;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8n##ndim\ + "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ + COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ + COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ + "33104"#ndim":\n\t"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ + COMPUTE_SIMPLE(1,ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ +} +#define COMPUTE_n12 {\ + next_b = b_pointer + 12 * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $16,%%r11;jb 3310112f;"\ + COMPUTE_H_m8n6\ + "3310612:\n\t"\ + COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 3310712f;"\ + COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\ + COMPUTE_T_m8n6(R,5) "subq $8,%%r11; jmp 3310212f;"\ + "3310712:\n\t"\ + COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\ + "3310112:\n\t"\ + "cmpq $8,%%r11;jb 3310212f;"\ + COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\ + "3310212:\n\t"\ + "cmpq $4,%%r11;jb 3310312f;"\ + COMPUTE_SIMPLE(4,12) 
"subq $4,%%r11;"\ + "3310312:\n\t"\ + "cmpq $2,%%r11;jb 3310412f;"\ + COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\ + "3310412:\n\t"\ + "testq %%r11,%%r11;jz 3310512f;"\ + COMPUTE_SIMPLE(1,12)\ + "3310512:\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\ +} + +#include "common.h" +#include +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){ + if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); + float ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE_n12 + for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8) + for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4) + for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2) + if(n_count>0) COMPUTE_NORMAL(1) + return 0; +} + From f1746e7284e0ac00c45937087547cf2d43823968 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:38:48 +0800 Subject: [PATCH 005/593] Delete sgemm_kernel_8x4_haswell_2.c --- sgemm_kernel_8x4_haswell_2.c | 424 ----------------------------------- 1 file changed, 424 deletions(-) delete mode 100644 sgemm_kernel_8x4_haswell_2.c diff --git a/sgemm_kernel_8x4_haswell_2.c b/sgemm_kernel_8x4_haswell_2.c deleted file mode 100644 index 5ab3e6d1f..000000000 --- a/sgemm_kernel_8x4_haswell_2.c +++ /dev/null @@ -1,424 +0,0 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), 
%3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ -/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ - -/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ -#define KERNEL_k1m8n1 \ - "vmovups (%0),%%ymm1; addq $32,%0;"\ - "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ - "addq $4,%1;" -#define KERNEL_h_k1m8n2 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ - "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" -#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" -#define KERNEL_h_k1m8n4 \ - KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" -#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" -#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) \ - "vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ - "vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" -#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4) -#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" -#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8) -#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" -#define KERNEL_k2m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ - unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\ - "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ - unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\ - "addq $32,%1;" -#define KERNEL_L_k1m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ - "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd 8(%1) ,%%ymm3; 
vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "addq $16,%1;" -#define KERNEL_L_k2m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ - "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ - "vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define KERNEL_L_k1m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ - "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $16,%1;" -#define KERNEL_L_k2m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ - "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 
4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ - "vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define KERNEL_R_k1m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ - "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps 
%%ymm2,%%ymm3,%%ymm15;"\ - "addq $16,%1;" -#define KERNEL_R_k2m16n6 \ - "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ - "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ - "vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define KERNEL_R_k1m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "addq 
$16,%1;" -#define KERNEL_R_k2m8n6 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ - "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ - "vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ - "vbroadcastsd 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ - "vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ - "addq $32,%1;" -#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" -#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" -#define unit_init_m8n4(c1,c2,c3,c4) \ - "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" -#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) -#define INIT_m8n4 INIT_m8n8 -#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define INIT_m8n6 INIT_m8n12 -#define INIT_m16n6 INIT_m8n12 -#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" -#define unit_save_m8n2(c1,c2) \ - "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ - "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) -#define SAVE_m8n4 "movq %2,%5;"\ - "vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\ - unit_save_m8n2(%%ymm4,%%ymm5) 
unit_save_m8n2(%%ymm6,%%ymm7) -#define SAVE_m8n8 "movq %2,%5;"\ - unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) -#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) -#define unit_save_m16n2(c1,c2,c3,c4) \ - "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\ - "vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_L_m16n6 "movq %2,%5;"\ - unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ - unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) -#define SAVE_L_m8n6 "movq %2,%5;"\ - "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ - "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ - unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) -#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ - "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ - "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ - unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) - -/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ -#define KERNEL_k1m4n1 \ - "vmovups (%0),%%xmm1; addq $16,%0;"\ - "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "addq $4,%1;" -#define KERNEL_h_k1m4n2 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ - "vmovddup (%1),%%xmm3; vfmadd231ps 
%%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" -#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" -#define KERNEL_h_k1m4n4 \ - KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" -#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" -#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ - "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ - "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" -#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4) -#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" -#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8) -#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" -#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" -#define unit_init_m4n4(c1,c2,c3,c4) \ - "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" -#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) -#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) -#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" -#define unit_save_m4n2(c1,c2) \ - "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ - "vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\ - "vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\ - "leaq (%5,%3,2),%5;" -#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) -#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) -#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) -#define SAVE_m4n12 SAVE_m4n8 
unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) - -/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ -#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define KERNEL_k1m2n1 \ - "vmovsd (%0),%%xmm1; addq $8,%0;"\ - "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "addq $4,%1;" -#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" -#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define KERNEL_k1m2n2 \ - "vmovsd (%0),%%xmm1; addq $8,%0;"\ - "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ - "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ - "addq $8,%1;" -#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" -#define INIT_m2n4 INIT_m2n2 -#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" -#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" -#define KERNEL_k1m2n4 \ - "vmovups (%1),%%xmm3; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "addq $8,%0;" -#define KERNEL_k1m2n8 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ - "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ - "addq $8,%0;" -#define KERNEL_k1m2n12 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ - "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ - "addq $8,%0;" -#define 
unit_save_m2n4(c1,c2) \ - "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ - "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ - "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ - "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ - "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) -#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) -#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) - -/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ -#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" -#define KERNEL_k1m1n1 \ - "vmovss (%1),%%xmm3; addq $4,%1;"\ - "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" -#define INIT_m1n2 INIT_m1n1 -#define KERNEL_k1m1n2 \ - "vmovsd (%1),%%xmm3; addq $8,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define SAVE_m1n2 \ - "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ - "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" -#define INIT_m1n4 INIT_m1n2 -#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" -#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" -#define KERNEL_k1m1n4 \ - "vmovups (%1),%%xmm3; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ - "addq $4,%0;" -#define KERNEL_k1m1n8 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ - "addq $4,%0;" -#define KERNEL_k1m1n12 \ - "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ - "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; 
vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ - "addq $4,%0;" -#define unit_save_m1n4(c1) \ - "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ - "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ - "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ - "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ - "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" -#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) -#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) -#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) - -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ -/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ - -#define COMPUTE_SIMPLE(mdim,ndim) \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\ - "7"#mdim"7"#ndim"1:\n\t"\ - KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\ - "7"#mdim"7"#ndim"2:\n\t"\ - SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;" -#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1) -#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2) -#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8) -#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12) -#define COMPUTE_m8n4 \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\ - "cmpq $8,%4; jb 78740f;"\ - "78749:\n\t"\ - KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\ - "subq $8,%4; cmpq $8,%4; jnb 78749b;"\ - "78740:\n\t"\ - "testq %4,%4; jz 78742f;"\ - "78741:\n\t"\ - KERNEL_k1m8n4 "decq %4; jnz 78741b;"\ - "78742:\n\t"\ - SAVE_m8n4 "addq $32,%2;" -#define COMPUTE_L_m16n6 \ - "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ - "movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\ - 
"7116761:\n\t"\ - KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ - KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ - KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\ - KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\ - "movq %2,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ - "7116762:\n\t"\ - "xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\ - "7116763:\n\t"\ - "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ - KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\ - "7116764:\n\t"\ - SAVE_L_m16n6 "addq $32,%2;" -#define COMPUTE_R_m16n6 \ - "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ - "movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\ - "7216761:\n\t"\ - KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ - KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ - KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\ - KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\ - "movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ - "7216762:\n\t"\ - "xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\ - "7216763:\n\t"\ - "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ - KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\ - "7216764:\n\t"\ - "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;" -#define COMPUTE_H_m8n6 \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ - "cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\ - "718769:\n\t"\ - KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ - KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\ - "subq $8,%4; cmpq $8,%4; jnb 718769b;"\ - "718760:\n\t"\ - "testq %4,%4; jz 
718762f;"\ - "718761:\n\t"\ - KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\ - "718762:\n\t"\ - SAVE_L_m8n6 "negq %%r12; leaq (%0,%%r12,8),%0; negq %%r12;" -#define COMPUTE_T_m8n6(side,sim) \ - "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ - "cmpq $8,%4; jb 72"#sim"8760f;"\ - "72"#sim"8769:\n\t"\ - KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6\ - "subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\ - "72"#sim"8760:\n\t"\ - "testq %4,%4; jz 72"#sim"8762f;"\ - "72"#sim"8761:\n\t"\ - KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\ - "72"#sim"8762:\n\t"\ - SAVE_##side##_m8n6 "addq $32,%2;" -#define COMPUTE_NORMAL(ndim) {\ - next_b = b_pointer + ndim * K;\ - __asm__ __volatile__(\ - "vbroadcastss %9,%%ymm0;"\ - "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ - "cmpq $8,%%r11;jb 33101"#ndim"f;"\ - "33109"#ndim":\n\t"\ - COMPUTE_m8n##ndim\ - "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ - "33101"#ndim":\n\t"\ - "cmpq $4,%%r11;jb 33103"#ndim"f;"\ - COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ - "33103"#ndim":\n\t"\ - "cmpq $2,%%r11;jb 33104"#ndim"f;"\ - COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ - "33104"#ndim":\n\t"\ - "testq %%r11,%%r11;jz 33105"#ndim"f;"\ - COMPUTE_SIMPLE(1,ndim)\ - "33105"#ndim":\n\t"\ - "movq %%r14,%1; vzeroupper;"\ - :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ - :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ - a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ -} -#define COMPUTE_n12 {\ - next_b = b_pointer + 12 * K;\ - __asm__ __volatile__(\ - "vbroadcastss %9,%%ymm0;"\ - "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ - "cmpq $16,%%r11;jb 3310112f;"\ - COMPUTE_H_m8n6\ - "3310612:\n\t"\ - COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 
3310712f;"\ - COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\ - COMPUTE_T_m8n6(R,5) "subq $8,%%r11; jmp 3310212f;"\ - "3310712:\n\t"\ - COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\ - "3310112:\n\t"\ - "cmpq $8,%%r11;jb 3310212f;"\ - COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\ - "3310212:\n\t"\ - "cmpq $4,%%r11;jb 3310312f;"\ - COMPUTE_SIMPLE(4,12) "subq $4,%%r11;"\ - "3310312:\n\t"\ - "cmpq $2,%%r11;jb 3310412f;"\ - COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\ - "3310412:\n\t"\ - "testq %%r11,%%r11;jz 3310512f;"\ - COMPUTE_SIMPLE(1,12)\ - "3310512:\n\t"\ - "movq %%r14,%1; vzeroupper;"\ - :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ - :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ - a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\ -} - -#include "common.h" -#include <stdint.h> -int __attribute__ ((noinline)) -CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){ - if(m==0||n==0||k==0||alpha==(float)0.0) return 0; - int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); - float ALPHA = alpha; - int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; - BLASLONG n_count = n; - float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; - for(;n_count>11;n_count-=12) COMPUTE_n12 - for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8) - for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4) - for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2) - if(n_count>0) COMPUTE_NORMAL(1) - return 0; -} - From 97a32cb0a52b159d547b0c41d42b18854c365ec9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:39:20 +0800 Subject: [PATCH 006/593] Update KERNEL.HASWELL --- kernel/x86_64/KERNEL.HASWELL | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index d24b7f3b3..f6ca5c2d5 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -32,7 +32,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_8x4_haswell.c -SGEMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c SGEMM_BETA = sgemm_beta_skylakex.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c From a2ff577a3005cd5f028705e567b08f6cbd65534c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:39:43 +0800 Subject: [PATCH 007/593] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 7cec2e5ed..1cd02db74 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -31,7 +31,7 @@ CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c STRMMKERNEL = sgemm_kernel_8x4_haswell.c -SGEMMKERNEL = sgemm_kernel_8x4_haswell.c +SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c From 903854c168cc438c7d154fefb25a639752674242 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 22 Feb 2020 23:40:02 +0800 Subject: [PATCH 008/593] Add files via upload --- kernel/x86_64/sgemm_kernel_8x4_haswell_2.c | 424 +++++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 kernel/x86_64/sgemm_kernel_8x4_haswell_2.c diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c new file mode 100644 index 000000000..5ab3e6d1f --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -0,0 +1,424 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for 
c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) \ + "vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define KERNEL_k2m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\ + "addq $32,%1;" +#define KERNEL_L_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps 
%%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define KERNEL_L_k2m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_L_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define KERNEL_L_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; 
vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $16,%1;" +#define 
KERNEL_R_k2m16n6 \ + "vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\ + "vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\ + "vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define KERNEL_R_k1m8n6 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "addq $16,%1;" +#define KERNEL_R_k2m8n6 \ + "vmovsldup 
(%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\ + "vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\ + "vbroadcastsd 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\ + "vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\ + "addq $32,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n4 INIT_m8n8 +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define INIT_m8n6 INIT_m8n12 +#define INIT_m16n6 INIT_m8n12 +#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 "movq 
%2,%5;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define unit_save_m16n2(c1,c2,c3,c4) \ + "vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\ + "vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_L_m16n6 "movq %2,%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_L_m8n6 "movq %2,%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) +#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\ + "vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\ + "vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\ + unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" 
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\ + "vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\ + "vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) + +/* m 
= 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ 
+ "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define 
unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ + +#define COMPUTE_SIMPLE(mdim,ndim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\ + "7"#mdim"7"#ndim"1:\n\t"\ + KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\ + "7"#mdim"7"#ndim"2:\n\t"\ + SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;" +#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1) +#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2) +#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8) +#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12) +#define COMPUTE_m8n4 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\ + "cmpq $8,%4; jb 78740f;"\ + "78749:\n\t"\ + KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\ + "subq $8,%4; cmpq $8,%4; jnb 78749b;"\ + "78740:\n\t"\ + "testq %4,%4; jz 78742f;"\ + "78741:\n\t"\ + KERNEL_k1m8n4 "decq %4; jnz 78741b;"\ + "78742:\n\t"\ + SAVE_m8n4 "addq $32,%2;" +#define COMPUTE_L_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\ + "7116761:\n\t"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + 
KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\ + KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\ + "movq %2,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7116762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\ + "7116763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\ + "7116764:\n\t"\ + SAVE_L_m16n6 "addq $32,%2;" +#define COMPUTE_R_m16n6 \ + "movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\ + "movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\ + "7216761:\n\t"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\ + KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\ + KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\ + KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\ + "movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\ + "7216762:\n\t"\ + "xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\ + "7216763:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\ + KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\ + "7216764:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;" +#define COMPUTE_H_m8n6 \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\ + "718769:\n\t"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\ + KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\ + "subq $8,%4; cmpq $8,%4; jnb 718769b;"\ + "718760:\n\t"\ + "testq %4,%4; jz 718762f;"\ + "718761:\n\t"\ + KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\ + "718762:\n\t"\ + SAVE_L_m8n6 "negq %%r12; 
leaq (%0,%%r12,8),%0; negq %%r12;" +#define COMPUTE_T_m8n6(side,sim) \ + "movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\ + "cmpq $8,%4; jb 72"#sim"8760f;"\ + "72"#sim"8769:\n\t"\ + KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6\ + "subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\ + "72"#sim"8760:\n\t"\ + "testq %4,%4; jz 72"#sim"8762f;"\ + "72"#sim"8761:\n\t"\ + KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\ + "72"#sim"8762:\n\t"\ + SAVE_##side##_m8n6 "addq $32,%2;" +#define COMPUTE_NORMAL(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%%r11;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8n##ndim\ + "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ + COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ + COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\ + "33104"#ndim":\n\t"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ + COMPUTE_SIMPLE(1,ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\ +} +#define COMPUTE_n12 {\ + next_b = b_pointer + 12 * K;\ + __asm__ __volatile__(\ + "vbroadcastss %9,%%ymm0;"\ + "movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $16,%%r11;jb 3310112f;"\ + COMPUTE_H_m8n6\ + "3310612:\n\t"\ + COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 3310712f;"\ + COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\ + COMPUTE_T_m8n6(R,5) "subq $8,%%r11; 
jmp 3310212f;"\ + "3310712:\n\t"\ + COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\ + "3310112:\n\t"\ + "cmpq $8,%%r11;jb 3310212f;"\ + COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\ + "3310212:\n\t"\ + "cmpq $4,%%r11;jb 3310312f;"\ + COMPUTE_SIMPLE(4,12) "subq $4,%%r11;"\ + "3310312:\n\t"\ + "cmpq $2,%%r11;jb 3310412f;"\ + COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\ + "3310412:\n\t"\ + "testq %%r11,%%r11;jz 3310512f;"\ + COMPUTE_SIMPLE(1,12)\ + "3310512:\n\t"\ + "movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\ + :"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\ +} + +#include "common.h" +#include <stdint.h> +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){ + if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float); + float ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE_n12 + for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8) + for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4) + for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2) + if(n_count>0) COMPUTE_NORMAL(1) + return 0; +} + From 2515e1152f278f2f543156162de62d69213c9088 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 26 Feb 2020 18:36:54 +0800 Subject: [PATCH 009/593] Update cgemm_kernel_8x2_haswell.c --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c
b/kernel/x86_64/cgemm_kernel_8x2_haswell.c index eab8c9ea5..5d3bd599a 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -50,7 +50,7 @@ "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) #define KERNEL_2_k1m8n4 \ - "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ + "vpermilps $177,-64(%0),%%ymm0; vpermilps $177,-32(%0),%%ymm1;"\ acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) #define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) #define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) From 1b980001dda2af41c470856f65fee34c09d7ad11 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 26 Feb 2020 18:38:12 +0800 Subject: [PATCH 010/593] Update zgemm_kernel_4x2_haswell.c --- kernel/x86_64/zgemm_kernel_4x2_haswell.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c index 3279b8b8c..e3bd7897a 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.c +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -50,7 +50,7 @@ "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1) #define KERNEL_2_k1m4n4 \ - "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\ + "vpermilpd $5,-64(%0),%%ymm0; vpermilpd $5,-32(%0),%%ymm1;"\ acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1) #define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2) #define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2) From 8164fd13281e824260346d558e4f5408296d753b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Feb 2020 22:19:57 
+0100 Subject: [PATCH 011/593] Always assume server-class cpu count for TSV110 and EMAG8180 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 055749dc1..2b7b4a050 100644 --- a/param.h +++ b/param.h @@ -2620,7 +2620,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 From 2352331e60a8e0e83baba65da9349734f5edc49b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 27 Feb 2020 22:25:19 +0800 Subject: [PATCH 012/593] Update zgemm_kernel_4x2_haswell.c --- kernel/x86_64/zgemm_kernel_4x2_haswell.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c index e3bd7897a..917a3fd48 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.c +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -93,9 +93,9 @@ "movq $10,%5; movq $84,%%r15;"\ #ndim"4441:\n\t"\ "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ - "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ - "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\ "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\ #ndim"4442:\n\t"\ From 
dd22eb7621a5fb7d4b65e39ae59679aadb0b3767 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Thu, 27 Feb 2020 22:26:15 +0800 Subject: [PATCH 013/593] Update cgemm_kernel_8x2_haswell.c --- kernel/x86_64/cgemm_kernel_8x2_haswell.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c index 5d3bd599a..08882346d 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -93,7 +93,6 @@ "movq $10,%5; movq $84,%%r15;"\ #ndim"8881:\n\t"\ "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ - "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ From a66f4d80c8b39c7d7949f0702c238ab86c690a15 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Feb 2020 23:09:40 +0100 Subject: [PATCH 014/593] Apply MinGW AVX512 compilation fix to fortran options as well original issue was #1708, I see now that the same problem affects gfortran compilation. The underlying issue is said to be fixed (but not yet released) on all branches of gcc as of a few days ago but it will certainly take time to reach mingw/msys. 
--- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 99364752f..f2de51ef4 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -15,10 +15,12 @@ CCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables endif ifeq ($(OSNAME), WINNT) ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables endif endif endif From e1062400c4ea7b4cad65b9df8a30fac4224f9737 Mon Sep 17 00:00:00 2001 From: j00520245 Date: Fri, 28 Feb 2020 16:36:53 +0800 Subject: [PATCH 015/593] New add syr benchmark --- benchmark/Makefile | 49 +++++++++++- benchmark/syr.c | 187 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 benchmark/syr.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 51e9c64aa..1d4a220e4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - -# Atlas Ubuntu + +# Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm @@ -56,6 +56,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + ssyr.goto dsyr.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -83,6 +84,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ 
strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + ssyr.acml dsyr.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -109,6 +111,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + ssyr.atlas dsyr.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -136,6 +139,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + ssyr.mkl dsyr.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -162,6 +166,7 @@ else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + ssyr.goto dsyr.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -188,6 +193,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + ssyr.acml dsyr.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -214,6 +220,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + ssyr.atlas dsyr.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas
zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -243,6 +250,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + ssyr.mkl dsyr.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -280,6 +288,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + ssyr.veclib dsyr.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ @@ -768,6 +777,36 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) 
$^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -2078,6 +2117,12 @@ ctrsm.$(SUFFIX) : trsm.c ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/syr.c b/benchmark/syr.c new file mode 100644 index 000000000..91b5b5904 --- /dev/null +++ b/benchmark/syr.c @@ -0,0 +1,187 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SYR + +#ifdef DOUBLE +#define SYR BLASFUNC(dsyr) +#else +#define SYR BLASFUNC(ssyr) +#endif + + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | 
IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x,*a; + FLOAT alpha[] = {1.0, 1.0}; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + blasint m, i, j; + blasint inc_x= 1; + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + fprintf(stderr, " %6d : ", (int)m); + + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + gettimeofday( &start, (struct timezone *)0); + + SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 1. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From c623a965f95b83bd4340a5ebe1a370b1a900545d Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Fri, 21 Feb 2020 22:46:58 +0000 Subject: [PATCH 016/593] Add Neoverse-N1 core The implementation is a hybrid of the ARMV8 one with some of the improved TX2 routines along with specifying -march=v8.2-a --- Makefile.arm64 | 17 +++ Makefile.system | 2 + TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 27 +++++ cpuid_arm64.c | 23 +++- driver/others/dynamic_arm64.c | 8 +- getarch.c | 18 ++++ kernel/arm64/KERNEL.NEOVERSEN1 | 189 +++++++++++++++++++++++++++++++++ param.h | 29 +++++ 10 files changed, 312 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/KERNEL.NEOVERSEN1 diff --git a/Makefile.arm64 b/Makefile.arm64 index c17ea7938..a7cd82e3a 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -24,6 +24,23 @@ CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif +# Use a72 tunings because Neoverse-N1 is only available +# in GCC>=9 +ifeq ($(CORE), NEOVERSEN1) +ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq ($(GCCVERSIONGTEQ9), 1) +CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif + ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx FCOMMON_OPT += -march=armv8-a -mtune=thunderx diff --git a/Makefile.system b/Makefile.system index a928b6e25..1e30d05a8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -328,6 +328,7 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC)
-dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) @@ -554,6 +555,7 @@ DYNAMIC_CORE += CORTEXA53 DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 +DYNAMIC_CORE += NEOVERSEN1 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 diff --git a/TargetList.txt b/TargetList.txt index 6a57bf1af..5b31df045 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -88,6 +88,7 @@ CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 +NEOVERSEN1 FALKOR THUNDERX THUNDERX2T99 diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d31961c14..9d51f777c 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) endif () if (POWER) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b74a0699b..44e1473d1 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -229,6 +229,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t2\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t16\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define 
DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "FALKOR") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5868af75c..4103216e6 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -34,6 +34,7 @@ #define CPU_CORTEXA57 3 #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 +#define CPU_NEOVERSEN1 11 // Qualcomm #define CPU_FALKOR 6 // Cavium @@ -55,7 +56,8 @@ static char *cpuname[] = { "THUNDERX", "THUNDERX2T99", "TSV110", - "EMAG8180" + "EMAG8180", + "NEOVERSEN1" }; static char *cpuname_lower[] = { @@ -69,7 +71,8 @@ static char *cpuname_lower[] = { "thunderx", "thunderx2t99", "tsv110", - "emag8180" + "emag8180", + "neoversen1" }; int get_feature(char *search) @@ -144,6 +147,8 @@ int detect(void) return CPU_CORTEXA72; else if (strstr(cpu_part, "0xd09")) return CPU_CORTEXA73; + else if (strstr(cpu_part, "0xd0c")) + return CPU_NEOVERSEN1; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -285,6 +290,20 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); break; + case CPU_NEOVERSEN1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; case CPU_FALKOR: 
printf("#define FALKOR\n"); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 9f42ce4c6..11ef2725c 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -52,10 +52,11 @@ extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; +extern gotoblas_t gotoblas_NEOVERSEN1; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 10 +#define NUM_CORETYPES 11 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -80,6 +81,7 @@ static char *corename[] = { "thunderx2t99", "tsv110", "emag8180", + "neoversen1", "unknown" }; @@ -94,6 +96,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7]; if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; + if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; return corename[NUM_CORETYPES]; } @@ -123,6 +126,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 7: return (&gotoblas_THUNDERX2T99); case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); + case 10: return (&gotoblas_NEOVERSEN1); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -168,6 +172,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA72; case 0xd09: // Cortex A73 return &gotoblas_CORTEXA73; + case 0xd0c: // Neoverse N1 + return &gotoblas_NEOVERSEN1; } break; case 0x42: // Broadcom diff --git a/getarch.c b/getarch.c index d29f6369c..30ca290e3 100644 --- a/getarch.c +++ b/getarch.c @@ -1028,6 +1028,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_NEOVERSEN1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ + "-march=armv8.2-a -mtune=cortex-a72" +#define LIBNAME "neoversen1" +#define CORENAME "NEOVERSEN1" +#else +#endif + + #ifdef FORCE_FALKOR #define FORCE #define ARCHITECTURE "ARM64" diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT =
../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq 
($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = 
../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/param.h b/param.h index 2b7b4a050..726639a4a 100644 --- a/param.h +++ b/param.h @@ -2705,6 +2705,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(NEOVERSEN1) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else // Other/undetected ARMv8 cores #define SGEMM_DEFAULT_UNROLL_M 16 From 19f3a4091c41ec50b1f956e916680241cf202c91 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Sat, 22 Feb 2020 05:07:55 +0000 Subject: [PATCH 017/593] Make rpcc() on arm64 get 
closer to what x86 returns The Arm implementation of rpcc() uses the architected timer which is defined by the SBSA to be between 10-400MHz. These numbers are much smaller than the cycle counter frequency used by x86. Make the numbers closer by shifting the cycle counter up by the number of leading zeros in the cntfrq_el0 register which gets us closer to a normal cpu clock cycle range. --- common_arm64.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common_arm64.h b/common_arm64.h index 5951e1ee5..66a1d1dc4 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -81,10 +81,12 @@ static void __inline blas_lock(volatile BLASULONG *address){ #if !defined(OS_DARWIN) && !defined (OS_ANDROID) static __inline BLASULONG rpcc(void){ BLASULONG ret = 0; + blasint shift; __asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret)); + __asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift)); - return ret; + return ret << shift; } #define RPCC_DEFINED From 0af9991cc9d0d3696847eaaaa8fa4288deea9146 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Fri, 21 Feb 2020 23:43:43 +0000 Subject: [PATCH 018/593] Use wait-for-event to not spin in the blas_lock --- common_arm64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 5951e1ee5..52f7451d3 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -53,16 +53,16 @@ static void __inline blas_lock(volatile BLASULONG *address){ BLASULONG ret; do { - while (*address) {YIELDING;}; - __asm__ __volatile__( "mov x4, #1 \n\t" + "sevl \n\t" "1: \n\t" + "wfe \n\t" + "2: \n\t" "ldaxr x2, [%1] \n\t" "cbnz x2, 1b \n\t" - "2: \n\t" "stxr w3, x4, [%1] \n\t" - "cbnz w3, 1b \n\t" + "cbnz w3, 2b \n\t" "mov %0, #0 \n\t" : "=r"(ret), "=r"(address) : "1"(address) From 97ce6bbce2580d1a3d8c9844afcac431d749abdc Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Sat, 29 Feb 2020 17:27:18 +0000 Subject: [PATCH 019/593] Fix barriers in level3_thread --- driver/level3/level3_thread.c |
17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index bf558447e..ca0085e71 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using workspace */ START_RPCC(); for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; STOP_RPCC(waiting1); + MB; #if defined(FUSED_GEMM) && !defined(TIMING) @@ -395,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif + WMB; /* Set flag so other threads can access local region of B */ for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - WMB; } /* Get regions of B from other threads and apply kernel */ @@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Wait until other region of B is initialized */ START_RPCC(); - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; STOP_RPCC(waiting2); + MB; /* Apply kernel with local region of A and part of other region of B */ START_RPCC(); @@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } } while (current != mypos); @@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, 
/* Clear synchronization flag if this thread is done with region of B */ if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } @@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); for (i = 0; i < args -> nthreads; i++) { for (js = 0; js < DIVIDE_RATE; js++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; } } STOP_RPCC(waiting3); + MB; #ifdef TIMING BLASLONG waiting = waiting1 + waiting2 + waiting3; @@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); } } } - + WMB; /* Execute parallel computation */ exec_blas(nthreads, queue); } From 4f371b0fbf8219270e37cb7827850ac13c8686d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Mar 2020 23:45:58 +0100 Subject: [PATCH 020/593] Use POWER8 kernels on big-endian POWER9 for now --- kernel/power/KERNEL.POWER9 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 4bfa017e1..aabb5d976 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -1,3 +1,7 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -206,3 +210,5 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif From f14013da7fda056a2ee42ccf88f14b46b91686ef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:01:22 +0100 Subject: [PATCH 021/593] Update with 0.3.9 changes --- Changelog.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 
d66b2719a..5f924629b 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,48 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.9 + 1-Mar-2020 + + common: + * Fixed a miscompilation of the GETRF functions with CMAKE + * Imported bugfix 390 from LAPACK (missing NaN propagation in xCOMBSSQ) + * The size of the memory buffer used for splitting GEMM tasks across + multiple threads can now be configured in the build system. + +POWER: + * Fixed several compilation problems related to endianness + and ELF version on POWER8 and POWER9 + * Fixed use of the absolute value IAMIN/IAMAX instead of IMIN/IMAX + * Fixed a race condition in the level3 blas code + +MIPS64: + * Fixed use of the absoltute value IAMIN/IAMAX instead of IMIN/IMAX + +ARMV7: + * Fixed a race condition in the level3 blas code + * Fixed compilation on Android +ARMV8: + * Added support for Ampere EMAG8180 + * Added support for Neoverse N1 + * Improved performance of the blas_lock function + * Fixed a race condition in the level3 blas code + * Fixed a performance regression on TSV110-based servers + +x86_64: + * Fixed a long-standing error with undeclared register overwrites + in the DSCAL microkernel for HASWELL,SKYLAKEX and ZEN + * Fixed a long-standing bug in the SSE implementation of IAMAX + * Fixed a CMAKE build failure with DYNAMIC_ARCH + * Fixed cpu autodetection of Goldmont+, Cannon Lake and Ice Lake + * Fixed a compilation failure on OSX with compiler name containing dash + * Fixed compilation with MinGW on SkylakeX + * Improved speed of the AVX512 GEMM3M kernel on SkylakeX + * Added an AVX512 STRMM kernel for SkylakeX + * Improved GEMM performance on Haswell and Zen + +zarch: + * fixed compilation of the DYNAMIC_ARCH code + ==================================================================== Version 0.3.8 9-Feb-2020 From d221c50f2741b31b83e3cbcc005977cb0fe47bc3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:02:36 +0100 
Subject: [PATCH 022/593] Add Ampere EMAG8180 --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 5b31df045..f4a40ed02 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -89,6 +89,7 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +EMAG8180 FALKOR THUNDERX THUNDERX2T99 From 960dec234fad4834f1fcd0c2453f878c35576d41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:09:49 +0100 Subject: [PATCH 023/593] Version 0.3.9 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 951271717..eb1543591 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 9) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 33f76a6c378681ac2af76b20e55ff682191a1937 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Mar 2020 00:10:20 +0100 Subject: [PATCH 024/593] Version 0.3.9 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 724a60ec4..a4465e448 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.9.dev +VERSION = 0.3.9 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 43c2e845ab3931716b2ec7ad3ac3da2a8a447264 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Sat, 22 Feb 2020 05:31:07 +0000 Subject: [PATCH 025/593] Switch blas_server to use acq/rel semantics Heavy-weight locking isn't required to pass the work queue pointer between threads and simple atomic acquire/release semantics can be used instead. 
This is especially important as pthread_mutex_lock() isn't fair. We've observed substantial variation in runtime because of the unfairness of these locks which completely goes away with this implementation. The locks themselves are left to provide a portable way for idling threads to sleep/wakeup after many unsuccessful iterations waiting. --- driver/others/blas_server.c | 99 +++++++++++++++---------------------- 1 file changed, 41 insertions(+), 58 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6f4e20610..ce028a7fc 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -140,6 +140,16 @@ typedef struct { } thread_status_t; +#if (__STDC_VERSION__ >= 201112L) +#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) +#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) +#else +#define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) +#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) +#endif + + + static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE))); #ifndef THREAD_TIMEOUT @@ -312,20 +322,19 @@ blas_queue_t *tscq; last_tick = (unsigned int)rpcc(); - pthread_mutex_lock (&thread_status[cpu].lock); - tscq=thread_status[cpu].queue; - pthread_mutex_unlock (&thread_status[cpu].lock); + tscq = atomic_load_queue(&thread_status[cpu].queue); while(!tscq) { YIELDING; if ((unsigned int)rpcc() - last_tick > thread_timeout) { - pthread_mutex_lock (&thread_status[cpu].lock); - if (!thread_status[cpu].queue) { + if (!atomic_load_queue(&thread_status[cpu].queue)) { + pthread_mutex_lock (&thread_status[cpu].lock); thread_status[cpu].status = THREAD_STATUS_SLEEP; - while (thread_status[cpu].status == THREAD_STATUS_SLEEP) { + while (thread_status[cpu].status == THREAD_STATUS_SLEEP && + !atomic_load_queue(&thread_status[cpu].queue)) { #ifdef MONITOR main_status[cpu] = MAIN_SLEEPING; @@ -333,19 +342,18 @@ 
blas_queue_t *tscq; pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock); } + pthread_mutex_unlock(&thread_status[cpu].lock); } - pthread_mutex_unlock(&thread_status[cpu].lock); - last_tick = (unsigned int)rpcc(); } - pthread_mutex_lock (&thread_status[cpu].lock); - tscq=thread_status[cpu].queue; - pthread_mutex_unlock (&thread_status[cpu].lock); + + tscq = atomic_load_queue(&thread_status[cpu].queue); } - queue = thread_status[cpu].queue; + queue = atomic_load_queue(&thread_status[cpu].queue); + MB; if ((long)queue == -1) break; @@ -360,9 +368,7 @@ blas_queue_t *tscq; if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - pthread_mutex_lock (&thread_status[cpu].lock); - thread_status[cpu].queue = (blas_queue_t *)1; - pthread_mutex_unlock (&thread_status[cpu].lock); + atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); sa = queue -> sa; sb = queue -> sb; @@ -442,13 +448,9 @@ blas_queue_t *tscq; // arm: make sure all results are written out _before_ // thread is marked as done and other threads use them - WMB; + MB; + atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0); - pthread_mutex_lock (&thread_status[cpu].lock); - thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ - pthread_mutex_unlock (&thread_status[cpu].lock); - - WMB; } @@ -566,12 +568,9 @@ int blas_thread_init(void){ for(i = 0; i < blas_num_threads - 1; i++){ - thread_status[i].queue = (blas_queue_t *)NULL; + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_mutex_init(&thread_status[i].lock, NULL); - pthread_cond_init (&thread_status[i].wakeup, NULL); - #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, &blas_thread_server, (void *)i); @@ -655,7 +654,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ if (queue -> mode & BLAS_NODE) { do { - 
while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++; + + while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++; if (i < blas_num_threads - 1) break; @@ -669,36 +669,26 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ } while (1); } else { - pthread_mutex_lock (&thread_status[i].lock); - tsiq = thread_status[i].queue; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); while(tsiq) { i ++; if (i >= blas_num_threads - 1) i = 0; - pthread_mutex_lock (&thread_status[i].lock); - tsiq = thread_status[i].queue; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); } } #else - pthread_mutex_lock (&thread_status[i].lock); - tsiq=thread_status[i].queue ; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); while(tsiq) { i ++; if (i >= blas_num_threads - 1) i = 0; - pthread_mutex_lock (&thread_status[i].lock); - tsiq=thread_status[i].queue ; - pthread_mutex_unlock (&thread_status[i].lock); + tsiq = atomic_load_queue(&thread_status[i].queue); } #endif queue -> assigned = i; - WMB; - pthread_mutex_lock (&thread_status[i].lock); - thread_status[i].queue = queue; - pthread_mutex_unlock (&thread_status[i].lock); - WMB; + MB; + + atomic_store_queue(&thread_status[i].queue, queue); queue = queue -> next; pos ++; @@ -718,9 +708,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ pos = current -> assigned; - pthread_mutex_lock (&thread_status[pos].lock); - tspq=thread_status[pos].queue; - pthread_mutex_unlock (&thread_status[pos].lock); + tspq = atomic_load_queue(&thread_status[pos].queue); if ((BLASULONG)tspq > 1) { pthread_mutex_lock (&thread_status[pos].lock); @@ -752,24 +740,20 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ while ((num > 0) && queue) { - 
pthread_mutex_lock(&thread_status[queue->assigned].lock); - tsqq=thread_status[queue -> assigned].queue; - pthread_mutex_unlock(&thread_status[queue->assigned].lock); + tsqq = atomic_load_queue(&thread_status[queue->assigned].queue); while(tsqq) { YIELDING; - pthread_mutex_lock(&thread_status[queue->assigned].lock); - tsqq=thread_status[queue -> assigned].queue; - pthread_mutex_unlock(&thread_status[queue->assigned].lock); - - + tsqq = atomic_load_queue(&thread_status[queue->assigned].queue); }; queue = queue -> next; num --; } + MB; + #ifdef SMP_DEBUG fprintf(STDERR, "Done.\n\n"); #endif @@ -880,7 +864,7 @@ void goto_set_num_threads(int num_threads) { for(i = blas_num_threads - 1; i < num_threads - 1; i++){ - thread_status[i].queue = (blas_queue_t *)NULL; + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; pthread_mutex_init(&thread_status[i].lock, NULL); @@ -971,12 +955,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ for (i = 0; i < blas_num_threads - 1; i++) { - pthread_mutex_lock (&thread_status[i].lock); - thread_status[i].queue = (blas_queue_t *)-1; + pthread_mutex_lock (&thread_status[i].lock); + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_cond_signal (&thread_status[i].wakeup); pthread_mutex_unlock(&thread_status[i].lock); From 917d243580701952a25601f854b0232a2dade8fb Mon Sep 17 00:00:00 2001 From: MacChen02 <58972037+MacChen02@users.noreply.github.com> Date: Mon, 2 Mar 2020 14:36:27 +0800 Subject: [PATCH 026/593] Update benchmark statistical time function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function gettimeofday does not count the time,when testing the axpy small data volume use case. Use the function clock_gettime to replace the gettimeofday function to count the time. 
--- benchmark/axpy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmark/axpy.c b/benchmark/axpy.c index 37c7aeb63..e40f93c70 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -128,7 +128,7 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; + struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +175,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + clock_gettime( CLOCK_REALTIME, &start); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + clock_gettime( CLOCK_REALTIME, &stop); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; timeg += time1; @@ -190,7 +190,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", + " %10.2f MFlops %10.9f sec\n", COMPSIZE * COMPSIZE * 2. 
* (double)m / timeg * 1.e-6, timeg); } From 0f65c05cd1f83187863546a0e5a11bbc633ab85c Mon Sep 17 00:00:00 2001 From: jianghesong Date: Mon, 2 Mar 2020 19:13:45 +0800 Subject: [PATCH 027/593] fix core dumped error --- benchmark/cholesky.c | 54 ++++++++++++++++++++++++-------------------- benchmark/geev.c | 2 +- benchmark/gemm3m.c | 6 ++--- benchmark/gemv.c | 4 ++-- benchmark/ger.c | 2 +- benchmark/gesv.c | 6 ++--- benchmark/getri.c | 2 +- benchmark/hemm.c | 6 ++--- benchmark/hemv.c | 2 +- benchmark/her2k.c | 6 ++--- benchmark/herk.c | 4 ++-- benchmark/linpack.c | 4 ++-- benchmark/potrf.c | 36 ++++++++++++++--------------- benchmark/symm.c | 6 ++--- benchmark/symv.c | 2 +- benchmark/syr2k.c | 6 ++--- benchmark/syrk.c | 4 ++-- benchmark/trmm.c | 4 ++-- benchmark/trsm.c | 4 ++-- 19 files changed, 83 insertions(+), 77 deletions(-) diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 8d121efb3..5908b6085 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -173,46 +173,46 @@ int main(int argc, char *argv[]){ #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = 0.; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + 
(long)j * (long)m] = 0.; } } #else if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } } } @@ -239,10 +239,13 @@ int main(int argc, char *argv[]){ for (j = 0; j < m; j++) { for(i = 0; i <= j; i++) { #ifndef COMPLEX - if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); + if (maxerr < fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m])) + maxerr = 
fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]); #else - if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); - if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]); #endif } } @@ -250,10 +253,13 @@ int main(int argc, char *argv[]){ for (j = 0; j < m; j++) { for(i = j; i < m; i++) { #ifndef COMPLEX - if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); + if (maxerr < fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m])) + maxerr = fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]); #else - if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); - if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]); + if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1])) + maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]); #endif } } diff --git a/benchmark/geev.c b/benchmark/geev.c index d3751defb..ef9271220 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -195,7 
+195,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ - a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index d39543585..f4048c436 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -181,9 +181,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/gemv.c b/benchmark/gemv.c index adf8f3d91..a9dee67d2 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -197,7 +197,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -234,7 +234,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/ger.c b/benchmark/ger.c index a752a3c3e..ca7e94e15 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -182,7 +182,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < n 
* COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/gesv.c b/benchmark/gesv.c index 26ff8bc1a..80f644e69 100644 --- a/benchmark/gesv.c +++ b/benchmark/gesv.c @@ -177,20 +177,20 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - b[i + j * m * COMPSIZE] = 0.0; + b[(long)i + (long)j * (long)m * COMPSIZE] = 0.0; } } for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { - b[i] += a[i + j * m * COMPSIZE]; + b[i] += a[(long)i + (long)j * (long)m * COMPSIZE]; } } diff --git a/benchmark/getri.c b/benchmark/getri.c index 083cdc9aa..e8b82a758 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ - a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 318c407ba..a0c549292 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -164,9 +164,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) 
rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 05028e3cf..b6ff512ce 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -167,7 +167,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 028e2718f..55421878a 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -163,9 +163,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/herk.c b/benchmark/herk.c index d2e25ff46..bd336e6b1 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -162,8 +162,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 7d5c87163..e4b20e99d 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -186,7 +186,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i 
+ j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -194,7 +194,7 @@ int main(int argc, char *argv[]){ for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { - b[i] += a[i + j * m * COMPSIZE]; + b[i] += a[(long)i + (long)j * (long)m * COMPSIZE]; } } diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 1d714549b..580e46072 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -170,46 +170,46 @@ int main(int argc, char *argv[]){ #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = 0.; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { - for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; - for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } #else if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * 
(long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } - a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; - a[(j + j * m) * 2 + 1] = 0.; + a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[(i + j * m) * 2 + 0] = 0.; - a[(i + j * m) * 2 + 1] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; + a[((long)i + (long)j * (long)m) * 2 + 1] = 0.; } } } diff --git a/benchmark/symm.c b/benchmark/symm.c index 35ebcee97..9c26d92fe 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -175,9 +175,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/symv.c b/benchmark/symv.c index 
df2a5d301..789c3560f 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -177,7 +177,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index 9840b5f3e..6b51e4f2b 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -175,9 +175,9 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 34817f2bb..06582b861 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -172,8 +172,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 54c7972db..6a5e59c7b 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -175,8 +175,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) 
rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 9eae3380c..6ce1d532c 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } From 7ca4ffdbddd03697e1c9773f48fa8e890a2d3b86 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Wed, 19 Feb 2020 18:24:01 +0100 Subject: [PATCH 028/593] Improve test coverage for utests. --- utest/CMakeLists.txt | 1 + utest/Makefile | 2 +- utest/test_amax.c | 13 +++++- utest/test_min.c | 100 +++++++++++++++++++++++++++++++++++++++++++ utest/utest_main2.c | 86 ++++++++++++++++++++++++++++++++++++- 5 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 utest/test_min.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 544646911..dc5175fc5 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -6,6 +6,7 @@ if (MSVC AND "${CMAKE_C_COMPILER_ID}" MATCHES Clang) else () set(OpenBLAS_utest_src utest_main.c + test_min.c test_amax.c test_ismin.c test_rotmg.c diff --git a/utest/Makefile b/utest/Makefile index 32bdcc6e1..0b9892411 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system -OBJS=utest_main.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o +OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o 
test_swap.o test_rot.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) diff --git a/utest/test_amax.c b/utest/test_amax.c index 411598410..831804027 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -40,6 +40,17 @@ CTEST(amax, samax){ te_max=BLASFUNC(samax)(&N, x, &inc); tr_max=3.3; - + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } + +CTEST(amax, damax){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={-1.1, 2.2, -3.3}; + + te_max=BLASFUNC(damax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} diff --git a/utest/test_min.c b/utest/test_min.c new file mode 100644 index 000000000..fd31b5982 --- /dev/null +++ b/utest/test_min.c @@ -0,0 +1,100 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +CTEST(min, smin_negative){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=-3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(min, dmin_positive){ + blasint N=3, inc=1; + double te_min=0.0, tr_min=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_min=BLASFUNC(dmin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} + +CTEST(min, smin_zero){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={1.1, 2.2, 0.0}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(max, smax_negative){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=-1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + +CTEST(max, dmax_positive){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_max=BLASFUNC(dmax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + +CTEST(max, smax_zero){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, 0.0}; + + 
te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} diff --git a/utest/utest_main2.c b/utest/utest_main2.c index aa95a5a3f..6b252863a 100644 --- a/utest/utest_main2.c +++ b/utest/utest_main2.c @@ -50,6 +50,17 @@ CTEST(amax, samax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } +CTEST(amax, damax){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={-1.1, 2.2, -3.3}; + + te_max=BLASFUNC(damax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + CTEST (drotmg,rotmg) { double te_d1, tr_d1; @@ -508,9 +519,82 @@ CTEST(swap,cswap_inc_0) } } +CTEST(min, smin_negative){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=-3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(min, dmin_positive){ + blasint N=3, inc=1; + double te_min=0.0, tr_min=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_min=BLASFUNC(dmin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); +} + +CTEST(min, smin_zero){ + blasint N=3, inc=1; + float te_min=0.0, tr_min=0.0; + float x[]={1.1, 2.2, 0.0}; + + te_min=BLASFUNC(smin)(&N, x, &inc); + tr_min=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); +} + +CTEST(max, smax_negative){ + blasint N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, -2.2, -3.3}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=-1.1; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + +CTEST(max, dmax_positive){ + blasint N=3, inc=1; + double te_max=0.0, tr_max=0.0; + double x[]={1.1, 0.0, 3.3}; + + te_max=BLASFUNC(dmax)(&N, x, &inc); + tr_max=3.3; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); +} + +CTEST(max, smax_zero){ + blasint N=3, inc=1; + float te_max=0.0, 
tr_max=0.0; + float x[]={-1.1, -2.2, 0.0}; + + te_max=BLASFUNC(smax)(&N, x, &inc); + tr_max=0.0; + + ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); +} + int main(int argc, const char ** argv){ - CTEST_ADD(amax, samax); + CTEST_ADD (amax, samax); + CTEST_ADD (amax, damax); + CTEST_ADD (min, smin_negative); + CTEST_ADD (min, dmin_positive); + CTEST_ADD (min, smin_zero); + CTEST_ADD (max, smax_negative); + CTEST_ADD (max, dmax_positive); + CTEST_ADD (max, smax_zero); CTEST_ADD (drotmg,rotmg); CTEST_ADD (drotmg,rotmg_issue1452); CTEST_ADD (drotmg,rotmg_D1eqD2_X1eqX2); From 21f6c4b5a972683f7228e5ad446bc940947c2d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D9=85=D9=87=D8=AF=D9=8A=20=D8=B4=D9=8A=D9=86=D9=88=D9=86?= =?UTF-8?q?=20=28Mehdi=20Chinoune=29?= Date: Mon, 2 Mar 2020 17:22:28 +0100 Subject: [PATCH 029/593] fixes #2480 --- cmake/cc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 22217575c..d5551147c 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -99,7 +99,7 @@ endif () if (${CORE} STREQUAL "SKYLAKEX") if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif () endif () endif () From 790d50fbba8b1c41be7d0feb3865e5870d61c81f Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Tue, 3 Mar 2020 17:13:49 +0800 Subject: [PATCH 030/593] [OpenBlas]: add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv --- benchmark/Makefile | 87 +++++++++++++++++++++++ benchmark/trmv.c | 170 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 benchmark/trmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d4a220e4..c037dd6d6 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -73,6 +73,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ 
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ @@ -100,6 +101,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -128,6 +130,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -155,6 +158,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -183,6 +187,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -209,6 +214,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml 
zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -237,6 +243,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -266,6 +273,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -304,6 +312,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ @@ -1108,6 +1117,72 @@ zgemv.mkl : zgemv.$(SUFFIX) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) 
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2177,6 +2252,18 @@ cgemv.$(SUFFIX) : gemv.c zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/trmv.c b/benchmark/trmv.c new file mode 100644 index 000000000..84d1903de --- /dev/null +++ b/benchmark/trmv.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TRMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TRMV BLASFUNC(dtrmv) +#else +#define TRMV BLASFUNC(strmv) +#endif + +#else + +#ifdef DOUBLE +#define TRMV BLASFUNC(ztrmv) +#else +#define TRMV BLASFUNC(ctrmv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = 
getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + long n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. 
* (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} \ No newline at end of file From f682d19ed4b247903e37254ac2c4d2f136c237b0 Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Tue, 3 Mar 2020 17:13:49 +0800 Subject: [PATCH 031/593] [OpenBlas]: add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv --- benchmark/Makefile | 87 +++++++++++++++++++++++ benchmark/trmv.c | 172 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 benchmark/trmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d4a220e4..c037dd6d6 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -73,6 +73,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ @@ -100,6 +101,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -128,6 +130,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -155,6 +158,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl 
dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -183,6 +187,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -209,6 +214,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -237,6 +243,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -266,6 +273,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -304,6 +312,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib 
dgetri.veclib cgetri.veclib zgetri.veclib \ @@ -1108,6 +1117,72 @@ zgemv.mkl : zgemv.$(SUFFIX) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : 
ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2177,6 +2252,18 @@ cgemv.$(SUFFIX) : gemv.c zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/trmv.c b/benchmark/trmv.c new file mode 100644 index 000000000..969f4f1d4 --- /dev/null +++ b/benchmark/trmv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TRMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TRMV BLASFUNC(dtrmv) +#else +#define TRMV BLASFUNC(strmv) +#endif + +#else + +#ifdef DOUBLE +#define TRMV BLASFUNC(ztrmv) +#else +#define TRMV BLASFUNC(ctrmv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + long n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT 
*)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 2afc0748039d5adaede2780e2a30e628a843a0a1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 3 Mar 2020 12:35:10 -0600 Subject: [PATCH 032/593] Fix DYNAMIC_ARCH build for POWER9 Setting DYNAMIC_ARCH=1 on POWER9 does not build POWER9 files due to some compiler version checks. This patch fixes some of the macros that are used to check compiler version. On fixing those checks, there are some new make failures related to icamin, icamax, isamin, isamax and caxpy files on POWER9. This patch fixes those failures as well. 
--- Makefile.system | 2 +- driver/others/dynamic_power.c | 8 ++++---- kernel/power/caxpy_power9.S | 4 ++++ kernel/power/icamax_power9.S | 7 +++++++ kernel/power/icamin_power9.S | 7 +++++++ kernel/power/isamax_power9.S | 7 +++++++ kernel/power/isamin_power9.S | 7 +++++++ 7 files changed, 37 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1e30d05a8..829c08f16 100644 --- a/Makefile.system +++ b/Makefile.system @@ -327,7 +327,6 @@ ifeq ($(C_COMPILER), GCC) #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) @@ -575,6 +574,7 @@ ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 endif ifeq ($(C_COMPILER), GCC) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 1dec5f4b3..8c831b998 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -3,7 +3,7 @@ extern gotoblas_t gotoblas_POWER6; extern gotoblas_t gotoblas_POWER8; -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif @@ -21,7 +21,7 @@ static char *corename[] = { char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER6) return corename[1]; if (gotoblas == &gotoblas_POWER8) return corename[2]; -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif return 
corename[0]; @@ -33,7 +33,7 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER6; if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; #endif @@ -59,7 +59,7 @@ static gotoblas_t *force_coretype(char * coretype) { { case 1: return (&gotoblas_POWER6); case 2: return (&gotoblas_POWER8); -#if (!defined C_GCC) || (GCC_VERSION >= 60000) +#if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); #endif default: return NULL; diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S index 844cacd50..b4733ff9f 100644 --- a/kernel/power/caxpy_power9.S +++ b/kernel/power/caxpy_power9.S @@ -13,7 +13,11 @@ PROLOGUE +#ifdef CONJ +caxpyc_k: +#else caxpy_k: +#endif .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l diff --git a/kernel/power/icamax_power9.S b/kernel/power/icamax_power9.S index 2968b3f8b..bf6ab6e82 100644 --- a/kernel/power/icamax_power9.S +++ b/kernel/power/icamax_power9.S @@ -1,3 +1,4 @@ +/* .file "icamax.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl icamax_k .type icamax_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + icamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha diff --git a/kernel/power/icamin_power9.S b/kernel/power/icamin_power9.S index 8eaa79f33..58a3c53a3 100644 --- a/kernel/power/icamin_power9.S +++ b/kernel/power/icamin_power9.S @@ -1,3 +1,4 @@ +/* .file "icamin.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl icamin_k .type icamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + icamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha diff --git a/kernel/power/isamax_power9.S b/kernel/power/isamax_power9.S index 9df1e773c..259c996fc 100644 --- a/kernel/power/isamax_power9.S +++ b/kernel/power/isamax_power9.S @@ -1,3 +1,4 @@ +/* .file "isamax.c" .abiversion 2 
.section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl isamax_k .type isamax_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + isamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha diff --git a/kernel/power/isamin_power9.S b/kernel/power/isamin_power9.S index 0475edf46..36486ff02 100644 --- a/kernel/power/isamin_power9.S +++ b/kernel/power/isamin_power9.S @@ -1,3 +1,4 @@ +/* .file "isamin.c" .abiversion 2 .section ".text" @@ -5,6 +6,12 @@ .p2align 4,,15 .globl isamin_k .type isamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + isamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha From 635c9e4e098415266593341df3575d7401295800 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Mar 2020 21:04:12 +0100 Subject: [PATCH 033/593] Restore initializers for mutex and conditional --- driver/others/blas_server.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index ce028a7fc..3d2d5ef7a 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -571,6 +571,9 @@ int blas_thread_init(void){ atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; + pthread_mutex_init(&thread_status[i].lock, NULL); + pthread_cond_init (&thread_status[i].wakeup, NULL) + #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, &blas_thread_server, (void *)i); From d68e4ba59bd71f0515e27fc8ce2eb1fd9c94f63f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Mar 2020 21:37:48 +0100 Subject: [PATCH 034/593] Fix cut/paste glitch --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 3d2d5ef7a..aa0644845 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -572,7 +572,7 @@ int blas_thread_init(void){ thread_status[i].status = THREAD_STATUS_WAKEUP; 
pthread_mutex_init(&thread_status[i].lock, NULL); - pthread_cond_init (&thread_status[i].wakeup, NULL) + pthread_cond_init (&thread_status[i].wakeup, NULL); #ifdef NEED_STACKATTR ret=pthread_create(&blas_threads[i], &attr, From 114dbec947277f806b73b0e8392849d07847d40c Mon Sep 17 00:00:00 2001 From: Darkness303 <1010287144@qq.com> Date: Wed, 4 Mar 2020 14:09:10 +0800 Subject: [PATCH 035/593] 1.Add syr2 benchmark 2.Fixed some errors --- benchmark/Makefile | 47 ++++++++++- benchmark/syr.c | 4 +- benchmark/syr2.c | 194 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+), 4 deletions(-) create mode 100644 benchmark/syr2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 1d4a220e4..b9ffbf381 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -57,6 +57,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -85,6 +86,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -111,7 +113,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - ssyr.goto dsyr.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -140,6 
+143,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -167,6 +171,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -194,6 +199,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -221,6 +227,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -251,6 +258,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -289,6 +297,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ ssyr.veclib dsyr.veclib \ + 
ssyr2.veclib dsyr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ @@ -807,6 +816,36 @@ dsyr.mkl : dsyr.$(SUFFIX) dsyr.veclib : dsyr.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -2123,6 +2162,12 @@ ssyr.$(SUFFIX) : syr.c dsyr.$(SUFFIX) : syr.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX 
-UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/syr.c b/benchmark/syr.c index 91b5b5904..458bc6edb 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -173,11 +173,9 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6); } diff --git a/benchmark/syr2.c b/benchmark/syr2.c new file mode 100644 index 000000000..0129dd09a --- /dev/null +++ b/benchmark/syr2.c @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SYR2 + + +#ifdef DOUBLE +#define SYR2 BLASFUNC(dsyr2) +#else +#define SYR2 BLASFUNC(ssyr2) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + 
shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *a; + FLOAT alpha[] = {1.0, 1.0}; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + blasint m, i, j; + blasint inc_x= 1; + blasint inc_y= 1; + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d Inc_y = %d\n", from, to, step,uplo,inc_x,inc_y); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + gettimeofday( &start, (struct timezone *)0); + + SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 
2. * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From de74e116411b230a6e4b36d06053fe468738f6c8 Mon Sep 17 00:00:00 2001 From: q00437336 Date: Wed, 4 Mar 2020 02:57:33 -0500 Subject: [PATCH 036/593] add benchmark for trsv --- benchmark/Makefile | 87 ++++++++++++++++++ benchmark/trsv.c | 222 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) create mode 100644 benchmark/trsv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..cf8ab3416 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -74,6 +74,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ @@ -102,6 +103,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -131,6 +133,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -159,6 +162,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl 
\ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -188,6 +192,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -215,6 +220,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ @@ -244,6 +250,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ @@ -274,6 +281,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ @@ -313,6 +321,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + strsv.veclib 
dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ @@ -1183,6 +1192,72 @@ ztrmv.mkl : ztrmv.$(SUFFIX) ztrmv.veclib : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2264,6 +2339,18 @@ ctrmv.$(SUFFIX) : trmv.c ztrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/trsv.c b/benchmark/trsv.c new file mode 100644 index 000000000..8652eb331 --- /dev/null +++ b/benchmark/trsv.c @@ -0,0 +1,222 @@ 
+/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include +#include "common.h" + + +#undef GEMV +#undef TRSV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TRSV BLASFUNC(dtrsv) +#else +#define TRSV BLASFUNC(strsv) +#endif + +#else + +#ifdef DOUBLE +#define TRSV BLASFUNC(ztrsv) +#else +#define TRSV BLASFUNC(ctrsv) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x; + blasint n = 0, i, j; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec time_start, time_end; + time_t seconds = 0; + + double time1,timeg; + long long 
nanos = 0; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + char uplo ='L'; + char transa = 'N'; + char diag ='U'; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_TRANSA"))) transa=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Transa = '%c' Inc_x = %d uplo=%c diag=%c loop = %d\n", from, to, step,transa,inc_x, + uplo,diag,loops); + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + fprintf(stderr, "============================================\n"); + + for(n = from; n <= to; n += step) + { + timeg=0; + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * n * n * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * n * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + for(j = 0; j < n; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(l =0;l< loops;l++){ + + clock_gettime(CLOCK_REALTIME,&time_start); + + TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); + + clock_gettime(CLOCK_REALTIME,&time_end); + nanos = time_end.tv_nsec - time_start.tv_nsec; + seconds = time_end.tv_sec - time_start.tv_sec; + + time1 = seconds + nanos /1.e9; + timeg += time1; + } + + + timeg /= loops; + long long muls = n*(n+1)/2.0; + long long adds = (n - 1.0)*n/2.0; + + fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); + if(a != NULL){ + free(a); + } + + if( x != NULL){ + free(x); + } + + } + + return 0; +} + From 
13f9afbd997ffa9bf30a861ec243559fc09e45f5 Mon Sep 17 00:00:00 2001 From: l00546269 Date: Wed, 4 Mar 2020 16:47:23 +0800 Subject: [PATCH 037/593] [OpenBLAS]:modified the Makefile [Description]:add c/fortran compiler version information in final note --- Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index a22e16bab..018855a2a 100644 --- a/Makefile +++ b/Makefile @@ -56,10 +56,11 @@ ifneq ($(INTERFACE64), 0) @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " endif endif - - @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" + @cverinfo=`$(CC) --version | sed -n '1p'`; \ + echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})" ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" + @fverinfo=`$(FC) --version | sed -n '1p'`; \ + echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})" endif ifneq ($(OSNAME), AIX) @echo -n " Library Name ...
$(LIBNAME)" @@ -68,9 +69,9 @@ else endif ifndef SMP - @echo " (Single threaded) " + @echo " (Single-threading) " else - @echo " (Multi threaded; Max num-threads is $(NUM_THREADS))" + @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" endif ifeq ($(USE_OPENMP), 1) From 233838b4bca6dbc08a6db3899629b7393b10fc19 Mon Sep 17 00:00:00 2001 From: q00437336 Date: Wed, 4 Mar 2020 03:54:40 -0500 Subject: [PATCH 038/593] change clock to CLOCK_PROCESS_CPUTIME_ID --- benchmark/trsv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 8652eb331..c60890de4 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -189,11 +189,11 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_REALTIME,&time_start); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - clock_gettime(CLOCK_REALTIME,&time_end); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); nanos = time_end.tv_nsec - time_start.tv_nsec; seconds = time_end.tv_sec - time_start.tv_sec; From 0f1a2b12f91b83c39721cbf1585bafa5fa1663fc Mon Sep 17 00:00:00 2001 From: s00527847 Date: Wed, 4 Mar 2020 15:50:19 -0500 Subject: [PATCH 039/593] add benchmark for spr/spr2 --- benchmark/Makefile | 96 ++++++++++++++++++++- benchmark/spr.c | 198 +++++++++++++++++++++++++++++++++++++++++++ benchmark/spr2.c | 207 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 500 insertions(+), 1 deletion(-) create mode 100755 benchmark/spr.c create mode 100755 benchmark/spr2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..660f44fee 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -57,6 +57,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ 
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -86,6 +88,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -113,7 +117,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - ssyr.goto dsyr.atlas \ + ssyr.atlas dsyr.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -143,6 +149,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -171,6 +179,8 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ ssyr.goto dsyr.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ @@ -199,6 +209,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ ssyr.acml dsyr.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml 
\ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ @@ -227,6 +239,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ ssyr.atlas dsyr.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ @@ -258,6 +272,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ ssyr.mkl dsyr.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ @@ -297,6 +313,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ ssyr.veclib dsyr.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ @@ -816,6 +834,70 @@ dsyr.mkl : dsyr.$(SUFFIX) dsyr.veclib : dsyr.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -2197,6 +2279,18 @@ ssyr.$(SUFFIX) : syr.c dsyr.$(SUFFIX) : syr.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/spr.c b/benchmark/spr.c new file mode 100755 index 000000000..61a972c08 --- /dev/null +++ b/benchmark/spr.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SPR + +#ifdef DOUBLE +#define SPR BLASFUNC(dspr) +#else +#define SPR BLASFUNC(sspr) +#endif + + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | 
IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a,*c; + FLOAT alpha[] = {1.0, 1.0}; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time\n"); + + for(m = from; m <= to; m += step) + { + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SPR2 + +#ifdef DOUBLE +#define SPR2 BLASFUNC(dspr2) +#else +#define SPR2 BLASFUNC(sspr2) +#endif + + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) 
+ { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a,*b,*c; + FLOAT alpha[] = {1.0, 1.0}; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + char uplo='U'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d Inc_y = %d\n", from, to, step,uplo,inc_x,inc_y); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + 
fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time\n"); + + for(m = from; m <= to; m += step) + { + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l=0; l Date: Thu, 5 Mar 2020 09:55:16 +0800 Subject: [PATCH 040/593] Add benchmark file rotm.c and modify benchmark/Makefile to test s/drotm modified: benchmark/Makefile new file: benchmark/rotm.c --- benchmark/Makefile | 44 ++++++++++ benchmark/rotm.c | 210 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 benchmark/rotm.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..699670e33 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -62,6 +62,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ srot.goto drot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -90,6 +91,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -118,6 +120,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -147,6 +150,7 @@ mkl 
:: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ + srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -176,6 +180,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ srot.goto drot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -203,6 +208,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -231,6 +237,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -262,6 +269,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -301,6 +309,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib
cdot.veclib zdot.veclib \ + srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ @@ -1639,6 +1648,37 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) @@ -2432,7 +2472,11 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +srotm.$(SUFFIX) : rotm.c + $(CC) 
$(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ diff --git a/benchmark/rotm.c b/benchmark/rotm.c new file mode 100644 index 000000000..8dea2d08c --- /dev/null +++ b/benchmark/rotm.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef ROTM + +#ifdef DOUBLE +#define ROTM BLASFUNC(drotm) +#else +#define ROTM BLASFUNC(srotm) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz) +{ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid = + shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT | 0600)) < 0) { + printf("Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf("Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x = 1, inc_y = 1; + FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) { + to = MAX(atol(*argv), 
from); + argc--; + argv++; + } + if (argc > 0) { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) + inc_y = atoi(p); + + fprintf( + stderr, + "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, inc_x, inc_y, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + + if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + gettimeofday(&start, (struct timezone *)0); + + ROTM(&m, x, &inc_x, y, &inc_y, param); + + gettimeofday(&stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + timeg += time1; + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. 
* (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} From 32c847df45c43a043f720049eada5d727026e26c Mon Sep 17 00:00:00 2001 From: chenxuqiang Date: Fri, 6 Mar 2020 01:02:02 -0500 Subject: [PATCH 041/593] benchmark/hpmv&hbmv: add benchmark/hpmv.c and benchmark/hbmv.c Signed-off-by: Xuqiang Chen chenxuqiang3@hisilicon.com --- benchmark/Makefile | 93 ++++++++++++++++++++ benchmark/hbmv.c | 210 +++++++++++++++++++++++++++++++++++++++++++++ benchmark/hpmv.c | 207 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 510 insertions(+) create mode 100644 benchmark/hbmv.c create mode 100644 benchmark/hpmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index c037dd6d6..c554a57b1 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -69,6 +69,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -97,6 +99,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -125,6 +129,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -154,6 +160,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ 
chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -183,6 +191,8 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ @@ -210,6 +220,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ @@ -238,6 +250,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ @@ -269,6 +283,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ @@ -308,6 +324,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ @@ -1542,7 +1560,70 @@ zhemv.mkl : zhemv.$(SUFFIX) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### +chbmv.goto : 
chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + 
+zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2331,6 +2412,18 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c new file mode 100644 index 000000000..b9dcc03bb --- /dev/null +++ b/benchmark/hbmv.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HBMV + + +#ifdef DOUBLE +#define HBMV BLASFUNC(zhbmv) +#else +#define HBMV BLASFUNC(chbmv) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz) { + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) { + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; + blasint k = 1; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { 
to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_K"))) k = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, uplo, k, inc_x, inc_y, loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + gettimeofday( &start, (struct timezone *)0); + + HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. 
* (double)(2 * k + 1) * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c new file mode 100644 index 000000000..6e6634fcf --- /dev/null +++ b/benchmark/hpmv.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HPMV + + +#ifdef DOUBLE +#define HPMV BLASFUNC(zhpmv) +#else +#define HPMV BLASFUNC(chpmv) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz) { + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) { + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = 
MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + gettimeofday( &start, (struct timezone *)0); + + HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 208c7e7ca50a8bfdfabbec750bdc538023c94aed Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Mon, 24 Feb 2020 05:45:30 +0000 Subject: [PATCH 042/593] Use acq/rel semantics to pass flags/pointers in getrf_parallel. The current implementation has locks, but the locks each only have a critical section of one variable so atomic reads/writes with barriers can be used to achieve the same behavior. Like the previous patch, pthread_mutex_lock isn't fair, so in a tight loop the previous thread that has the lock can keep it starving another thread, even if that thread is about to write the data that will stop the current thread from spinning. On a 64c Arm system this improves performance by 20x on sgesv.goto. --- lapack/getrf/getrf_parallel.c | 119 +++++++++++++--------------------- 1 file changed, 44 insertions(+), 75 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c82defcab..c602822a8 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -68,25 +68,16 @@ double sqrt(double); #define GETRF_FACTOR 1.00 -#if defined(USE_PTHREAD_LOCK) -static pthread_mutex_t getrf_lock = PTHREAD_MUTEX_INITIALIZER; -#elif defined(USE_PTHREAD_SPINLOCK) -static pthread_spinlock_t getrf_lock = 0; +#if (__STDC_VERSION__ >= 201112L) +#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) +#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else -static BLASULONG getrf_lock = 0UL; -#endif - -#if defined(USE_PTHREAD_LOCK) -static pthread_mutex_t getrf_flag_lock = PTHREAD_MUTEX_INITIALIZER; -#elif defined(USE_PTHREAD_SPINLOCK) -static pthread_spinlock_t getrf_flag_lock = 0; -#else -static BLASULONG getrf_flag_lock = 0UL; +#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p)) +#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v) #endif 
- static __inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); @@ -119,11 +110,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; FLOAT *sbb = sb; -#if __STDC_VERSION__ >= 201112L - _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; -#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; -#endif blasint *ipiv = (blasint *)args -> c; @@ -180,7 +167,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra } } - if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0; + if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) { + MB; + atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0); + } for (is = 0; is < m; is += GEMM_P){ min_i = m - is; @@ -201,14 +191,10 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra /* Non blocking implementation */ typedef struct { -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; + #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); @@ -246,11 +232,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * blasint *ipiv = (blasint *)args -> c; BLASLONG jw; -#if __STDC_VERSION__ >= 201112L - _Atomic BLASLONG *flag = (_Atomic BLASLONG *)args -> d; -#else volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; -#endif + if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + 
GEMM_OFFSET_B); @@ -280,10 +263,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * #if 1 { do { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); + jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside]); } while (jw); + MB; } #else while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; @@ -326,21 +308,17 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * } MB; for (i = 0; i < args -> nthreads; i++) { - LOCK_COMMAND(&getrf_lock); - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - UNLOCK_COMMAND(&getrf_lock); + atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]); } } - LOCK_COMMAND(&getrf_flag_lock); - flag[mypos * CACHE_LINE_SIZE] = 0; - UNLOCK_COMMAND(&getrf_flag_lock); + MB; + atomic_store_long(&flag[mypos * CACHE_LINE_SIZE], 0); if (m == 0) { + MB; for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - LOCK_COMMAND(&getrf_lock); - job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; - UNLOCK_COMMAND(&getrf_lock); + atomic_store_long(&job[mypos].working[mypos][CACHE_LINE_SIZE * xxx], 0); } } @@ -366,10 +344,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 do { - LOCK_COMMAND(&getrf_lock); - jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); - } while (jw == 0); + jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]); + } while (jw == 0); + MB; #else while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; #endif @@ -381,9 +358,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * MB; if (is + min_i >= m) { - LOCK_COMMAND(&getrf_lock); - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; - UNLOCK_COMMAND(&getrf_lock); 
+ atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], 0); } } @@ -397,10 +372,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 do { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; - UNLOCK_COMMAND(&getrf_lock); + jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE *xxx]); } while(jw != 0); + MB; #else while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; #endif @@ -443,12 +417,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef _MSC_VER BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE]; #else -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #endif #ifndef COMPLEX @@ -543,7 +512,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (width > mn - is - bk) width = mn - is - bk; } - if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); + + if (num_cpu > 0) { + WMB; + exec_blas_async_wait(num_cpu, &queue[0]); + } mm = m - bk - is; nn = n - bk - is; @@ -608,7 +581,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - flag[num_cpu * CACHE_LINE_SIZE] = 1; + atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1); num_cpu ++; @@ -637,6 +610,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (num_cpu > 0) { queue[num_cpu - 1].next = NULL; + WMB; + exec_blas_async(0, &queue[0]); inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); @@ -647,14 +622,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (i = 0; i < num_cpu; i ++) { #if 1 - 
LOCK_COMMAND(&getrf_flag_lock); - f=flag[i*CACHE_LINE_SIZE]; - UNLOCK_COMMAND(&getrf_flag_lock); - while (f!=0) { - LOCK_COMMAND(&getrf_flag_lock); - f=flag[i*CACHE_LINE_SIZE]; - UNLOCK_COMMAND(&getrf_flag_lock); - }; + do { + f = atomic_load_long(&flag[i*CACHE_LINE_SIZE]); + } while (f != 0); + MB; #else while (flag[i*CACHE_LINE_SIZE]) {}; #endif @@ -719,12 +690,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; -#if __STDC_VERSION__ >= 201112L - _Atomic -#else - volatile -#endif - BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE @@ -833,6 +799,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, nn = n - bk - is; if (width > nn) width = nn; + WMB; + if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]); range[0] = 0; @@ -867,7 +835,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - flag[num_cpu * CACHE_LINE_SIZE] = 1; + atomic_store_long(&flag[num_cpu * CACHE_LINE_SIZE], 1); num_cpu ++; } @@ -882,6 +850,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; + WMB; if (num_cpu > 1) { exec_blas_async(1, &queue[1]); @@ -917,7 +886,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #endif - for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; + for (i = 1; i < num_cpu; i ++) while (atomic_load_long(&flag[i * CACHE_LINE_SIZE])) {}; TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); From 37f46f2fa08d4946080edb6215aae61bc0c6f03a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 6 Mar 2020 
15:37:26 +0100 Subject: [PATCH 043/593] Fix another spot where make was used instead of $(MAKE) Broke lapack-testing on BSD as their default "make" does not support GNU Makefile syntax --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a22e16bab..82d2402a3 100644 --- a/Makefile +++ b/Makefile @@ -317,7 +317,7 @@ lapack-test : $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc ifneq ($(CROSS), 1) - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \ + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \ ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) endif From 8a8df530e296144e041d0c26d976dfae5c9019fe Mon Sep 17 00:00:00 2001 From: l00546269 Date: Sat, 7 Mar 2020 10:14:33 +0800 Subject: [PATCH 044/593] [OpenBLAS]:modifed the Makefile [Description]: check the compiler version and show the detail info --- Makefile | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 018855a2a..c7ba1ff54 100644 --- a/Makefile +++ b/Makefile @@ -56,11 +56,21 @@ ifneq ($(INTERFACE64), 0) @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " endif endif - @cverinfo=`$(CC) --version | sed -n '1p'`; \ - echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})" + @$(CC) --version > /dev/null 2>&1;\ + if [ $$? -eq 0 ]; then \ + cverinfo=`$(CC) --version | sed -n '1p'`; \ + echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ + else \ + echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ + fi ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - @fverinfo=`$(FC) --version | sed -n '1p'`; \ - echo " Fortran compiler ... 
$(F_COMPILER) (cmd & version : $${fverinfo})" + @$(FC) --version > /dev/null 2>&1;\ + if [ $$? -eq 0 ]; then \ + fverinfo=`$(FC) --version | sed -n '1p'`; \ + echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ + else \ + echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ + fi endif ifneq ($(OSNAME), AIX) @echo -n " Library Name ... $(LIBNAME)" From 09c7a191bdf0eecf2c2678ad900660f5e875c745 Mon Sep 17 00:00:00 2001 From: shengyang Date: Sat, 7 Mar 2020 15:17:49 +0800 Subject: [PATCH 045/593] add benchmark for csrot and zdrot modified: benchmark/Makefile modified: benchmark/rot.c --- benchmark/Makefile | 49 ++++++++++++++++++++++++++++++++++++++++++++-- benchmark/rot.c | 36 +++++++++++++++++++++------------- 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index ed94db5b5..4a1ecaf7b 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -64,7 +64,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ - srot.goto drot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ @@ -100,6 +100,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ @@ -135,6 +136,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ srotm.atlas drotm.atlas \ saxpy.atlas 
daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ @@ -171,6 +173,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ @@ -206,7 +209,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ @@ -241,6 +244,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ @@ -276,6 +280,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ @@ -314,6 +319,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ srotm.atlas drotm.atlas \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ @@ -360,6 +366,7 @@ veclib :: slinpack.veclib dlinpack.veclib 
clinpack.veclib zlinpack.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ @@ -1925,6 +1932,38 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot #################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### srotm #################################################### srotm.goto : srotm.$(SUFFIX) 
../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2791,6 +2830,12 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + srotm.$(SUFFIX) : rotm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/rot.c b/benchmark/rot.c index 3ff783cc6..8ec8b1d97 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -32,9 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #include "common.h" +#undef ROT -#undef DOT - +#ifndef COMPLEX #ifdef DOUBLE #define ROT BLASFUNC(drot) @@ -42,6 +42,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ROT BLASFUNC(srot) #endif +#else + +#ifdef DOUBLE +#define ROT BLASFUNC(zdrot) +#else +#define ROT BLASFUNC(csrot) +#endif + +#endif #if defined(__WIN32__) || defined(__WIN64__) @@ -160,17 +169,16 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } for (l=0; l Date: Sat, 7 Mar 2020 17:48:55 +0800 Subject: [PATCH 046/593] Add benchmark file axpby.c and modify benchmark/Makefile to test s/d/c/zaxpby --- benchmark/Makefile | 102 +++++++++++++++++++++-- benchmark/axpby.c | 202 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+), 8 deletions(-) create mode 100644 benchmark/axpby.c diff --git a/benchmark/Makefile b/benchmark/Makefile index ed94db5b5..401db7f5b 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -85,7 +85,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto 
dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -120,7 +121,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ @@ -156,7 +158,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ @@ -191,7 +194,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto else @@ -226,6 +230,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto 
dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -261,7 +266,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ @@ -299,7 +305,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ @@ -334,7 +341,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto @@ -380,7 +388,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto goto_3m :: cgemm3m.goto zgemm3m.goto @@ -2023,7 +2032,72 @@ zaxpy.mkl 
: zaxpy.$(SUFFIX) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2722,6 +2796,18 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + scopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/axpby.c b/benchmark/axpby.c new file mode 100644 index 000000000..3b3dd9979 --- /dev/null +++ b/benchmark/axpby.c @@ -0,0 +1,202 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef AXPBY + +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPBY BLASFUNC(zaxpby) +#else +#define AXPBY BLASFUNC(caxpby) +#endif +#else +#ifdef DOUBLE +#define AXPBY BLASFUNC(daxpby) +#else +#define AXPBY BLASFUNC(saxpby) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + FLOAT alpha[2] = { 2.0, 2.0 }; + FLOAT beta[2] = {2.0, 2.0}; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + 
argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l=0; l Date: Wed, 4 Mar 2020 17:44:50 -0500 Subject: [PATCH 047/593] Remove redundant code --- benchmark/hemm.c | 2 -- benchmark/her2k.c | 2 -- benchmark/herk.c | 2 -- benchmark/symm.c | 2 -- benchmark/syr2k.c | 2 -- benchmark/syrk.c | 2 -- 6 files changed, 12 deletions(-) diff --git a/benchmark/hemm.c b/benchmark/hemm.c index a0c549292..2fe0f5c5f 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -178,8 +178,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 55421878a..a0772feff 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -177,8 +177,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/herk.c b/benchmark/herk.c index bd336e6b1..eed8ed738 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -175,8 +175,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/symm.c b/benchmark/symm.c index 9c26d92fe..b979e8d51 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -189,8 +189,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index 6b51e4f2b..b1fcd8a18 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -189,8 +189,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 06582b861..95625a6c4 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -185,8 +185,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); From bbeda55b7b528ae1926fb8074cdc9bddf7720e08 Mon Sep 17 00:00:00 2001 From: s00527847 Date: Sat, 7 Mar 2020 13:09:19 -0500 Subject: [PATCH 048/593] add trmm.c --- benchmark/trmm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 6a5e59c7b..e095b85ee 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -188,8 +188,6 @@ int main(int argc, char *argv[]){ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - gettimeofday( &start, (struct timezone *)0); - fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); From c5bdd2135228c14ae2ad536e2c2f27578721c063 Mon Sep 17 00:00:00 2001 From: s00548429 Date: Mon, 9 Mar 2020 14:59:03 +0800 Subject: [PATCH 049/593] Add benchmark for ?amax, ?max, ?amin, ?min, i?max, i?amin and i?min. 
--- benchmark/Makefile | 6100 +++++++++++++++++++++++--------------------- benchmark/amax.c | 191 ++ benchmark/amin.c | 192 ++ benchmark/iamin.c | 192 ++ benchmark/imax.c | 186 ++ benchmark/imin.c | 186 ++ benchmark/max.c | 185 ++ benchmark/min.c | 185 ++ 8 files changed, 4444 insertions(+), 2973 deletions(-) create mode 100644 benchmark/amax.c create mode 100644 benchmark/amin.c create mode 100644 benchmark/iamin.c create mode 100644 benchmark/imax.c create mode 100644 benchmark/imin.c create mode 100644 benchmark/max.c create mode 100644 benchmark/min.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 139496085..2db873e95 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,2973 +1,3127 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system - -# ACML standard -#ACML=/opt/acml5.3.1/gfortran64_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML custom -#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML 6.1 custom -ACML=/home/saar/acml6.1/gfortran64_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - - -# Atlas Ubuntu -#ATLAS=/usr/lib/atlas-base -#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Atlas RHEL and Fedora -ATLAS=/usr/lib64/atlas -LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Intel standard -# MKL=/opt/intel/mkl/lib/intel64 -# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Intel custom -MKL=/home/saar/intel_mkl -LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Apple vecLib -LIBVECLIB = -framework Accelerate - -ESSL=/opt/ibm/lib -#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a -LIBESSL 
= -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a - -ifneq ($(NO_LAPACK), 1) -GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - csymv.goto zsymv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto -else -GOTO_LAPACK_TARGETS= -endif - -ifeq ($(OSNAME), WINNT) - -goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto csymv.goto zsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto 
dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - 
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl 
dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -else - -goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling \ - isamax.goto idamax.goto icamax.goto izamax.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) - -acml :: slinpack.acml dlinpack.acml 
clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas 
dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ 
- sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - - - - -endif - -essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ - scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ - strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl - -veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ - scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ - sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ - strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ - strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ - ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ - ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ - sger.veclib dger.veclib cger.veclib zger.veclib \ - sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ - saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ - scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ - sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ - sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ - sasum.veclib dasum.veclib casum.veclib zasum.veclib \ - ssymv.veclib dsymv.veclib csymv.veclib 
zsymv.veclib \ - chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - chpmv.veclib zhpmv.veclib \ - chemm.veclib zhemm.veclib \ - cherk.veclib zherk.veclib \ - cher2k.veclib zher2k.veclib \ - sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ - sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ - sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ - sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ - spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto - -goto_3m :: cgemm3m.goto zgemm3m.goto - -mkl_3m :: cgemm3m.mkl zgemm3m.mkl - -all :: goto mkl atlas acml veclib - -exe : - @./Make_exe.sh - -##################################### Slinpack #################################################### -slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -slinpack.acml : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.atlas : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.mkl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.veclib : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.essl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dlinpack #################################################### -dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dlinpack.acml : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -dlinpack.atlas : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.mkl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.veclib : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.essl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Clinpack #################################################### - -clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -clinpack.acml : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.atlas : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.mkl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.veclib : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.essl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zlinpack #################################################### - -zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zlinpack.acml : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.atlas : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.mkl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.veclib : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -zlinpack.essl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scholesky ################################################### - -scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scholesky.acml : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.atlas : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.mkl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.veclib : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.essl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcholesky ################################################### - -dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcholesky.acml : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.atlas : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.mkl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.veclib : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.essl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccholesky ################################################### - -ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm - -ccholesky.acml : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.atlas : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.mkl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.veclib : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.essl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Zcholesky ################################################### - -zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcholesky.acml : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.atlas : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.mkl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.veclib : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.essl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemm #################################################### -sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemm.acml : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.atlas : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.mkl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -sgemm.veclib : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.essl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemm #################################################### -dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemm.acml : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.atlas : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.mkl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.veclib : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.essl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemm #################################################### - -cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm.acml : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.atlas : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.mkl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.veclib : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.essl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm #################################################### - -zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm.acml : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.atlas : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.mkl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.veclib : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.essl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymm #################################################### -ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymm.acml : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.atlas : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.mkl : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.veclib : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymm #################################################### -dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymm.acml : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.atlas : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.mkl : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.veclib : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymm 
#################################################### - -csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymm.acml : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.atlas : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.mkl : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.veclib : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsymm #################################################### - -zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymm.acml : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.atlas : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.mkl : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.veclib : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmm #################################################### -strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmm.acml : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.atlas : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.mkl : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.veclib : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.essl : strmm.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmm #################################################### -dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmm.acml : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.atlas : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.mkl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.veclib : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.essl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmm #################################################### - -ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmm.acml : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.atlas : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.mkl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.veclib : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.essl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmm #################################################### - -ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmm.acml : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.atlas : 
ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.mkl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.veclib : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.essl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsm #################################################### -strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsm.acml : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.atlas : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.mkl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.veclib : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.essl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsm #################################################### -dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsm.acml : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.atlas : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.mkl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.veclib : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.essl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-##################################### Ctrsm #################################################### - -ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsm.acml : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.atlas : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.mkl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.veclib : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.essl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsm #################################################### - -ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsm.acml : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.atlas : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.mkl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.veclib : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.essl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ssyr #################################################### -ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr.acml : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.atlas : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-ssyr.mkl : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.veclib : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr #################################################### -dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr.acml : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.atlas : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.mkl : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.veclib : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr #################################################### -sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr.acml : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.atlas : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.mkl : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.veclib : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr #################################################### -dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr.acml : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.atlas : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.mkl : dspr.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.veclib : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr2 #################################################### -sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr2.acml : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.atlas : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.mkl : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.veclib : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr2 #################################################### -dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr2.acml : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.atlas : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.mkl : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.veclib : dspr2.$(SUFFIX) - -##################################### Ssyr2 #################################################### -ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2.acml : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.atlas : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.mkl : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-ssyr2.veclib : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr2 #################################################### -dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2.acml : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.atlas : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.mkl : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.veclib : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyrk #################################################### -ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyrk.acml : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.atlas : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.mkl : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.veclib : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyrk #################################################### -dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyrk.acml : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.atlas : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.mkl : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
- -dsyrk.veclib : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyrk #################################################### - -csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyrk.acml : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.atlas : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.mkl : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.veclib : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyrk #################################################### - -zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyrk.acml : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.atlas : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.mkl : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.veclib : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyr2k #################################################### -ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2k.acml : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.atlas : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.mkl : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -ssyr2k.veclib : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyr2k #################################################### -dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2k.acml : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.atlas : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.mkl : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.veclib : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyr2k #################################################### - -csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyr2k.acml : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.atlas : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.mkl : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.veclib : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyr2k #################################################### - -zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyr2k.acml : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.atlas : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.mkl : zsyr2k.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.veclib : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemm #################################################### - -chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemm.acml : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.atlas : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.mkl : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.veclib : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemm #################################################### - -zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemm.acml : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.atlas : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.mkl : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.veclib : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cherk #################################################### - -cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cherk.acml : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.atlas : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.mkl : 
cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.veclib : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zherk #################################################### - -zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zherk.acml : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.atlas : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.mkl : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.veclib : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2k #################################################### - -cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2k.acml : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.atlas : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.mkl : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.veclib : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2k #################################################### - -zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2k.acml : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.atlas : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -zher2k.mkl : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.veclib : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemv.acml : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.atlas : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.mkl : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.veclib : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemv.acml : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.atlas : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.mkl : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.veclib : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemv.acml : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.atlas : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -cgemv.mkl : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.veclib : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemv.acml : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.atlas : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.mkl : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.veclib : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.veclib : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmv.acml : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.atlas : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.mkl : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.veclib : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmv #################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmv.acml : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.atlas : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.mkl : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.veclib : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsv.acml : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.atlas : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.mkl : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.veclib : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsv.acml : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.atlas : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.mkl : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.veclib : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsv #################################################### - -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsv.acml : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.atlas : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.mkl : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.veclib : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsv #################################################### - -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsv.acml : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.atlas : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.mkl : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.veclib : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sger #################################################### -sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sger.acml : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.atlas : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.mkl : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.veclib : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dger #################################################### -dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dger.acml : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.atlas : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.mkl : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.veclib : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cger #################################################### -cger.goto : cger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cger.acml : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.atlas : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.mkl : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.veclib : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zger #################################################### -zger.goto : zger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zger.acml : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.atlas : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.mkl : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.veclib : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymv #################################################### -ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymv.acml : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.atlas : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.mkl : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.veclib : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymv.acml : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.atlas : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.mkl : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.veclib : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymv #################################################### -csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymv.acml : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.atlas : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.mkl : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.veclib : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymv.acml : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.atlas : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.mkl : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.veclib : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgeev #################################################### -sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgeev.acml : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.atlas : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.mkl : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.veclib : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgeev #################################################### -dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgeev.acml : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.atlas : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.mkl : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.veclib : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgeev #################################################### - -cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgeev.acml : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.atlas : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.mkl : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.veclib : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgeev #################################################### - -zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgeev.acml : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.atlas : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.mkl : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.veclib : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgetri #################################################### -sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgetri.acml : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.atlas : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.mkl : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.veclib : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgetri #################################################### -dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgetri.acml : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.atlas : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.mkl : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.veclib : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgetri #################################################### - -cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgetri.acml : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.atlas : 
cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.mkl : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.veclib : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgetri #################################################### - -zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgetri.acml : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.atlas : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.mkl : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.veclib : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Spotrf #################################################### -spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -spotrf.acml : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.atlas : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.mkl : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.veclib : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dpotrf #################################################### -dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dpotrf.acml : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -dpotrf.atlas : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.mkl : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.veclib : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cpotrf #################################################### - -cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cpotrf.acml : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.atlas : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.mkl : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.veclib : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zpotrf #################################################### - -zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zpotrf.acml : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.atlas : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.mkl : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.veclib : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemv #################################################### - -chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemv.acml : chemv.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.atlas : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.mkl : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.veclib : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemv #################################################### - -zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemv.acml : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.atlas : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.mkl : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.veclib : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### - -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sdot.acml : sdot.$(SUFFIX) - 
$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.atlas : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.mkl : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.veclib : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ddot.acml : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.atlas : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.mkl : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.veclib : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cdot.acml : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.atlas : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.mkl : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.veclib : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdot.acml : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.atlas : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.mkl : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.veclib : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srot.acml : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.atlas : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.mkl : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.veclib : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drot.acml : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.atlas : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.mkl : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.veclib : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpy.acml : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.atlas : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.mkl : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.veclib : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpy.acml : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.atlas : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.mkl : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.veclib : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpy.acml : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.atlas : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.mkl : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.veclib : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpy.acml : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.atlas : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.mkl : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.veclib : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpby #################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) 
-o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### - -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scopy.acml 
: scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.atlas : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.mkl : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.veclib : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcopy.acml : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.atlas : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.mkl : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.veclib : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccopy.acml : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.atlas : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.mkl : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.veclib : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - 
-zcopy.acml : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.atlas : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.mkl : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.veclib : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sscal.acml : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.atlas : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.mkl : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.veclib : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dscal.acml : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.atlas : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.mkl : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.veclib : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm - -cscal.acml : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.atlas : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.mkl : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.veclib : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zscal.acml : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.atlas : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.mkl : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.veclib : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sasum.acml : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.atlas : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.mkl : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.veclib : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm - -dasum.acml : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.atlas : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.mkl : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.veclib : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -casum.acml : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.atlas : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.mkl : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.veclib : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zasum.acml : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.atlas : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.mkl : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.veclib : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm - -sswap.acml : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.atlas : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.mkl : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.veclib : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dswap.acml : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.atlas : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.mkl : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.veclib : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cswap.acml : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.atlas : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.mkl : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.veclib : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zswap.acml : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.atlas : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.mkl : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.veclib : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Sgesv #################################################### -sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgesv.acml : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.atlas : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.mkl : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.veclib : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgesv #################################################### -dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgesv.acml : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.atlas : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.mkl : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.veclib : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgesv #################################################### - -cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o 
$(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgesv.acml : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.atlas : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.mkl : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.veclib : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgesv #################################################### - -zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgesv.acml : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.atlas : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.mkl : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.veclib : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Cgemm3m #################################################### - -cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm3m.mkl : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm3m.veclib : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm3m #################################################### - -zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm3m.mkl : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm3m.veclib : 
zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -isamax.atlas : isamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -idamax.atlas : idamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -icamax.atlas : icamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -izamax.atlas : izamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## SNRM2 ############################################## -snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -snrm2.atlas : snrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## DNRM2 ############################################## -dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm - -dnrm2.atlas : dnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Sscnrm2 ############################################## -scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scnrm2.atlas : scnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Ddznrm2 ############################################## -dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dznrm2.atlas : dznrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -################################################################################################### - -slinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -clinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - 
-dsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - 
-csyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ssymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ 
- -dsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -spotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE 
-o $(@F) $^ - -cdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - 
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -sgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - - -cgemm3m.$(SUFFIX) : gemm3m.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm3m.$(SUFFIX) : gemm3m.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -snrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dnrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -scnrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -dznrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smallscaling: smallscaling.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread - -clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling - -include $(TOPDIR)/Makefile.tail - +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +# ACML standard +#ACML=/opt/acml5.3.1/gfortran64_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML custom +#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML 6.1 custom +ACML=/home/saar/acml6.1/gfortran64_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + + +# Atlas Ubuntu +#ATLAS=/usr/lib/atlas-base +#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Atlas RHEL and Fedora +ATLAS=/usr/lib64/atlas +LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Intel standard +# MKL=/opt/intel/mkl/lib/intel64 +# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Intel custom +MKL=/home/saar/intel_mkl +LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Apple vecLib +LIBVECLIB = -framework Accelerate + +ESSL=/opt/ibm/lib +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a + +ifneq ($(NO_LAPACK), 1) +GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + csymv.goto zsymv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto +else +GOTO_LAPACK_TARGETS= +endif + +ifeq ($(OSNAME), WINNT) + +goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgemm.goto dgemm.goto 
cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml 
zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +# atlas (WINNT branch): benchmark binaries linked against $(LIBATLAS); list only .atlas targets here. +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ +
cher2k.atlas zher2k.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + +else + +goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto 
ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling \ + isamax.goto idamax.goto icamax.goto izamax.goto \ + ismax.goto idmax.goto \ + isamin.goto idamin.goto icamin.goto izamin.goto \ + ismin.goto idmin.goto \ + samax.goto damax.goto camax.goto zamax.goto \ + smax.goto dmax.goto \ + samin.goto damin.goto camin.goto zamin.goto \ + smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + 
sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + 
chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +# mkl (non-WINNT branch): benchmark binaries linked against $(LIBMKL); list only .mkl targets here. +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl
dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + + + + +endif + +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl + +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ 
+ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib + +goto_3m :: cgemm3m.goto zgemm3m.goto + +mkl_3m :: cgemm3m.mkl zgemm3m.mkl + +all :: goto mkl atlas acml veclib + +exe : + @./Make_exe.sh + +##################################### Slinpack #################################################### +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.atlas : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dlinpack #################################################### +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.atlas : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +##################################### Clinpack #################################################### + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.atlas : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zlinpack #################################################### + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.atlas : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scholesky ################################################### + +scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +scholesky.atlas : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcholesky ################################################### + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.atlas : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccholesky ################################################### + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccholesky.acml : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.atlas : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.essl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Zcholesky ################################################### + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.atlas : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemm #################################################### +sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemm.acml : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.atlas : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.mkl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemm #################################################### +dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm + +dgemm.acml : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.atlas : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.mkl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemm #################################################### + +cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm.acml : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.atlas : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.mkl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm #################################################### + +zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm.acml : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.atlas : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.mkl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymm #################################################### +ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymm.acml : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.atlas : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.mkl : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymm #################################################### +dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymm.acml : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.atlas : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.mkl : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymm #################################################### + +csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymm.acml : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.atlas : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.mkl : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymm #################################################### + +zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymm.acml : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.atlas : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.mkl : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmm #################################################### +strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmm.acml : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.atlas : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.mkl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmm #################################################### +dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmm.acml : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.atlas : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.mkl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmm #################################################### + +ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmm.acml : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.atlas : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.mkl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmm #################################################### + +ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmm.acml : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.atlas : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.mkl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsm 
#################################################### +strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsm.acml : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.atlas : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.mkl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsm #################################################### +dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsm.acml : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.atlas : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.mkl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsm #################################################### + +ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsm.acml : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.atlas : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.mkl : ctrsm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsm #################################################### + +ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsm.acml : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.atlas : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.mkl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyrk #################################################### +ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyrk.acml : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.atlas : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.mkl : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyrk #################################################### +dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyrk.acml : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.atlas : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.mkl : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyrk #################################################### + +csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyrk.acml : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.atlas : csyrk.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.mkl : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyrk #################################################### + +zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyrk.acml : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.atlas : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.mkl : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyr2k #################################################### +ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2k.acml : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.atlas : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.mkl : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyr2k #################################################### +dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2k.acml : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+dsyr2k.atlas : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.mkl : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyr2k #################################################### + +csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyr2k.acml : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.atlas : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.mkl : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyr2k #################################################### + +zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyr2k.acml : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.atlas : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.mkl : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemm #################################################### + +chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemm.acml : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.atlas : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.mkl : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemm #################################################### + +zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemm.acml : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.atlas : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.mkl : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cherk #################################################### + +cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cherk.acml : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.atlas : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.mkl : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zherk #################################################### + +zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zherk.acml : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.atlas : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.mkl : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2k #################################################### + +cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2k.acml : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.atlas : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.mkl : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2k #################################################### + +zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2k.acml : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.atlas : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.mkl : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) 
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemv.acml : 
zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + 
+ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sger #################################################### +sger.goto : sger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm + +sger.acml : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.atlas : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.mkl : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dger #################################################### +dger.goto : dger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dger.acml : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.atlas : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.mkl : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cger #################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + 
+zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymv #################################################### +ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymv.acml : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.atlas : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.mkl : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.veclib : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymv.acml : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.atlas : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.mkl : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.veclib : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymv #################################################### +csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + 
+csymv.acml : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.atlas : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.mkl : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.veclib : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymv #################################################### +zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymv.acml : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.atlas : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.mkl : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.veclib : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgeev #################################################### +sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgeev.acml : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.atlas : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.mkl : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.veclib : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgeev #################################################### +dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
-lm + +dgeev.acml : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.atlas : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.mkl : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgeev #################################################### + +cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgeev.acml : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.atlas : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.mkl : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgeev #################################################### + +zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgeev.acml : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.atlas : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.mkl : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgetri #################################################### +sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm + +sgetri.acml : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.atlas : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.mkl : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgetri #################################################### +dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgetri.acml : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.atlas : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.mkl : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgetri #################################################### + +cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgetri.acml : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.atlas : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.mkl : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgetri #################################################### + +zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) + $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgetri.acml : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.atlas : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.mkl : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Spotrf #################################################### +spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +spotrf.acml : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.atlas : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.mkl : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.veclib : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dpotrf #################################################### +dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dpotrf.acml : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.atlas : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.mkl : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cpotrf #################################################### + +cpotrf.goto : 
cpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cpotrf.acml : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.atlas : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.mkl : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zpotrf #################################################### + +zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zpotrf.acml : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.atlas : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.mkl : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemv #################################################### + +chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv 
#################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### + +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### 
Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.veclib : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot 
#################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Srot 
#################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot 
#################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpy 
#################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### 
Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.veclib : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.veclib : dasum.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) 
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : 
zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Cgemm3m #################################################### + +cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm3m.mkl : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm3m #################################################### + +zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm3m.mkl : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## 
ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMAX ############################################## +camax.goto : camax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMAX ############################################## +zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX 
############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Sscnrm2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Ddznrm2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +################################################################################################### + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c 
-UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c 
-UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ssymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) 
-c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +spotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : zdot.c + $(CC) 
$(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) 
$^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + + +cgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismin.$(SUFFIX) : imin.c + $(CC) 
$(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smallscaling: smallscaling.c ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread + +clean :: + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling + +include $(TOPDIR)/Makefile.tail + diff --git a/benchmark/amax.c b/benchmark/amax.c new file mode 100644 index 000000000..32f55ce83 --- /dev/null +++ b/benchmark/amax.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef AMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMAX BLASFUNC(dzamax) +#else +#define AMAX BLASFUNC(scamax) +#endif +#else +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 
0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef AMIN + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMIN BLASFUNC(dzamin) +#else +#define AMIN BLASFUNC(scamin) +#endif +#else +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + 
exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef IAMIN + +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMIN BLASFUNC(izamin) +#else +#define IAMIN BLASFUNC(icamin) +#endif +#else +#ifdef DOUBLE +#define IAMIN BLASFUNC(idamin) +#else +#define IAMIN BLASFUNC(isamin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; 
/*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef IMAX + +#ifndef COMPLEX +#ifdef DOUBLE +#define IMAX BLASFUNC(idmax) +#else +#define IMAX BLASFUNC(ismax) +#endif 
+#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * 
COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef IMIN + +#ifndef COMPLEX +#ifdef DOUBLE +#define IMIN BLASFUNC(idmin) +#else +#define IMIN BLASFUNC(ismin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + 
argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef NAMAX + +#ifndef COMPLEX +#ifdef DOUBLE +#define NAMAX BLASFUNC(dmax) +#else +#define NAMAX BLASFUNC(smax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + 
+ address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef NAMIN + +#ifndef COMPLEX +#ifdef DOUBLE +#define NAMIN BLASFUNC(dmin) +#else +#define NAMIN BLASFUNC(smin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = 
(long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Mon, 9 Mar 2020 15:36:50 +0800 Subject: [PATCH 050/593] Fix the functional bugs for zamax. 
--- kernel/arm64/zamax.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/zamax.S b/kernel/arm64/zamax.S index c2c0a5374..3b49c4fff 100644 --- a/kernel/arm64/zamax.S +++ b/kernel/arm64/zamax.S @@ -137,10 +137,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fadd v3.2d, v3.2d, v4.2d #if defined(USE_MIN) fmin v1.2d, v1.2d, v3.2d - fminp MAXF, v1.2d + fminp TMPF, v1.2d #else fmax v1.2d, v1.2d, v3.2d - fmaxp MAXF, v1.2d + fmaxp TMPF, v1.2d #endif #endif fcmp MAXF, TMPF From ff40a4e7262d0e401c8c57032b0caf1058043d94 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Tue, 10 Mar 2020 14:22:18 +0800 Subject: [PATCH 051/593] Add benchmark for SPMV --- benchmark/Makefile | 698 +++++---------------------------------------- benchmark/spmv.c | 219 ++++++++++++++ interface/Makefile | 4 +- 3 files changed, 300 insertions(+), 621 deletions(-) create mode 100644 benchmark/spmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 2db873e95..ccddb55e8 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + - -# Atlas Ubuntu +# Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm @@ -56,16 +56,11 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ + 
srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -73,36 +68,26 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -110,36 +95,26 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml 
zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -147,37 +122,27 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ 
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -185,36 +150,27 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto 
\ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -222,14 +178,11 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -240,7 +193,6 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ smax.goto dmax.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -248,16 +200,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ 
saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -265,36 +211,26 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -302,39 +238,29 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas 
zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -342,20 +268,16 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sspmv.mkl 
dspmv.mkl cspmv.mkl zspmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl @@ -373,16 +295,10 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ @@ -390,20 +306,16 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - chpmv.veclib zhpmv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sspmv.veclib dspmv.veclib cspmv.veclib zspmv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ 
- ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib goto_3m :: cgemm3m.goto zgemm3m.goto @@ -872,130 +784,6 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ssyr #################################################### -ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr.acml : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.atlas : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.mkl : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.veclib : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr #################################################### -dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr.acml : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.atlas : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.mkl : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.veclib : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr #################################################### -sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr.acml : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-sspr.atlas : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.mkl : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.veclib : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr #################################################### -dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr.acml : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.atlas : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.mkl : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.veclib : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr2 #################################################### -sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr2.acml : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.atlas : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.mkl : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.veclib : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr2 #################################################### -dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr2.acml : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.atlas 
: dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.mkl : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.veclib : dspr2.$(SUFFIX) - -##################################### Ssyr2 #################################################### -ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2.acml : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.atlas : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.mkl : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.veclib : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr2 #################################################### -dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2.acml : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.atlas : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.mkl : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.veclib : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -1231,202 +1019,135 @@ zher2k.mkl : zher2k.$(SUFFIX) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sgemv #################################################### -sgemv.goto : 
sgemv.$(SUFFIX) ../$(LIBNAME) +##################################### Sspmv #################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -sgemv.acml : sgemv.$(SUFFIX) +sspmv.acml : sspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sgemv.atlas : sgemv.$(SUFFIX) +sspmv.atlas : sspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sgemv.mkl : sgemv.$(SUFFIX) +sspmv.mkl : sspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sgemv.veclib : sgemv.$(SUFFIX) +sspmv.veclib : sspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -dgemv.acml : dgemv.$(SUFFIX) +dspmv.acml : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dgemv.atlas : dgemv.$(SUFFIX) +dspmv.atlas : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dgemv.mkl : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.veclib : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemv.acml : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-cgemv.atlas : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.mkl : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.veclib : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemv.acml : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.atlas : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.mkl : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.veclib : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) +dspmv.mkl : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -strmv.veclib : strmv.$(SUFFIX) +dspmv.veclib : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) +##################################### Cspmv #################################################### +cspmv.goto : 
cspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -dtrmv.acml : dtrmv.$(SUFFIX) +cspmv.acml : cspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dtrmv.atlas : dtrmv.$(SUFFIX) +cspmv.atlas : cspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dtrmv.mkl : dtrmv.$(SUFFIX) +cspmv.mkl : cspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dtrmv.veclib : dtrmv.$(SUFFIX) +cspmv.veclib : cspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ctrmv #################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) +##################################### Zspmv #################################################### +zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -ctrmv.acml : ctrmv.$(SUFFIX) +zspmv.acml : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ctrmv.atlas : ctrmv.$(SUFFIX) +zspmv.atlas : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ctrmv.mkl : ctrmv.$(SUFFIX) +zspmv.mkl : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ctrmv.veclib : ctrmv.$(SUFFIX) +zspmv.veclib : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -strsv.acml : strsv.$(SUFFIX) +sgemv.acml : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -strsv.atlas : strsv.$(SUFFIX) +sgemv.atlas : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -strsv.mkl : strsv.$(SUFFIX) +sgemv.mkl : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -strsv.veclib : strsv.$(SUFFIX) +sgemv.veclib : sgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -dtrsv.acml : dtrsv.$(SUFFIX) +dgemv.acml : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dtrsv.atlas : dtrsv.$(SUFFIX) +dgemv.atlas : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dtrsv.mkl : dtrsv.$(SUFFIX) +dgemv.mkl : dgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dtrsv.veclib : dtrsv.$(SUFFIX) +dgemv.veclib : dgemv.$(SUFFIX) 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ctrsv #################################################### +##################################### Cgemv #################################################### -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -ctrsv.acml : ctrsv.$(SUFFIX) +cgemv.acml : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ctrsv.atlas : ctrsv.$(SUFFIX) +cgemv.atlas : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ctrsv.mkl : ctrsv.$(SUFFIX) +cgemv.mkl : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ctrsv.veclib : ctrsv.$(SUFFIX) +cgemv.veclib : cgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ztrsv #################################################### +##################################### Zgemv #################################################### -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -ztrsv.acml : ztrsv.$(SUFFIX) +zgemv.acml : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ztrsv.atlas : ztrsv.$(SUFFIX) +zgemv.atlas : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ztrsv.mkl : ztrsv.$(SUFFIX) +zgemv.mkl : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -ztrsv.veclib : ztrsv.$(SUFFIX) +zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sger #################################################### 
@@ -1788,70 +1509,7 @@ zhemv.mkl : zhemv.$(SUFFIX) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1948,69 +1606,6 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) 
../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) 
../$(LIBNAME) @@ -2078,72 +1673,7 @@ zaxpy.mkl : zaxpy.$(SUFFIX) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Saxpby #################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : 
caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2533,7 +2063,7 @@ ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) ############################################## IDMAX ############################################## idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - + ############################################## ISAMIN ############################################## isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2709,30 +2239,6 @@ ctrsm.$(SUFFIX) : trsm.c ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) 
$(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2775,40 +2281,28 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -sgemv.$(SUFFIX) : gemv.c +sspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ -dgemv.$(SUFFIX) : gemv.c +dspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -cgemv.$(SUFFIX) : gemv.c +cspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zgemv.$(SUFFIX) : gemv.c +zspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c +sgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ -dtrsv.$(SUFFIX) : trsv.c +dgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -ctrsv.$(SUFFIX) : trsv.c +cgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -ztrsv.$(SUFFIX) : trsv.c +zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sger.$(SUFFIX) : ger.c @@ -2878,18 +2372,6 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o 
$(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2922,18 +2404,6 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - scopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -3003,17 +2473,7 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -csrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ -drotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3096,7 +2556,6 @@ camin.$(SUFFIX) : amin.c zamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - smin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -3104,6 +2563,7 @@ dmin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + snrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/spmv.c b/benchmark/spmv.c new file mode 100644 index 000000000..12a33e298 --- /dev/null +++ b/benchmark/spmv.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SPMV + + +#ifndef COMPLEX + +#ifdef DOUBLE +#define SPMV BLASFUNC(dspmv) +#else +#define SPMV BLASFUNC(sspmv) +#endif + +#else + +#ifdef DOUBLE +#define SPMV BLASFUNC(zspmv) +#else +#define SPMV BLASFUNC(cspmv) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + 
double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Tue, 10 Mar 2020 14:32:18 +0800 Subject: [PATCH 052/593] Modify Makefile in Benchmark --- benchmark/Makefile | 712 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 668 insertions(+), 44 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index ccddb55e8..fdc57819f 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - -# Atlas Ubuntu + +# Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a 
$(ATLAS)/libatlas.a -lgfortran -lm @@ -56,11 +56,16 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ - srot.goto drot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -68,26 +73,37 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml 
csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -95,26 +111,37 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ 
-122,27 +149,38 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -150,27 +188,37 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sspmv.mkl dspmv.mkl cspmv.mkl 
zspmv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -178,11 +226,15 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ - sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -193,6 +245,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ smax.goto dmax.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto 
$(GOTO_LAPACK_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -200,10 +253,16 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -211,26 +270,37 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas 
dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -238,29 +308,40 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + 
srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -268,16 +349,21 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl @@ -295,10 +381,16 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ @@ -306,16 +398,21 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib 
csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ sspmv.veclib dspmv.veclib cspmv.veclib zspmv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib goto_3m :: cgemm3m.goto zgemm3m.goto @@ -784,6 +881,130 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : 
sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -1019,6 +1240,72 @@ zher2k.mkl : zher2k.$(SUFFIX) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemv #################################################### + 
+cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemv.acml : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sspmv #################################################### sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1083,71 +1370,136 @@ zspmv.mkl : zspmv.$(SUFFIX) zspmv.veclib : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) +strmv.acml : strmv.$(SUFFIX) 
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -sgemv.acml : sgemv.$(SUFFIX) +dtrmv.acml : dtrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sgemv.atlas : sgemv.$(SUFFIX) +dtrmv.atlas : dtrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sgemv.mkl : sgemv.$(SUFFIX) +dtrmv.mkl : dtrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sgemv.veclib : sgemv.$(SUFFIX) +dtrmv.veclib : dtrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -dgemv.acml : dgemv.$(SUFFIX) +ctrmv.acml : ctrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dgemv.atlas : dgemv.$(SUFFIX) +ctrmv.atlas : ctrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dgemv.mkl : dgemv.$(SUFFIX) +ctrmv.mkl : ctrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dgemv.veclib : dgemv.$(SUFFIX) 
+ctrmv.veclib : ctrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Cgemv #################################################### +##################################### Ztrmv #################################################### -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -cgemv.acml : cgemv.$(SUFFIX) +ztrmv.acml : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -cgemv.atlas : cgemv.$(SUFFIX) +ztrmv.atlas : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -cgemv.mkl : cgemv.$(SUFFIX) +ztrmv.mkl : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -cgemv.veclib : cgemv.$(SUFFIX) +ztrmv.veclib : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zgemv #################################################### +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) 
$^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -zgemv.acml : zgemv.$(SUFFIX) +dtrsv.acml : dtrsv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zgemv.atlas : zgemv.$(SUFFIX) +dtrsv.atlas : dtrsv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zgemv.mkl : zgemv.$(SUFFIX) +dtrsv.mkl : dtrsv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zgemv.veclib : zgemv.$(SUFFIX) +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sger 
#################################################### @@ -1509,7 +1861,70 @@ zhemv.mkl : zhemv.$(SUFFIX) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### + +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1606,6 +2021,69 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot 
#################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy 
#################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) @@ -1673,7 +2151,72 @@ zaxpy.mkl : zaxpy.$(SUFFIX) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2063,7 +2606,7 @@ ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) ############################################## IDMAX ############################################## idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - + ############################################## ISAMIN ############################################## isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2239,6 +2782,30 @@ ctrsm.$(SUFFIX) : trsm.c ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + 
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2281,6 +2848,18 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2293,16 +2872,28 @@ cspmv.$(SUFFIX) : spmv.c zspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -sgemv.$(SUFFIX) : gemv.c +strmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ -dgemv.$(SUFFIX) : gemv.c +dtrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -cgemv.$(SUFFIX) : gemv.c +ctrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zgemv.$(SUFFIX) : gemv.c +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sger.$(SUFFIX) : ger.c @@ -2372,6 +2963,18 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c 
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2404,6 +3007,18 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + scopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2473,7 +3088,17 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ @@ -2556,6 +3181,7 @@ camin.$(SUFFIX) : amin.c zamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + smin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2563,7 +3189,6 @@ dmin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - snrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2584,4 +3209,3 @@ clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling include $(TOPDIR)/Makefile.tail - From 3f7f7ab7e266cb0e951528d699fa5910af0908fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: 
Tue, 10 Mar 2020 12:51:07 +0100 Subject: [PATCH 053/593] Restore INTERFACE64 for arm64 --- Makefile.system | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile.system b/Makefile.system index 829c08f16..11cb5b3a0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -635,6 +635,16 @@ endif ifeq ($(ARCH), arm64) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fdefault-integer-8 +endif +ifeq ($(F_COMPILER), FLANG) +FCOMMON_OPT += -i8 +endif +endif +endif endif From b25ae1fc602923cbc134338e619606330fe5fd7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Mar 2020 13:37:41 +0100 Subject: [PATCH 054/593] Apply fix for Reference-LAPACK issue 394 reference to XERBLA extending beyond column 72, breaking builds with compilers that default to traditional punch card format --- lapack-netlib/SRC/sorhr_col.f | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/sorhr_col.f b/lapack-netlib/SRC/sorhr_col.f index 38976245c..9aef57b26 100644 --- a/lapack-netlib/SRC/sorhr_col.f +++ b/lapack-netlib/SRC/sorhr_col.f @@ -282,7 +282,8 @@ $ NPLUSONE * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, XERBLA + EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, + $XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC MAX, MIN @@ -436,4 +437,4 @@ * * End of SORHR_COL * - END \ No newline at end of file + END From cd8871f1a1d6022d83bb8190c17779658f6a0e5f Mon Sep 17 00:00:00 2001 From: s00527847 Date: Tue, 10 Mar 2020 19:26:06 -0400 Subject: [PATCH 055/593] Use the correct unit of measure --- benchmark/spr.c | 2 +- benchmark/spr2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/spr.c b/benchmark/spr.c index 61a972c08..c91e587b1 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -187,7 +187,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes %10.6f sec\n", + " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index 251543fa9..e8ee345d7 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -196,7 +196,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes %10.6f sec\n", + " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. 
* (double)m * (double)m / timeg * 1.e-6, timeg); } From 2f4c5bb3a98eb1956cc24221d909b1547a7e4eff Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 10:30:09 +0800 Subject: [PATCH 056/593] Update spmv.c: solve segmentation fault when m and n are larger than 50000 --- benchmark/spmv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/spmv.c b/benchmark/spmv.c index 12a33e298..2a26c9416 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -178,7 +178,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } From 3e8f1c6cc5586da2e399b8f811ec24129aa21304 Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Wed, 11 Mar 2020 12:31:48 +0800 Subject: [PATCH 057/593] [OpenBlas]:Add benchmark tpmv.c and modify Makefile [Description]:Solve the problem of missing tpmv.c benchmark file --- benchmark/Makefile | 87 +++++++++++++++++++++++ benchmark/tpmv.c | 172 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 benchmark/tpmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 2db873e95..5e0c96c25 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -80,6 +80,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ @@ -117,6 +118,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml 
ztpmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -155,6 +157,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -192,6 +195,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -229,6 +233,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ @@ -272,6 +277,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -310,6 +316,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ 
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -349,6 +356,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -397,6 +405,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ @@ -1363,6 +1372,72 @@ ztrmv.mkl : ztrmv.$(SUFFIX) ztrmv.veclib : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpmv #################################################### +stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpmv.acml : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.atlas : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.mkl : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.veclib : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpmv #################################################### +dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpmv.acml : dtpmv.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.atlas : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.mkl : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.veclib : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpmv #################################################### + +ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpmv.acml : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.atlas : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.mkl : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.veclib : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpmv #################################################### + +ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpmv.acml : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.atlas : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.mkl : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.veclib : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Strsv #################################################### strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2799,6 +2874,18 @@ ctrmv.$(SUFFIX) 
: trmv.c ztrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +stpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + strsv.$(SUFFIX) : trsv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c new file mode 100644 index 000000000..ee5b97f24 --- /dev/null +++ b/benchmark/tpmv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TPMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TPMV BLASFUNC(dtpmv) +#else +#define TPMV BLASFUNC(stpmv) +#endif + +#else + +#ifdef DOUBLE +#define TPMV BLASFUNC(ztpmv) +#else +#define TPMV BLASFUNC(ctpmv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint n, 
i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. 
* (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 649733ff151f148513e4da2b6d15af856bfe3c33 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 15:48:58 +0800 Subject: [PATCH 058/593] Only keep spmv.goto and spmv.atlas --- benchmark/Makefile | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index fdc57819f..03145d1ad 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -117,7 +117,6 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ @@ -194,7 +193,6 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ @@ -276,7 +274,6 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ @@ -355,7 +352,6 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ @@ 
-404,7 +400,6 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - sspmv.veclib dspmv.veclib cspmv.veclib zspmv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ @@ -1310,66 +1305,30 @@ zgemv.veclib : zgemv.$(SUFFIX) sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -sspmv.acml : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - sspmv.atlas : sspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sspmv.mkl : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspmv.veclib : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Dspmv #################################################### dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -dspmv.acml : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - dspmv.atlas : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dspmv.mkl : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspmv.veclib : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Cspmv #################################################### cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -cspmv.acml : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - cspmv.atlas : cspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -cspmv.mkl : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cspmv.veclib : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Zspmv #################################################### zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -zspmv.acml : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - zspmv.atlas : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zspmv.mkl : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zspmv.veclib : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Strmv #################################################### strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm From 83ecf9fea7c4e727042181ed5b985d5be05775f7 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 16:36:45 +0800 Subject: [PATCH 059/593] Modify Makefile in interface to remove the error occured in travis CI --- interface/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index e25e5ccfc..3f0dcca28 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -86,7 +86,7 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + csbmv.$(SUFFIX) \ cspr2.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ @@ -115,7 +115,7 @@ ZBLAS2OBJS = 
\ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zsbmv.$(SUFFIX) \ zspr2.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ From ae3f2c2e491193de9082001ef4c6a870863fa7d9 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 17:02:34 +0800 Subject: [PATCH 060/593] Remove cspmv and zspmv to remove the error occured in travis CI --- benchmark/Makefile | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 03145d1ad..2528e74ad 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -79,7 +79,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ @@ -155,7 +155,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ @@ -230,7 +230,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ @@ -312,7 +312,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas 
zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ @@ -1315,20 +1315,6 @@ dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) dspmv.atlas : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Cspmv #################################################### -cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cspmv.atlas : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zspmv #################################################### -zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zspmv.atlas : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Strmv #################################################### strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2825,12 +2811,6 @@ sspmv.$(SUFFIX) : spmv.c dspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -cspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - strmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ From a0a3bf7c81ed7bbd98c8e6effd0af2ecfc15577b Mon Sep 17 00:00:00 2001 From: l00546269 Date: Fri, 13 Mar 2020 10:58:39 +0800 Subject: [PATCH 061/593] [OpenBLAS]:fix the iamax benchmark error [Description]:the result for i?amax is not MFlops, it 
is MBytes --- benchmark/iamax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/iamax.c b/benchmark/iamax.c index 034e24ea9..736f02b89 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -181,7 +181,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", + " %10.2f MBytes %10.6f sec\n", COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } From c436e8af7b0e985a3131075071e023d4a0efea8b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Mar 2020 20:10:26 +0100 Subject: [PATCH 062/593] Do not attempt to run ctest without fortran The main Makefile takes care of this in the build process, but users or CI jobs may try to run this directly --- ctest/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ctest/Makefile b/ctest/Makefile index f562c9bb3..6f5b65142 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -40,8 +40,11 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o - +ifeq ($(NOFORTRAN),1) +all :: +else all :: all1 all2 all3 +endif all1: xscblat1 xdcblat1 xccblat1 xzcblat1 ifndef CROSS From 2d8781b0dc1af15eeb73dfff92e5617088a1fb1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Mar 2020 20:11:19 +0100 Subject: [PATCH 063/593] Do not attempt to run test without fortran --- test/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/Makefile b/test/Makefile index 074411b05..7a873b7e5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,7 +1,12 @@ TOPDIR = .. 
include ../Makefile.system + +ifeq ($(NOFORTRAN),1) +all :: +else all :: level1 level2 level3 +endif level1 : sblat1 dblat1 cblat1 zblat1 ifndef CROSS From ee2e758278b5d82b7242f505ea694f082ef65879 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Mar 2020 20:34:13 +0100 Subject: [PATCH 064/593] Move declarations of lapack_complex_custom types outside the extern C fixes #2510 --- lapack-netlib/LAPACKE/include/lapack.h | 44 ++++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 0a6226fe4..36e53ec24 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,27 +12,6 @@ #include -#ifdef __cplusplus -extern "C" { -#endif - -/*----------------------------------------------------------------------------*/ -#ifndef lapack_int -#define lapack_int int -#endif - -#ifndef lapack_logical -#define lapack_logical lapack_int -#endif - -/* f2c, hence clapack and MacOS Accelerate, returns double instead of float - * for sdot, slange, clange, etc. */ -#if defined(LAPACK_F2C) - typedef double lapack_float_return; -#else - typedef float lapack_float_return; -#endif - /* Complex types are structures equivalent to the * Fortran complex types COMPLEX(4) and COMPLEX(8). * @@ -88,6 +67,29 @@ extern "C" { #endif /* LAPACK_COMPLEX_CUSTOM */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*----------------------------------------------------------------------------*/ +#ifndef lapack_int +#define lapack_int int +#endif + +#ifndef lapack_logical +#define lapack_logical lapack_int +#endif + +/* f2c, hence clapack and MacOS Accelerate, returns double instead of float + * for sdot, slange, clange, etc. 
*/ +#if defined(LAPACK_F2C) + typedef double lapack_float_return; +#else + typedef float lapack_float_return; +#endif + + /* Callback logical functions of one, two, or three arguments are used * to select eigenvalues to sort to the top left of the Schur form. * The value is selected if function returns TRUE (non-zero). */ From 2428dc9fd3a032738f6dbe5728caadd55278765e Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Sat, 14 Mar 2020 09:11:08 +0800 Subject: [PATCH 065/593] [OpenBlas]: Add benchmark tpsv file and modify benchmark/Makefile [Description]: Solve lack of tpsv benchmark. --- benchmark/Makefile | 88 +++++++++++++++++++++++ benchmark/tpsv.c | 172 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 benchmark/tpsv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 5e0c96c25..116515b01 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -81,6 +81,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ @@ -119,6 +120,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -158,6 +160,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas 
dtpsv.atlas ctpsv.atlas ztpsv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -196,6 +199,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -234,6 +238,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ @@ -278,6 +283,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -317,6 +323,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -357,6 +364,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + 
stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -406,6 +414,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ + stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ @@ -1438,6 +1447,73 @@ ztpmv.mkl : ztpmv.$(SUFFIX) ztpmv.veclib : ztpmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpsv #################################################### +stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpsv.acml : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.atlas : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.mkl : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.veclib : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpsv #################################################### +dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpsv.acml : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.atlas : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.mkl : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.veclib : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpsv #################################################### + +ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpsv.acml : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.atlas : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.mkl : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.veclib : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpsv #################################################### + +ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpsv.acml : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.atlas : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.mkl : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.veclib : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strsv #################################################### strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2886,6 +2962,18 @@ ctpmv.$(SUFFIX) : tpmv.c ztpmv.$(SUFFIX) : tpmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +stpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + 
+ctpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + strsv.$(SUFFIX) : trsv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c new file mode 100644 index 000000000..46d78fd17 --- /dev/null +++ b/benchmark/tpsv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#include "common.h" + +#undef TPSV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TPSV BLASFUNC(dtpsv) +#else +#define TPSV BLASFUNC(stpsv) +#endif + +#else + +#ifdef DOUBLE +#define TPSV BLASFUNC(ztpsv) +#else +#define TPSV BLASFUNC(ctpsv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT
*)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. 
* (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From d45c53ecf1f9aef0833643f7c16140ffa4d6b60d Mon Sep 17 00:00:00 2001 From: l00536773 Date: Mon, 16 Mar 2020 11:19:05 +0800 Subject: [PATCH 066/593] [OpenBLAS]: benchmark for her/her2 LEVEL2 functions [description]: benchmark for her/her2 [solution]: added benchmark for her/her2, modified makefile in benchmark [dts]: --- benchmark/Makefile | 98 +++++++++++++++++++++++ benchmark/her.c | 186 ++++++++++++++++++++++++++++++++++++++++++++ benchmark/her2.c | 190 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 474 insertions(+) create mode 100644 benchmark/her.c create mode 100644 benchmark/her2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index df17366d7..efb4da5ea 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -78,6 +78,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ @@ -117,6 +119,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ @@ -156,6 +160,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ @@ -195,6 +201,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl 
zlinpack.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ @@ -233,6 +241,8 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ @@ -278,6 +288,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ @@ -317,6 +329,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ @@ -358,6 +372,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ @@ -407,6 +423,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ + cher.veclib zher.veclib \ + cher2.veclib zher2.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ @@ 
-1244,6 +1262,74 @@ zher2k.mkl : zher2k.$(SUFFIX) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Cher #################################################### + +cher.goto : cher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher.acml : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.atlas : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.mkl : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.veclib : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher #################################################### + +zher.goto : zher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher.acml : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.atlas : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.mkl : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.veclib : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2 #################################################### + +cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2.acml : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.atlas : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.mkl : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.veclib : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2 #################################################### + +zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2.acml : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.atlas : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.mkl : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.veclib : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2868,6 +2954,18 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +cher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/her.c b/benchmark/her.c new file mode 100644 index 000000000..f4e10b684 --- /dev/null +++ b/benchmark/her.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#include "common.h" + + +#undef HER + + +#ifdef DOUBLE +#define HER BLASFUNC(zher) +#else +#define HER BLASFUNC(cher) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ /* HER benchmark driver: for each m in [from, to] (argv: from, to, step) fills A and x with random values, times one HER call and prints MFlops to stderr. OPENBLAS_UPLO / OPENBLAS_TRANS override uplo / trans; trans is only echoed in the banner. */ + + FLOAT *a, *x; + FLOAT alpha[] = {1.0, 1.0}; + blasint incx = 1; + char *p; + + char uplo='U'; + char trans='N'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--;
argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + fprintf(stderr, " %6d : ", (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + for(i = 0; i < COMPSIZE; i++) x[(long)j * COMPSIZE + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; /* fill every component of entry j, not just the first, so HER never reads uninitialized memory */ + } + + gettimeofday( &start, (struct timezone *)0); + + HER (&uplo, &m, alpha, x, &incx, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); /* NOTE(review): this timestamp is overwritten before it is ever read */ + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} diff --git a/benchmark/her2.c b/benchmark/her2.c new file mode 100644 index 000000000..e10b7e98e --- /dev/null +++ b/benchmark/her2.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2.
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#include "common.h" + + +#undef HER2 + + +#ifdef DOUBLE +#define HER2 BLASFUNC(zher2) +#else +#define HER2 BLASFUNC(cher2) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ /* HER2 benchmark driver: for each m in [from, to] (argv: from, to, step) fills A, x and y with random values, times one HER2 call and prints MFlops to stderr. OPENBLAS_UPLO / OPENBLAS_TRANS override uplo / trans; trans is only echoed in the banner. */ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + blasint inc = 1; + char *p; + + char uplo='U'; + char trans='N'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv);
argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + fprintf(stderr, " %6d : ", (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + for(i = 0; i < COMPSIZE; i++) x[(long)j * COMPSIZE + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; /* fill every component of entry j, not just the first */ + for(i = 0; i < COMPSIZE; i++) y[(long)j * COMPSIZE + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; /* same for y, so HER2 never reads uninitialized memory */ + } + + gettimeofday( &start, (struct timezone *)0); + + + HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); /* NOTE(review): this timestamp is overwritten before it is ever read */ + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2.
* (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} From fa049d49c2b8aa26e9de09d328a0f31c3810d145 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 17 Mar 2020 00:34:08 +0800 Subject: [PATCH 067/593] AVX2 STRSM kernel --- kernel/x86_64/KERNEL.HASWELL | 8 +- kernel/x86_64/strsm_kernel_8x4_haswell_LN.c | 240 +++++++++++++++ kernel/x86_64/strsm_kernel_8x4_haswell_LT.c | 228 ++++++++++++++ .../strsm_kernel_8x4_haswell_L_common.h | 187 ++++++++++++ kernel/x86_64/strsm_kernel_8x4_haswell_RN.c | 279 +++++++++++++++++ kernel/x86_64/strsm_kernel_8x4_haswell_RT.c | 281 ++++++++++++++++++ .../strsm_kernel_8x4_haswell_R_common.h | 226 ++++++++++++++ 7 files changed, 1445 insertions(+), 4 deletions(-) create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_LN.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_LT.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_RN.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_RT.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index f6ca5c2d5..ef8b36a57 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -77,10 +77,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c +STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c +STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c +STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git
a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c new file mode 100644 index 000000000..4131debb1 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -0,0 +1,240 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_L_common.h" + +#define SOLVE_LN_m1n4 \ + "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\ + SOLVE_m1n4(-4,4) SAVE_b_m1n4(-16,4)\ + "movq %2,%3;" save_c_m1n4(4) + +#define SOLVE_LN_m1n8 \ + "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\ + SOLVE_m1n8(-4,4,5) SAVE_b_m1n8(-16,4,5)\ + "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) + +#define SOLVE_LN_m1n12 \ + "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\ + SOLVE_m1n12(-4,4,5,6) SAVE_b_m1n12(-16,4,5,6)\ + "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6) + +#define SOLVE_LN_m2n4 \ + "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\ + SOLVE_loup_m2n4(-8,4)\ + SOLVE_up_m2n4(-16,4) SAVE_b_m2n4(-32,4)\ + "movq %2,%3;" save_c_m2n4(4) + +#define SOLVE_LN_m2n8 \ + "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\ + SOLVE_loup_m2n8(-8,4,5)\ + SOLVE_up_m2n8(-16,4,5) SAVE_b_m2n8(-32,4,5)\ + "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) + +#define SOLVE_LN_m2n12 \ + "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\ + SOLVE_loup_m2n12(-8,4,5,6)\ + SOLVE_up_m2n12(-16,4,5,6) SAVE_b_m2n12(-32,4,5,6)\ + "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6) + +#define SOLVE_LN_m4n4 \ + "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\ +\ + SOLVE_loup_m2n4(-8,5) SUBTRACT_m2n4(-16,4)\ + SOLVE_up_m2n4(-24,5) SUBTRACT_m2n4(-32,4) SAVE_b_m2n4(-32,5)\ +\ + SOLVE_loup_m2n4(-48,4)\ + SOLVE_up_m2n4(-64,4) SAVE_b_m2n4(-64,4)\ +\ + "movq %2,%3;" save_c_m4n4(4,5) + +#define SOLVE_LN_m4n8 \ + "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) 
GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\ +\ + SOLVE_loup_m2n8(-8,5,7) SUBTRACT_m2n8(-16,4,6)\ + SOLVE_up_m2n8(-24,5,7) SUBTRACT_m2n8(-32,4,6) SAVE_b_m2n8(-32,5,7)\ +\ + SOLVE_loup_m2n8(-48,4,6)\ + SOLVE_up_m2n8(-64,4,6) SAVE_b_m2n8(-64,4,6)\ +\ + "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) + +#define SOLVE_LN_m4n12 \ + "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\ +\ + SOLVE_loup_m2n12(-8,5,7,9) SUBTRACT_m2n12(-16,4,6,8)\ + SOLVE_up_m2n12(-24,5,7,9) SUBTRACT_m2n12(-32,4,6,8) SAVE_b_m2n12(-32,5,7,9)\ +\ + SOLVE_loup_m2n12(-48,4,6,8)\ + SOLVE_up_m2n12(-64,4,6,8) SAVE_b_m2n12(-64,4,6,8)\ +\ + "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9) + +#define SOLVE_LN_m8n4 \ + "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32)\ +\ + SOLVE_loup_m2n4(-8,7) SUBTRACT_m2n4(-16,6) SUBTRACT_m2n4(-24,5) SUBTRACT_m2n4(-32,4)\ + SOLVE_up_m2n4(-40,7) SUBTRACT_m2n4(-48,6) SUBTRACT_m2n4(-56,5) SUBTRACT_m2n4(-64,4) SAVE_b_m2n4(-32,7)\ +\ + SOLVE_loup_m2n4(-80,6) SUBTRACT_m2n4(-88,5) SUBTRACT_m2n4(-96,4)\ + SOLVE_up_m2n4(-112,6) SUBTRACT_m2n4(-120,5) SUBTRACT_m2n4(-128,4) SAVE_b_m2n4(-64,6)\ +\ + SOLVE_loup_m2n4(-152,5) SUBTRACT_m2n4(-160,4)\ + SOLVE_up_m2n4(-184,5) SUBTRACT_m2n4(-192,4) SAVE_b_m2n4(-96,5)\ +\ + SOLVE_loup_m2n4(-224,4)\ + SOLVE_up_m2n4(-256,4) SAVE_b_m2n4(-128,4)\ +\ + "movq %2,%3;" save_c_m8n4(4,5,6,7) + +#define SOLVE_LN_m8n8 \ + "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32)\ +\ + SOLVE_loup_m2n8(-8,7,11) SUBTRACT_m2n8(-16,6,10) SUBTRACT_m2n8(-24,5,9) SUBTRACT_m2n8(-32,4,8)\ + SOLVE_up_m2n8(-40,7,11) SUBTRACT_m2n8(-48,6,10) SUBTRACT_m2n8(-56,5,9) SUBTRACT_m2n8(-64,4,8) SAVE_b_m2n8(-32,7,11)\ +\ + SOLVE_loup_m2n8(-80,6,10) SUBTRACT_m2n8(-88,5,9) SUBTRACT_m2n8(-96,4,8)\ + SOLVE_up_m2n8(-112,6,10) SUBTRACT_m2n8(-120,5,9) SUBTRACT_m2n8(-128,4,8) SAVE_b_m2n8(-64,6,10)\ +\ + SOLVE_loup_m2n8(-152,5,9) 
SUBTRACT_m2n8(-160,4,8)\ + SOLVE_up_m2n8(-184,5,9) SUBTRACT_m2n8(-192,4,8) SAVE_b_m2n8(-96,5,9)\ +\ + SOLVE_loup_m2n8(-224,4,8)\ + SOLVE_up_m2n8(-256,4,8) SAVE_b_m2n8(-128,4,8)\ +\ + "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) + +#define SOLVE_LN_m8n12 \ + "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32) GEMM_SUM_REORDER_8x4(12,13,14,15,-32)\ +\ + SOLVE_loup_m2n12(-8,7,11,15) SUBTRACT_m2n12(-16,6,10,14) SUBTRACT_m2n12(-24,5,9,13) SUBTRACT_m2n12(-32,4,8,12)\ + SOLVE_up_m2n12(-40,7,11,15) SUBTRACT_m2n12(-48,6,10,14) SUBTRACT_m2n12(-56,5,9,13) SUBTRACT_m2n12(-64,4,8,12) SAVE_b_m2n12(-32,7,11,15)\ +\ + SOLVE_loup_m2n12(-80,6,10,14) SUBTRACT_m2n12(-88,5,9,13) SUBTRACT_m2n12(-96,4,8,12)\ + SOLVE_up_m2n12(-112,6,10,14) SUBTRACT_m2n12(-120,5,9,13) SUBTRACT_m2n12(-128,4,8,12) SAVE_b_m2n12(-64,6,10,14)\ +\ + SOLVE_loup_m2n12(-152,5,9,13) SUBTRACT_m2n12(-160,4,8,12)\ + SOLVE_up_m2n12(-184,5,9,13) SUBTRACT_m2n12(-192,4,8,12) SAVE_b_m2n12(-96,5,9,13)\ +\ + SOLVE_loup_m2n12(-224,4,8,12)\ + SOLVE_up_m2n12(-256,4,8,12) SAVE_b_m2n12(-128,4,8,12)\ +\ + "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15) + +/* r13 = k-kk, r14 = b_tail, r15 = a_tail */ + +#define GEMM_LN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,"#mdim"),%%r15; negq %%r12;"\ + "movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 2"#mdim""#ndim"2f;"\ + "2"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 2"#mdim""#ndim"1b;"\ + "2"#mdim""#ndim"2:\n\t" +#define GEMM_LN_m8n4 GEMM_LN_SIMPLE(8,4) +#define GEMM_LN_m8n8 GEMM_LN_SIMPLE(8,8) +#define GEMM_LN_m8n12 \ + "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,8),%%r15; negq %%r12; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 28122f;"\ + "28121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq 
$32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 28121b;"\ + "28122:\n\t"\ + "testq %5,%5; jz 28124f;"\ + "28123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 28123b;"\ + "28124:\n\t" +#define GEMM_LN_m4n4 GEMM_LN_SIMPLE(4,4) +#define GEMM_LN_m4n8 GEMM_LN_SIMPLE(4,8) +#define GEMM_LN_m4n12 GEMM_LN_SIMPLE(4,12) +#define GEMM_LN_m2n4 GEMM_LN_SIMPLE(2,4) +#define GEMM_LN_m2n8 GEMM_LN_SIMPLE(2,8) +#define GEMM_LN_m2n12 GEMM_LN_SIMPLE(2,12) +#define GEMM_LN_m1n4 GEMM_LN_SIMPLE(1,4) +#define GEMM_LN_m1n8 GEMM_LN_SIMPLE(1,8) +#define GEMM_LN_m1n12 GEMM_LN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + c_ptr += M;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; leaq (%1,%%r12,4),%%r14; movq %10,%%r11;"\ + "testq $1,%%r11; jz "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_LN_m1n##ndim SOLVE_LN_m1n##ndim "subq $1,%%r11;"\ + #ndim"772:\n\t"\ + "testq $2,%%r11; jz "#ndim"773f;"\ + GEMM_LN_m2n##ndim SOLVE_LN_m2n##ndim "subq $2,%%r11;"\ + #ndim"773:\n\t"\ + "testq $4,%%r11; jz "#ndim"774f;"\ + GEMM_LN_m4n##ndim SOLVE_LN_m4n##ndim "subq $4,%%r11;"\ + #ndim"774:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"776f;"\ + #ndim"775:\n\t"\ + GEMM_LN_m8n##ndim SOLVE_LN_m8n##ndim "subq $8,%%r11; jnz "#ndim"775b;"\ + #ndim"776:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(kmkkinp),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + 
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr += M * K; b_ptr += (ndim-4) * K; c_ptr += ldc * ndim;\ +} +static void solve_LN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=m-1;i>=0;i--) { + a0 = a[i*m+i]; //reciprocal of the original value + for (j=0;j0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_LN(1,n,a_ptr+(kk-1)*1,sb+(kk-1)*n,c_ptr,ldc); + kk -= 1; + m_count--; + } + if(m_count&2){ + a_ptr-=k*2; c_ptr-=2; + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_LN(2,n,a_ptr+(kk-2)*2,sb+(kk-2)*n,c_ptr,ldc); + kk -= 2; + m_count-=2; + } + if(m_count&4){ + a_ptr-=k*4; c_ptr-=4; + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_LN(4,n,a_ptr+(kk-4)*4,sb+(kk-4)*n,c_ptr,ldc); + kk -= 4; + m_count-=4; + } + for(;m_count>7;m_count-=8){ + a_ptr-=k*8; c_ptr-=8; + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_LN(8,n,a_ptr+(kk-8)*8,sb+(kk-8)*n,c_ptr,ldc); + kk -= 8; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa+m*k, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, kmkkinp = (uint64_t)(k-m-offset), k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,sa,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,sa,b_ptr,c_ptr,ldc,k,offset); + return 0; +} + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_LT.c 
b/kernel/x86_64/strsm_kernel_8x4_haswell_LT.c new file mode 100644 index 000000000..3c7e9e83b --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LT.c @@ -0,0 +1,228 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_L_common.h" + +#define SOLVE_LT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\ + SOLVE_m1n4(0,4) SAVE_b_m1n4(0,4)\ + "movq %2,%3; addq $4,%2;" save_c_m1n4(4) + +#define SOLVE_LT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\ + SOLVE_m1n8(0,4,5) SAVE_b_m1n8(0,4,5)\ + "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) + +#define SOLVE_LT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\ + SOLVE_m1n12(0,4,5,6) SAVE_b_m1n12(0,4,5,6)\ + "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6) + +#define SOLVE_LT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\ + SOLVE_uplo_m2n4(0,4)\ + SOLVE_lo_m2n4(8,4) SAVE_b_m2n4(0,4)\ + "movq %2,%3; addq $8,%2;" save_c_m2n4(4) + +#define SOLVE_LT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\ + SOLVE_uplo_m2n8(0,4,5)\ + SOLVE_lo_m2n8(8,4,5) SAVE_b_m2n8(0,4,5)\ + "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) + +#define SOLVE_LT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\ + SOLVE_uplo_m2n12(0,4,5,6)\ + SOLVE_lo_m2n12(8,4,5,6) SAVE_b_m2n12(0,4,5,6)\ + "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6) + +#define SOLVE_LT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\ +\ + SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5)\ + SOLVE_lo_m2n4(16,4) SUBTRACT_m2n4(24,5) SAVE_b_m2n4(0,4)\ +\ + SOLVE_uplo_m2n4(40,5)\ + SOLVE_lo_m2n4(56,5) SAVE_b_m2n4(32,5)\ +\ + "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) + +#define SOLVE_LT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\ +\ + SOLVE_uplo_m2n8(0,4,6) SUBTRACT_m2n8(8,5,7)\ + 
SOLVE_lo_m2n8(16,4,6) SUBTRACT_m2n8(24,5,7) SAVE_b_m2n8(0,4,6)\ +\ + SOLVE_uplo_m2n8(40,5,7)\ + SOLVE_lo_m2n8(56,5,7) SAVE_b_m2n8(32,5,7)\ +\ + "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) + +#define SOLVE_LT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\ +\ + SOLVE_uplo_m2n12(0,4,6,8) SUBTRACT_m2n12(8,5,7,9)\ + SOLVE_lo_m2n12(16,4,6,8) SUBTRACT_m2n12(24,5,7,9) SAVE_b_m2n12(0,4,6,8)\ +\ + SOLVE_uplo_m2n12(40,5,7,9)\ + SOLVE_lo_m2n12(56,5,7,9) SAVE_b_m2n12(32,5,7,9)\ +\ + "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9) + +#define SOLVE_LT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63)\ +\ + SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5) SUBTRACT_m2n4(16,6) SUBTRACT_m2n4(24,7)\ + SOLVE_lo_m2n4(32,4) SUBTRACT_m2n4(40,5) SUBTRACT_m2n4(48,6) SUBTRACT_m2n4(56,7) SAVE_b_m2n4(0,4)\ +\ + SOLVE_uplo_m2n4(72,5) SUBTRACT_m2n4(80,6) SUBTRACT_m2n4(88,7)\ + SOLVE_lo_m2n4(104,5) SUBTRACT_m2n4(112,6) SUBTRACT_m2n4(120,7) SAVE_b_m2n4(32,5)\ +\ + SOLVE_uplo_m2n4(144,6) SUBTRACT_m2n4(152,7)\ + SOLVE_lo_m2n4(176,6) SUBTRACT_m2n4(184,7) SAVE_b_m2n4(64,6)\ +\ + SOLVE_uplo_m2n4(216,7)\ + SOLVE_lo_m2n4(248,7) SAVE_b_m2n4(96,7)\ +\ + "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) + +#define SOLVE_LT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63)\ +\ + SOLVE_uplo_m2n8(0,4,8) SUBTRACT_m2n8(8,5,9) SUBTRACT_m2n8(16,6,10) SUBTRACT_m2n8(24,7,11)\ + SOLVE_lo_m2n8(32,4,8) SUBTRACT_m2n8(40,5,9) SUBTRACT_m2n8(48,6,10) SUBTRACT_m2n8(56,7,11) SAVE_b_m2n8(0,4,8)\ +\ + SOLVE_uplo_m2n8(72,5,9) SUBTRACT_m2n8(80,6,10) SUBTRACT_m2n8(88,7,11)\ + SOLVE_lo_m2n8(104,5,9) SUBTRACT_m2n8(112,6,10) SUBTRACT_m2n8(120,7,11) SAVE_b_m2n8(32,5,9)\ +\ + SOLVE_uplo_m2n8(144,6,10) SUBTRACT_m2n8(152,7,11)\ + SOLVE_lo_m2n8(176,6,10) SUBTRACT_m2n8(184,7,11) SAVE_b_m2n8(64,6,10)\ +\ + SOLVE_uplo_m2n8(216,7,11)\ + SOLVE_lo_m2n8(248,7,11) 
SAVE_b_m2n8(96,7,11)\ +\ + "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) + +#define SOLVE_LT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63)\ +\ + SOLVE_uplo_m2n12(0,4,8,12) SUBTRACT_m2n12(8,5,9,13) SUBTRACT_m2n12(16,6,10,14) SUBTRACT_m2n12(24,7,11,15)\ + SOLVE_lo_m2n12(32,4,8,12) SUBTRACT_m2n12(40,5,9,13) SUBTRACT_m2n12(48,6,10,14) SUBTRACT_m2n12(56,7,11,15) SAVE_b_m2n12(0,4,8,12)\ +\ + SOLVE_uplo_m2n12(72,5,9,13) SUBTRACT_m2n12(80,6,10,14) SUBTRACT_m2n12(88,7,11,15)\ + SOLVE_lo_m2n12(104,5,9,13) SUBTRACT_m2n12(112,6,10,14) SUBTRACT_m2n12(120,7,11,15) SAVE_b_m2n12(32,5,9,13)\ +\ + SOLVE_uplo_m2n12(144,6,10,14) SUBTRACT_m2n12(152,7,11,15)\ + SOLVE_lo_m2n12(176,6,10,14) SUBTRACT_m2n12(184,7,11,15) SAVE_b_m2n12(64,6,10,14)\ +\ + SOLVE_uplo_m2n12(216,7,11,15)\ + SOLVE_lo_m2n12(248,7,11,15) SAVE_b_m2n12(96,7,11,15)\ +\ + "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15) + +#define GEMM_LT_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_LT_m8n4 GEMM_LT_SIMPLE(8,4) +#define GEMM_LT_m8n8 GEMM_LT_SIMPLE(8,8) +#define GEMM_LT_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; 
addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_LT_m4n4 GEMM_LT_SIMPLE(4,4) +#define GEMM_LT_m4n8 GEMM_LT_SIMPLE(4,8) +#define GEMM_LT_m4n12 GEMM_LT_SIMPLE(4,12) +#define GEMM_LT_m2n4 GEMM_LT_SIMPLE(2,4) +#define GEMM_LT_m2n8 GEMM_LT_SIMPLE(2,8) +#define GEMM_LT_m2n12 GEMM_LT_SIMPLE(2,12) +#define GEMM_LT_m1n4 GEMM_LT_SIMPLE(1,4) +#define GEMM_LT_m1n8 GEMM_LT_SIMPLE(1,8) +#define GEMM_LT_m1n12 GEMM_LT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_LT_m8n##ndim SOLVE_LT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_LT_m4n##ndim SOLVE_LT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_LT_m2n##ndim SOLVE_LT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_LT_m1n##ndim SOLVE_LT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M;\ +} +static void solve_LT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0;i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + 
solve_LT(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + kk += 8; a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + kk += 4; a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + kk += 2; a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + kk += 1; a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,offset); + return 0; +} + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h new file mode 100644 index 000000000..cfa56da97 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -0,0 +1,187 @@ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor 
%%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2"; vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2"; vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; 
vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklps %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhps %%ymm"#c4",%%ymm"#c3",%%ymm3;"\ + "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm0; vaddps %%ymm1,%%ymm"#c4",%%ymm1;"\ + "leaq (%3,%4,2),%3;"\ + "vmovups 
(%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\ + "vaddps %%ymm2,%%ymm"#c3",%%ymm2; vaddps %%ymm3,%%ymm"#c4",%%ymm3;"\ + "leaq (%3,%4,2),%3;"\ + "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\ + "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vunpcklps %%xmm"#c4",%%xmm"#c3",%%xmm2; vunpckhps %%xmm"#c4",%%xmm"#c3",%%xmm3;"\ + "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\ + "vaddps %%xmm0,%%xmm"#c3",%%xmm0; vaddps %%xmm1,%%xmm"#c4",%%xmm1;"\ + "leaq (%3,%4,2),%3;"\ + "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\ + "vaddps %%xmm2,%%xmm"#c3",%%xmm2; vaddps %%xmm3,%%xmm"#c4",%%xmm3;"\ + "leaq (%3,%4,2),%3;"\ + "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#co1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2,co1)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm0,%%xmm2,%%xmm0; leaq (%3,%4,2),%3;"\ + "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm1,%%xmm2,%%xmm1; leaq (%3,%4,2),%3;"\ + "vperm2f128 $2,%%ymm0,%%ymm1,%%ymm"#co1";" + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define save_c_m8n4(c1,c2,c3,c4)\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; 
vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhpd %%ymm"#c4",%%ymm"#c3",%%ymm3;"\ + "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"\ + "vmovups %%ymm"#c3",(%3); vmovups %%ymm"#c4",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define save_c_m4n4(c1,c2)\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vmovups %%xmm0,(%3); vmovups %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractf128 $1,%%ymm0,(%3); vextractf128 $1,%%ymm1,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define save_c_m2n4(c1)\ + "vextractf128 $1,%%ymm"#c1",%%xmm1; vmovsd %%xmm"#c1",(%3); vmovhpd %%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vmovsd %%xmm1,(%3); vmovhpd %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define save_c_m1n4(c1)\ + "vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SOLVE_up_m2n4(a_off,c1)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_up_m2n8(a_off,c1,c2)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_up_m2n12(a_off,c1,c2,c3)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2; vmovsldup %%ymm"#c3",%%ymm3;" + +#define SOLVE_uplo_m2n4(a_off,c1) SOLVE_up_m2n4(a_off,c1)\ + "vblendps $85,%9,%%ymm0,%%ymm0; 
vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_uplo_m2n8(a_off,c1,c2) SOLVE_up_m2n8(a_off,c1,c2)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_uplo_m2n12(a_off,c1,c2,c3) SOLVE_up_m2n12(a_off,c1,c2,c3)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" + +#define SOLVE_lo_m2n4(a_off,c1)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_lo_m2n8(a_off,c1,c2)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_lo_m2n12(a_off,c1,c2,c3)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2; vmovshdup %%ymm"#c3",%%ymm3;" + +#define SOLVE_loup_m2n4(a_off,c1) SOLVE_lo_m2n4(a_off,c1)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_loup_m2n8(a_off,c1,c2) SOLVE_lo_m2n8(a_off,c1,c2)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_loup_m2n12(a_off,c1,c2,c3) SOLVE_lo_m2n12(a_off,c1,c2,c3)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" + +#define SOLVE_m1n4(a_off,c1) "vbroadcastss "#a_off"(%0),%%xmm0; vmulps %%xmm0,%%xmm"#c1",%%xmm"#c1";" +#define SOLVE_m1n8(a_off,c1,c2) SOLVE_m1n4(a_off,c1) "vmulps %%xmm0,%%xmm"#c2",%%xmm"#c2";" +#define SOLVE_m1n12(a_off,c1,c2,c3) 
SOLVE_m1n8(a_off,c1,c2) "vmulps %%xmm0,%%xmm"#c3",%%xmm"#c3";" + +#define SUBTRACT_m2n4(a_off,c1) "vbroadcastsd "#a_off"(%0),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" +#define SUBTRACT_m2n8(a_off,c1,c2) SUBTRACT_m2n4(a_off,c1) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" +#define SUBTRACT_m2n12(a_off,c1,c2,c3) SUBTRACT_m2n8(a_off,c1,c2) "vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" + +#define save_b_m2n4(c1,tmp,b_off,...)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#tmp"; vpermpd $216,%%ymm"#tmp",%%ymm"#tmp"; vmovups %%ymm"#tmp","#b_off"("#__VA_ARGS__");" + +#define SAVE_b_m2n4(b_off,c1) save_b_m2n4(c1,1,b_off,%1) +#define SAVE_b_m2n8(b_off,c1,c2) SAVE_b_m2n4(b_off,c1) save_b_m2n4(c2,2,b_off,%1,%%r12,4) +#define SAVE_b_m2n12(b_off,c1,c2,c3) SAVE_b_m2n8(b_off,c1,c2) save_b_m2n4(c3,3,b_off,%1,%%r12,8) + +#define SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c1","#b_off"(%1);" +#define SAVE_b_m1n8(b_off,c1,c2) SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c2","#b_off"(%1,%%r12,4);" +#define SAVE_b_m1n12(b_off,c1,c2,c3) SAVE_b_m1n8(b_off,c1,c2) "vmovups %%xmm"#c3","#b_off"(%1,%%r12,8);" + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c new file mode 100644 index 000000000..4e2cd4fe6 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c @@ -0,0 +1,279 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RN_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1)\ + SOLVE_ri_m8n2(56,6,7,%1)\ + SAVE_SOLUTION_m8n2(6,7,64) + +#define SOLVE_RN_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(16,4,5,%1) 
SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(10,11,192) + +#define SOLVE_RN_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) 
SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(10,11,192)\ + SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(12,13,256)\ + SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(14,15,320) + +#define SOLVE_RN_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1)\ + SOLVE_ri_m4n2(56,5,%1)\ + SAVE_SOLUTION_m4n2(5,32) + +#define SOLVE_RN_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(7,96) + +#define SOLVE_RN_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ + 
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(7,96)\ + SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(8,128)\ + SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(9,160) + +#define SOLVE_RN_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,0) + +#define SOLVE_RN_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + 
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(6,7,32) + +#define SOLVE_RN_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(6,7,32)\ + SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(8,9,64) + +#define SOLVE_RN_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1)\ + SOLVE_col2_ltor_m1n4(16,4,%1)\ + SOLVE_col3_ltor_m1n4(32,4,%1)\ + SOLVE_col4_ltor_m1n4(48,4,%1)\ + SAVE_SOLUTION_m1n4(4,0) + +#define SOLVE_RN_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(4,0)\ + 
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(5,16) + +#define SOLVE_RN_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(5,16)\ + SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(6,32) + +#define GEMM_RN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) +#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) +#define GEMM_RN_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); 
addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) +#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) +#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) +#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) +#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) +#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) +#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) +#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) +#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ +} + +static void 
solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0; i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c new file mode 100644 index 000000000..ffcbfbbf0 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c @@ -0,0 +1,281 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + 
SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-128) + +#define SOLVE_RT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-256) + +#define SOLVE_RT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) 
SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ + SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-176,4,5,%1)\ + SOLVE_le_m8n2(-192,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-384) + +#define SOLVE_RT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(4,-64) + +#define SOLVE_RT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ 
+ SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(4,-128) + +#define SOLVE_RT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,6,%1,%%r12,4) 
SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ + SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ + SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-176,4,%1)\ + SOLVE_le_m4n2(-192,4,%1)\ + SAVE_SOLUTION_m4n2(4,-192) + +#define SOLVE_RT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-32) + +#define SOLVE_RT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-64) + +#define SOLVE_RT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq 
(%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-96) + +#define SOLVE_RT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(4,-16) + +#define SOLVE_RT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(4,-32) + +#define SOLVE_RT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) 
SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-144,4,%1)\ + SOLVE_col3_rtol_m1n4(-160,4,%1)\ + SOLVE_col2_rtol_m1n4(-176,4,%1)\ + SOLVE_col1_rtol_m1n4(-192,4,%1)\ + SAVE_SOLUTION_m1n4(4,-48) + +/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ +#define GEMM_RT_SIMPLE(mdim,ndim) \ + "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) +#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) +#define GEMM_RT_m8n12 \ + "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) +#define 
GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) +#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) +#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) +#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) +#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) +#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) +#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) +#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ +} + +static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ + FLOAT a0, b0; + int i, j, k; + for (i=n-1;i>=0;i--) { + b0 = b[i*n+i]; + for (j=0;j7;m_count-=8){ + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + 
for(;m_count>1;m_count-=2){ + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; + BLASLONG n_count = n; + if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} + if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h new file mode 100644 index 000000000..36b7aa1a3 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -0,0 +1,226 @@ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 
INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + 
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define 
GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) 
SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) 
SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" From cdc0e9011ea9911edb0027a207c67beeb0afca54 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 16 Mar 2020 16:39:37 +0000 Subject: [PATCH 068/593] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 1cd02db74..7bb308fea 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -74,10 +74,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c +STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c +STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c +STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c From 62b96089864a1951561671704c0de3e40bc7b477 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 17 Mar 2020 12:52:55 +0800 Subject: [PATCH 069/593] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b3c83e42..333571fd4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -6,6 +6,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c From a06d78556d48d6ae091b56cec262f4c90ec013cb Mon Sep 17 00:00:00 2001 From: shengyang Date: Wed, 18 Mar 2020 14:17:32 +0800 Subject: [PATCH 070/593] add ctest for srotm and modified ctest for srot. make sure that test cases cover all code path when kernel uses looping unrolling. 
--- ctest/c_sblas1.c | 7 + ctest/c_sblat1.f | 370 ++++++++++++++++++++++++++--------------------- 2 files changed, 209 insertions(+), 168 deletions(-) diff --git a/ctest/c_sblas1.c b/ctest/c_sblas1.c index 1f301a693..1a433b287 100644 --- a/ctest/c_sblas1.c +++ b/ctest/c_sblas1.c @@ -55,6 +55,13 @@ void F77_srotg( float *a, float *b, float *c, float *s) return; } +void F77_srotm(blasint *N, float *X, blasint *incX, float *Y, blasint *incY, + float *param) +{ + cblas_srotm(*N, X, *incX, Y, *incY, param); + return; +} + void F77_srot( blasint *N, float *X, blasint *incX, float *Y, blasint *incY, const float *c, const float *s) { diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 89902f12d..66a5def89 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -19,7 +19,7 @@ DATA SFAC/9.765625E-4/ * .. Executable Statements .. WRITE (NOUT,99999) - DO 20 IC = 1, 10 + DO 20 IC = 1, 11 ICASE = IC CALL HEADER * @@ -40,7 +40,7 @@ ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + ICASE.EQ.6) THEN CALL CHECK2(SFAC) - ELSE IF (ICASE.EQ.4) THEN + ELSE IF (ICASE.EQ.4 .OR. ICASE.EQ.11) THEN CALL CHECK3(SFAC) END IF * -- Print @@ -59,7 +59,7 @@ INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. - CHARACTER*15 L(10) + CHARACTER*15 L(11) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. @@ -73,6 +73,7 @@ DATA L(8)/'CBLAS_SASUM '/ DATA L(9)/'CBLAS_SSCAL '/ DATA L(10)/'CBLAS_ISAMAX'/ + DATA L(11)/'CBLAS_SROTM'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN @@ -396,203 +397,92 @@ * .. Scalar Arguments .. REAL SFAC * .. Scalars in Common .. - INTEGER ICASE, INCX, INCY, MODE, N + INTEGER ICASE, INCX, INCY, N LOGICAL PASS * .. Local Scalars .. REAL SC, SS - INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY + INTEGER I, K, KI, KN, KSIZE, LEN * .. Local Arrays .. 
- REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), - + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), - + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), - + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), - + SY(7) - INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), - + MWPINY(11), MWPN(11), NS(4) + REAL DX(19), DY(19), + + SSIZE2(19,2), STX(19), STY(19), SX(19), SY(19), + + PARAM(5, 4), SPARAM(5) + INTEGER INCXS(7), INCYS(7), NS(7) * .. External Subroutines .. - EXTERNAL SROTTEST, STEST + EXTERNAL SROTMTEST, SROTM * .. Intrinsic Functions .. - INTRINSIC ABS, MIN + INTRINSIC MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. - DATA INCXS/1, 2, -2, -1/ - DATA INCYS/1, -2, 1, -2/ - DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ - DATA NS/0, 1, 2, 4/ - DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, - + -0.4E0/ - DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + DATA INCXS/1, 1, 2, 2, -2, -1, -2/ + DATA INCYS/1, 2, 2, -2, 1, -2, -2/ + DATA NS/0, 1, 2, 4, 5, 8, 9/ + DATA DX/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, + + 0.2E0, 0.8E0, -0.46E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0/ + DATA DY/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, 0.66E0, + 0.8E0/ DATA SC, SS/0.8E0, 0.6E0/ - DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, - + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, - + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, - + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 
0.90E0, - + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, - + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, - + 0.0E0, 0.0E0, 0.0E0/ - DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, - + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, - + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, - + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, - + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, - + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, - + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, - + -0.18E0, 0.2E0, 0.16E0/ + DATA PARAM/-2.0E0, 1.0E0, 0.0E0, 0.0E0, 1.0E0, + + -1.0E0, 0.2E0, 0.3E0, 0.4E0, 0.5E0, + + 0.0E0, 1.0E0, 0.3E0, 0.4E0, 1.0E0, + + 1.0E0, 0.2E0, -1.0E0, 1.0E0, 0.5E0/ + DATA LEN/19/ DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, - + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, - + 1.17E0, 1.17E0, 1.17E0/ + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0/ * .. Executable Statements .. 
* - DO 60 KI = 1, 4 + DO 60 KI = 1, 7 INCX = INCXS(KI) INCY = INCYS(KI) - MX = ABS(INCX) - MY = ABS(INCY) * - DO 40 KN = 1, 4 + DO 40 KN = 1, 7 N = NS(KN) KSIZE = MIN(2,KN) - LENX = LENS(KN,MX) - LENY = LENS(KN,MY) * IF (ICASE.EQ.4) THEN * .. SROTTEST .. - DO 20 I = 1, 7 - SX(I) = DX1(I) - SY(I) = DY1(I) - STX(I) = DT9X(I,KN,KI) - STY(I) = DT9Y(I,KN,KI) + DO 20 I = 1, 19 + SX(I) = DX(I) + SY(I) = DY(I) + STX(I) = DX(I) + STY(I) = DY(I) 20 CONTINUE CALL SROTTEST(N,SX,INCX,SY,INCY,SC,SS) - CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) - CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + CALL SROT(N,STX,INCX,STY,INCY,SC,SS) + CALL STEST(LEN,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LEN,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.11) THEN +* .. SROTMTEST .. + DO 90 I = 1, 19 + SX(I) = DX(I) + SY(I) = DY(I) + STX(I) = DX(I) + STY(I) = DY(I) + 90 CONTINUE + DO 70 I = 1, 4 + DO 80 K = 1, 5 + SPARAM(K) = PARAM(K,I) + 80 CONTINUE + CALL SROTMTEST(N,SX,INCX,SY,INCY,SPARAM) + CALL SROTM(N,STX,INCX,STY,INCY,SPARAM) + CALL STEST(LEN,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LEN,SY,STY,SSIZE2(1,KSIZE),SFAC) + 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' STOP END IF 40 CONTINUE 60 CONTINUE -* - MWPC(1) = 1 - DO 80 I = 2, 11 - MWPC(I) = 0 - 80 CONTINUE - MWPS(1) = 0 - DO 100 I = 2, 6 - MWPS(I) = 1 - 100 CONTINUE - DO 120 I = 7, 11 - MWPS(I) = -1 - 120 CONTINUE - MWPINX(1) = 1 - MWPINX(2) = 1 - MWPINX(3) = 1 - MWPINX(4) = -1 - MWPINX(5) = 1 - MWPINX(6) = -1 - MWPINX(7) = 1 - MWPINX(8) = 1 - MWPINX(9) = -1 - MWPINX(10) = 1 - MWPINX(11) = -1 - MWPINY(1) = 1 - MWPINY(2) = 1 - MWPINY(3) = -1 - MWPINY(4) = -1 - MWPINY(5) = 2 - MWPINY(6) = 1 - MWPINY(7) = 1 - MWPINY(8) = -1 - MWPINY(9) = -1 - MWPINY(10) = 2 - MWPINY(11) = 1 - DO 140 I = 1, 11 - MWPN(I) = 5 - 140 CONTINUE - MWPN(5) = 3 - MWPN(10) = 3 - DO 160 I = 1, 5 - MWPX(I) = I - MWPY(I) = I - MWPTX(1,I) = I - MWPTY(1,I) = I - MWPTX(2,I) = I - MWPTY(2,I) = -I - MWPTX(3,I) = 6 - I - MWPTY(3,I) = I - 6 - MWPTX(4,I) = 
I - MWPTY(4,I) = -I - MWPTX(6,I) = 6 - I - MWPTY(6,I) = I - 6 - MWPTX(7,I) = -I - MWPTY(7,I) = I - MWPTX(8,I) = I - 6 - MWPTY(8,I) = 6 - I - MWPTX(9,I) = -I - MWPTY(9,I) = I - MWPTX(11,I) = I - 6 - MWPTY(11,I) = 6 - I - 160 CONTINUE - MWPTX(5,1) = 1 - MWPTX(5,2) = 3 - MWPTX(5,3) = 5 - MWPTX(5,4) = 4 - MWPTX(5,5) = 5 - MWPTY(5,1) = -1 - MWPTY(5,2) = 2 - MWPTY(5,3) = -2 - MWPTY(5,4) = 4 - MWPTY(5,5) = -3 - MWPTX(10,1) = -1 - MWPTX(10,2) = -3 - MWPTX(10,3) = -5 - MWPTX(10,4) = 4 - MWPTX(10,5) = 5 - MWPTY(10,1) = 1 - MWPTY(10,2) = 2 - MWPTY(10,3) = 2 - MWPTY(10,4) = 4 - MWPTY(10,5) = 3 - DO 200 I = 1, 11 - INCX = MWPINX(I) - INCY = MWPINY(I) - DO 180 K = 1, 5 - COPYX(K) = MWPX(K) - COPYY(K) = MWPY(K) - MWPSTX(K) = MWPTX(I,K) - MWPSTY(K) = MWPTY(I,K) - 180 CONTINUE - CALL SROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) - CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) - CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) - 200 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) @@ -726,3 +616,147 @@ + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END + SUBROUTINE SROT(N,SX,INCX,SY,INCY,C,S) +* +* --Reference BLAS level1 routine (version 3.8.0) -- +* --Reference BLAS is a software package provided by Univ. of Tennessee, -- +* --Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2017 +* +* .. Scalar Arguments .. + REAL C,S + INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. + REAL SX(*),SY(*) +* .. +* .. Local Scalars .. + REAL STEMP + INTEGER I,IX,IY +* .. + IF (n.LE.0) RETURN + IF (incx.EQ.1 .AND. 
incy.EQ.1) THEN + DO i = 1,n + stemp = c*sx(i) + s*sy(i) + sy(i) = c*sy(i) - s*sx(i) + sx(i) = stemp + END DO + ELSE + ix = 1 + iy = 1 + IF (incx.LT.0) ix = (-n+1)*incx + 1 + IF (incy.LT.0) iy = (-n+1)*incy + 1 + DO i = 1,n + stemp = c*sx(ix) + s*sy(iy) + sy(iy) = c*sy(iy) - s*sx(ix) + sx(ix) = stemp + ix = ix + incx + iy = iy + incy + END DO + END IF + RETURN + END + SUBROUTINE srotm(N,SX,INCX,SY,INCY,SPARAM) +* +* --Reference BLAS level1 routine (version 3.8.0) -- +* --Reference BLAS is a software package provided by Univ. of Tennessee, -- +* --Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2017 +* +* .. Scalar Arguments .. + INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. + REAL SPARAM(5),SX(*),SY(*) +* .. +* +* ==================================================================== +* +* .. Local Scalars .. + REAL SFLAG,SH11,SH12,SH21,SH22,TWO,W,Z,ZERO + INTEGER I,KX,KY,NSTEPS +* .. +* .. Data statements .. + DATA zero,two/0.e0,2.e0/ +* .. +* + sflag = sparam(1) + IF (n.LE.0 .OR. 
(sflag+two.EQ.zero)) RETURN + IF (incx.EQ.incy.AND.incx.GT.0) THEN +* + nsteps = n*incx + IF (sflag.LT.zero) THEN + sh11 = sparam(2) + sh12 = sparam(4) + sh21 = sparam(3) + sh22 = sparam(5) + DO i = 1,nsteps,incx + w = sx(i) + z = sy(i) + sx(i) = w*sh11 + z*sh12 + sy(i) = w*sh21 + z*sh22 + END DO + ELSE IF (sflag.EQ.zero) THEN + sh12 = sparam(4) + sh21 = sparam(3) + DO i = 1,nsteps,incx + w = sx(i) + z = sy(i) + sx(i) = w + z*sh12 + sy(i) = w*sh21 + z + END DO + ELSE + sh11 = sparam(2) + sh22 = sparam(5) + DO i = 1,nsteps,incx + w = sx(i) + z = sy(i) + sx(i) = w*sh11 + z + sy(i) = -w + sh22*z + END DO + END IF + ELSE + kx = 1 + ky = 1 + IF (incx.LT.0) kx = 1 + (1-n)*incx + IF (incy.LT.0) ky = 1 + (1-n)*incy +* + IF (sflag.LT.zero) THEN + sh11 = sparam(2) + sh12 = sparam(4) + sh21 = sparam(3) + sh22 = sparam(5) + DO i = 1,n + w = sx(kx) + z = sy(ky) + sx(kx) = w*sh11 + z*sh12 + sy(ky) = w*sh21 + z*sh22 + kx = kx + incx + ky = ky + incy + END DO + ELSE IF (sflag.EQ.zero) THEN + sh12 = sparam(4) + sh21 = sparam(3) + DO i = 1,n + w = sx(kx) + z = sy(ky) + sx(kx) = w + z*sh12 + sy(ky) = w*sh21 + z + kx = kx + incx + ky = ky + incy + END DO + ELSE + sh11 = sparam(2) + sh22 = sparam(5) + DO i = 1,n + w = sx(kx) + z = sy(ky) + sx(kx) = w*sh11 + z + sy(ky) = -w + sh22*z + kx = kx + incx + ky = ky + incy + END DO + END IF + END IF + RETURN + END \ No newline at end of file From 6d54c9476056b13fb91bf90bbc803968e0743aef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Mar 2020 01:08:10 +0100 Subject: [PATCH 071/593] Make ifort on Windows create lowercase symbols with appended underscore tentative fix for #2472 --- cmake/fc.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index f54c989d4..ff26ac06c 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -76,8 +76,11 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") endif () endif () -if (${F_COMPILER} STREQUAL "INTEL") +if (${F_COMPILER} STREQUAL "IFORT") set(CCOMMON_OPT 
"${CCOMMON_OPT} -DF_INTERFACE_INTEL") + if (MSVC) + set(FCOMMON_OPT "${FCOMMON_OPT} -names:lowercase -assume:underscore") + endif () if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () From b8307768e2b47705000a485ffb050f27cb579a93 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 21 Mar 2020 05:42:10 +0800 Subject: [PATCH 072/593] Add files via upload --- kernel/x86_64/KERNEL.SKYLAKEX | 10 +- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 515 +++++++++++++++++++ 2 files changed, 520 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 333571fd4..65f031d03 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -1,15 +1,15 @@ include $(KERNELDIR)/KERNEL.HASWELL -SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c +SGEMMKERNEL = sgemm_kernel_16x4_skylakex_3.c STRMMKERNEL = sgemm_kernel_16x4_skylakex_2.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c new file mode 100644 index 000000000..3b1af33c1 --- /dev/null +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -0,0 +1,515 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ +/* r10 to 
assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ + +#include "common.h" +#include + +/* m = 16 */ /* zmm8-zmm31 for accumulators, zmm4-zmm7 for temporary use, zmm0 for alpha */ +#define KERNEL_k1m16n1 \ + "vmovups (%0),%%zmm4; addq $64,%0;"\ + "vbroadcastss (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8;"\ + "addq $4,%1;" +#define KERNEL_h_k1m16n2 \ + "vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\ + "vbroadcastsd (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8; vfmadd231ps %%zmm5,%%zmm6,%%zmm9;" +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;" +#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;" +#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" +#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \ + "vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\ + "vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";" +#define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1) +#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;" +#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) +#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%1;" +#define KERNEL_h_k1m16n16 KERNEL_k1m16n12 unit_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15) +#define KERNEL_k1m16n16 KERNEL_h_k1m16n16 "addq $16,%%r15;" +#define KERNEL_h_k1m16n20 KERNEL_h_k1m16n16 unit_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1) +#define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;" +#define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2) +#define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;" +#define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;" +#define 
INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;" +#define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" +#define unit_init_m16n4(c1,c2,c3,c4) \ + "vpxorq "#c1","#c1","#c1";vpxorq "#c2","#c2","#c2";vpxorq "#c3","#c3","#c3";vpxorq "#c4","#c4","#c4";" +#define INIT_m16n8 INIT_m16n4 unit_init_m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15) +#define INIT_m16n12 INIT_m16n8 unit_init_m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19) +#define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23) +#define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27) +#define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31) +#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);" +#define unit_save_m16n2(c1,c2) \ + "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ + "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ + "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9) +#define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11) +#define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15) +#define SAVE_h_m16n12 SAVE_h_m16n8 unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19) +#define SAVE_h_m16n16 SAVE_h_m16n12 unit_save_m16n2(%%zmm20,%%zmm21) unit_save_m16n2(%%zmm22,%%zmm23) +#define SAVE_h_m16n20 SAVE_h_m16n16 unit_save_m16n2(%%zmm24,%%zmm25) unit_save_m16n2(%%zmm26,%%zmm27) +#define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31) +#define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;" +#define COMPUTE_m16(ndim) \ + INIT_m16n##ndim\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; xorq %%r10,%%r10;"\ + "cmpq $16,%4; jb "#ndim"016162f;"\ + 
#ndim"016161:\n\t"\ + "cmpq $126,%%r10; movq $126,%%r10; cmoveq %3,%%r10;"\ + KERNEL_k1m16n##ndim\ + KERNEL_k1m16n##ndim\ + "prefetcht1 (%5); subq $63,%5; addq %%r10,%5;"\ + KERNEL_k1m16n##ndim\ + KERNEL_k1m16n##ndim\ + "prefetcht1 (%6); addq $32,%6;"\ + "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\ + "movq %2,%5;"\ + #ndim"016162:\n\t"\ + "testq %4,%4; jz "#ndim"016164f;"\ + #ndim"016163:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\ + KERNEL_k1m16n##ndim\ + "leaq (%5,%3,2),%5; decq %4; jnz "#ndim"016163b;"\ + #ndim"016164:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ + SAVE_m16(ndim) +#define unit_save_m16n2_rscr(c1,c2,scr_off) \ + "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ + "vmovups "#scr_off"(%7),%%zmm6; vfmadd213ps -64(%5),%%zmm0,%%zmm6; vfmadd213ps (%5),%%zmm0,%%zmm4;"\ + "vmovups %%zmm6,-64(%5); vmovups %%zmm4,(%5);"\ + "vmovups "#scr_off"+64(%7),%%zmm6; vfmadd213ps -64(%5,%3,1),%%zmm0,%%zmm6; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\ + "vmovups %%zmm6,-64(%5,%3,1); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;" +#define unit_save_m16n2_wscr(c1,c2,scr_off) \ + "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\ + "vmovups %%zmm4,"#scr_off"(%7); vmovups %%zmm5,"#scr_off"+64(%7);" +#define COMPUTE_m16n24_LSAVE \ + INIT_m16n24\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ + "cmpq $16,%4; jb 24716162f; movq $16,%4;"\ + "24716161:\n\t"\ + KERNEL_k1m16n24 "addq $4,%4; testq $12,%4; movq $172,%%r10; cmovz %3,%%r10;"\ + KERNEL_k1m16n24 "prefetcht1 -64(%5); leaq -129(%5,%%r10,1),%5;"\ + KERNEL_k1m16n24 "prefetcht1 (%6); addq $32,%6; cmpq $208,%4; cmoveq %2,%5;"\ + KERNEL_k1m16n24 "cmpq %4,%%r13; jnb 24716161b;"\ + "movq %2,%5; negq %4; leaq 16(%%r13,%4,1),%4;"\ + "24716162:\n\t"\ + "testq %4,%4; jz 24716164f; movq 
%7,%%r10;"\ + "24716163:\n\t"\ + "prefetcht0 -64(%5); prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m16n24 "prefetcht0 (%%r10); addq $64,%%r10; decq %4; jnz 24716163b;"\ + "24716164:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14); movq %2,%5; addq $64,%2;"\ + unit_save_m16n2_rscr(%%zmm8,%%zmm9,0) unit_save_m16n2_rscr(%%zmm10,%%zmm11,128) unit_save_m16n2_rscr(%%zmm12,%%zmm13,256)\ + unit_save_m16n2_rscr(%%zmm14,%%zmm15,384) unit_save_m16n2_rscr(%%zmm16,%%zmm17,512) unit_save_m16n2_rscr(%%zmm18,%%zmm19,640)\ + unit_save_m16n2_wscr(%%zmm20,%%zmm21,0) unit_save_m16n2_wscr(%%zmm22,%%zmm23,128) unit_save_m16n2_wscr(%%zmm24,%%zmm25,256)\ + unit_save_m16n2_wscr(%%zmm26,%%zmm27,384) unit_save_m16n2_wscr(%%zmm28,%%zmm29,512) unit_save_m16n2_wscr(%%zmm30,%%zmm31,640) +#define COMPUTE_m16n24_RSAVE \ + INIT_m16n24 "leaq (%2,%3,8),%2; leaq (%2,%3,4),%2;"\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ + "cmpq $16,%4; jb 24616162f; movq $16,%4;"\ + "24616161:\n\t"\ + KERNEL_k1m16n24 "addq $4,%4; testq $12,%4; movq $172,%%r10; cmovz %3,%%r10;"\ + KERNEL_k1m16n24 "prefetcht1 -64(%5); leaq -129(%5,%%r10,1),%5;"\ + KERNEL_k1m16n24 "prefetcht1 (%6); addq $32,%6; cmpq $208,%4; cmoveq %2,%5;"\ + KERNEL_k1m16n24 "cmpq %4,%%r13; jnb 24616161b;"\ + "movq %2,%5; negq %4; leaq 16(%%r13,%4,1),%4;"\ + "24616162:\n\t"\ + "testq %4,%4; jz 24616164f; movq %7,%%r10;"\ + "24616163:\n\t"\ + "prefetcht0 -64(%5); prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m16n24 "prefetcht0 (%%r10); addq $64,%%r10; decq %4; jnz 24616163b;"\ + "24616164:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14); movq %2,%5; addq $64,%2;"\ + unit_save_m16n2_rscr(%%zmm20,%%zmm21,0) unit_save_m16n2_rscr(%%zmm22,%%zmm23,128) unit_save_m16n2_rscr(%%zmm24,%%zmm25,256)\ + unit_save_m16n2_rscr(%%zmm26,%%zmm27,384) unit_save_m16n2_rscr(%%zmm28,%%zmm29,512) unit_save_m16n2_rscr(%%zmm30,%%zmm31,640)\ + unit_save_m16n2_wscr(%%zmm8,%%zmm9,0) 
unit_save_m16n2_wscr(%%zmm10,%%zmm11,128) unit_save_m16n2_wscr(%%zmm12,%%zmm13,256)\ + unit_save_m16n2_wscr(%%zmm14,%%zmm15,384) unit_save_m16n2_wscr(%%zmm16,%%zmm17,512) unit_save_m16n2_wscr(%%zmm18,%%zmm19,640)\ + "negq %3; leaq (%2,%3,8),%2; leaq (%2,%3,4),%2; negq %3;" +#define COMPUTE_m16n24_LINIT \ + INIT_m16n24\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ + "cmpq $16,%4; jb 24516162f; movq $16,%4;"\ + "24516161:\n\t"\ + KERNEL_k1m16n24 "addq $4,%4; testq $12,%4; movq $84,%%r10; cmovz %3,%%r10;"\ + KERNEL_k1m16n24 "prefetcht1 (%5); leaq -63(%5,%%r10,1),%5;"\ + KERNEL_k1m16n24 "prefetcht1 (%6); addq $32,%6; cmpq $208,%4; cmoveq %2,%5;"\ + KERNEL_k1m16n24 "cmpq %4,%%r13; jnb 24516161b;"\ + "movq %2,%5; negq %4; leaq 16(%%r13,%4,1),%4;"\ + "24516162:\n\t"\ + "testq %4,%4; jz 24516164f; movq %7,%%r10;"\ + "24516163:\n\t"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m16n24 "prefetcht0 (%%r10); addq $64,%%r10; decq %4; jnz 24516163b;"\ + "24516164:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14); movq %2,%5; addq $64,%2;"\ + unit_save_m16n2(%%zmm8,%%zmm9) unit_save_m16n2(%%zmm10,%%zmm11) unit_save_m16n2(%%zmm12,%%zmm13)\ + unit_save_m16n2(%%zmm14,%%zmm15) unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19)\ + unit_save_m16n2_wscr(%%zmm20,%%zmm21,0) unit_save_m16n2_wscr(%%zmm22,%%zmm23,128) unit_save_m16n2_wscr(%%zmm24,%%zmm25,256)\ + unit_save_m16n2_wscr(%%zmm26,%%zmm27,384) unit_save_m16n2_wscr(%%zmm28,%%zmm29,512) unit_save_m16n2_wscr(%%zmm30,%%zmm31,640) +#define COMPUTE_m16n24_LTAIL \ + INIT_m16n24\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ + "cmpq $16,%4; jb 24416162f; movq $16,%4;"\ + "24416161:\n\t"\ + KERNEL_k1m16n24 "addq $4,%4; testq $4,%4; movq $126,%%r10; cmovz %3,%%r10;"\ + KERNEL_k1m16n24 "prefetcht1 -64(%5); prefetcht1 (%5); leaq -63(%5,%%r10,1),%5;"\ + KERNEL_k1m16n24 "prefetcht1 (%6); addq $32,%6; cmpq $208,%4; cmoveq 
%2,%5;"\ + KERNEL_k1m16n24 "cmpq %4,%%r13; jnb 24416161b;"\ + "movq %2,%5; negq %4; leaq 16(%%r13,%4,1),%4;"\ + "24416162:\n\t"\ + "testq %4,%4; jz 24416164f; movq %7,%%r10;"\ + "24416163:\n\t"\ + "prefetcht0 -64(%5); prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 -64(%5,%3,1); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1); leaq (%5,%3,2),%5;"\ + KERNEL_k1m16n24 "prefetcht0 (%%r10); addq $64,%%r10; decq %4; jnz 24416163b;"\ + "24416164:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14); movq %2,%5; addq $64,%2;"\ + unit_save_m16n2_rscr(%%zmm8,%%zmm9,0) unit_save_m16n2_rscr(%%zmm10,%%zmm11,128) unit_save_m16n2_rscr(%%zmm12,%%zmm13,256)\ + unit_save_m16n2_rscr(%%zmm14,%%zmm15,384) unit_save_m16n2_rscr(%%zmm16,%%zmm17,512) unit_save_m16n2_rscr(%%zmm18,%%zmm19,640)\ + unit_save_m16n2(%%zmm20,%%zmm21) unit_save_m16n2(%%zmm22,%%zmm23) unit_save_m16n2(%%zmm24,%%zmm25)\ + unit_save_m16n2(%%zmm26,%%zmm27) unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31) +#define COMPUTE_m16n24_RTAIL \ + INIT_m16n24\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\ + "cmpq $16,%4; jb 24416162f; movq $16,%4;"\ + "24416161:\n\t"\ + KERNEL_k1m16n24 "addq $4,%4; testq $4,%4; movq $126,%%r10; cmovz %3,%%r10;"\ + KERNEL_k1m16n24 "prefetcht1 -64(%5); prefetcht1 (%5); leaq -63(%5,%%r10,1),%5;"\ + KERNEL_k1m16n24 "prefetcht1 (%6); addq $32,%6; cmpq $208,%4; cmoveq %2,%5;"\ + KERNEL_k1m16n24 "cmpq %4,%%r13; jnb 24416161b;"\ + "movq %2,%5; negq %4; leaq 16(%%r13,%4,1),%4;"\ + "24416162:\n\t"\ + "testq %4,%4; jz 24416164f; movq %7,%%r10;"\ + "24416163:\n\t"\ + "prefetcht0 -64(%5); prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 -64(%5,%3,1); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1); leaq (%5,%3,2),%5;"\ + KERNEL_k1m16n24 "prefetcht0 (%%r10); addq $64,%%r10; decq %4; jnz 24416163b;"\ + "24416164:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14); movq %2,%5; addq $64,%2;"\ + unit_save_m16n2(%%zmm8,%%zmm9) unit_save_m16n2(%%zmm10,%%zmm11) 
unit_save_m16n2(%%zmm12,%%zmm13)\ + unit_save_m16n2(%%zmm14,%%zmm15) unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19)\ + unit_save_m16n2_rscr(%%zmm20,%%zmm21,0) unit_save_m16n2_rscr(%%zmm22,%%zmm23,128) unit_save_m16n2_rscr(%%zmm24,%%zmm25,256)\ + unit_save_m16n2_rscr(%%zmm26,%%zmm27,384) unit_save_m16n2_rscr(%%zmm28,%%zmm29,512) unit_save_m16n2_rscr(%%zmm30,%%zmm31,640) + +/* m = 8 *//* zmm0 for alpha, zmm1-2 for perm words, zmm4-7 for temporary use, zmm8-19 for accumulators */ +#define KERNEL_k1m8n1 \ + "vbroadcastss (%1),%%ymm4; addq $4,%1; vfmadd231ps (%0),%%ymm4,%%ymm8; addq $32,%0;" +#define KERNEL_k1m8n2 \ + "vmovups (%0),%%ymm4; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm5; vfmadd231ps %%ymm5,%%ymm4,%%ymm8;"\ + "vbroadcastss 4(%1),%%ymm6; vfmadd231ps %%ymm6,%%ymm4,%%ymm9; addq $8,%1;" +#define unit_kernel_k1m8n4(c1,c2,...)\ + "vbroadcastf32x4 ("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm7,%%zmm4,"#c1"; vfmadd231ps %%zmm7,%%zmm5,"#c2";" +#define KERNEL_h_k1m8n4 \ + "vbroadcastf32x4 (%0),%%zmm4; vpermilps %%zmm2,%%zmm4,%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; vpermilps %%zmm2,%%zmm5,%%zmm5; addq $32,%0;"\ + unit_kernel_k1m8n4(%%zmm8,%%zmm9,%1) +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%zmm10,%%zmm11,%1,%%r12,1) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%zmm12,%%zmm13,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m8n16 KERNEL_k1m8n12 unit_kernel_k1m8n4(%%zmm14,%%zmm15,%%r15) +#define KERNEL_k1m8n16 KERNEL_h_k1m8n16 "addq $16,%%r15;" +#define KERNEL_h_k1m8n20 KERNEL_h_k1m8n16 unit_kernel_k1m8n4(%%zmm16,%%zmm17,%%r15,%%r12,1) +#define KERNEL_k1m8n20 KERNEL_h_k1m8n20 "addq $16,%%r15;" +#define KERNEL_k1m8n24 KERNEL_h_k1m8n20 unit_kernel_k1m8n4(%%zmm18,%%zmm19,%%r15,%%r12,2) "addq $16,%%r15;" +#define INIT_m8n1 "vpxor %%ymm8,%%ymm8,%%ymm8;" +#define INIT_m8n2 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor 
%%ymm9,%%ymm9,%%ymm9;" +#define unit_init_m8n4(c1,c2) "vpxorq "#c1","#c1","#c1";vpxorq "#c2","#c2","#c2";" +#define INIT_m8n4 unit_init_m8n4(%%zmm8,%%zmm9) +#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%zmm10,%%zmm11) +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%zmm12,%%zmm13) +#define INIT_m8n16 INIT_m8n12 unit_init_m8n4(%%zmm14,%%zmm15) +#define INIT_m8n20 INIT_m8n16 unit_init_m8n4(%%zmm16,%%zmm17) +#define INIT_m8n24 INIT_m8n20 unit_init_m8n4(%%zmm18,%%zmm19) +#define SAVE_h_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm8; vmovups %%ymm8,(%2);" +#define SAVE_h_m8n2 \ + "vfmadd213ps (%2),%%ymm0,%%ymm8; vmovups %%ymm8,(%2);"\ + "vfmadd213ps (%2,%3,1),%%ymm0,%%ymm9; vmovups %%ymm9,(%2,%3,1);" +#define unit_save_m8n4(c1_no,c2_no)\ + "vpermps %%zmm"#c1_no",%%zmm1,%%zmm"#c1_no"; vpermps %%zmm"#c2_no",%%zmm1,%%zmm"#c2_no";"\ + "vextractf64x4 $1,%%zmm"#c1_no",%%ymm5; vextractf64x4 $1,%%zmm"#c2_no",%%ymm6;"\ + "vmovups (%5),%%xmm4; vinsertf128 $1,(%5,%3,1),%%ymm4,%%ymm4; vfmadd231ps %%ymm"#c1_no",%%ymm0,%%ymm4;"\ + "vmovups %%xmm4,(%5); vextractf128 $1,%%ymm4,(%5,%3,1);"\ + "vmovups 16(%5),%%xmm4; vinsertf128 $1,16(%5,%3,1),%%ymm4,%%ymm4; vfmadd231ps %%ymm"#c2_no",%%ymm0,%%ymm4;"\ + "vmovups %%xmm4,16(%5); vextractf128 $1,%%ymm4,16(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovups (%5),%%xmm4; vinsertf128 $1,(%5,%3,1),%%ymm4,%%ymm4; vfmadd231ps %%ymm5,%%ymm0,%%ymm4;"\ + "vmovups %%xmm4,(%5); vextractf128 $1,%%ymm4,(%5,%3,1);"\ + "vmovups 16(%5),%%xmm4; vinsertf128 $1,16(%5,%3,1),%%ymm4,%%ymm4; vfmadd231ps %%ymm6,%%ymm0,%%ymm4;"\ + "vmovups %%xmm4,16(%5); vextractf128 $1,%%ymm4,16(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_h_m8n4 "movq %2,%5;" unit_save_m8n4(8,9) +#define SAVE_h_m8n8 SAVE_h_m8n4 unit_save_m8n4(10,11) +#define SAVE_h_m8n12 SAVE_h_m8n8 unit_save_m8n4(12,13) +#define SAVE_h_m8n16 SAVE_h_m8n12 unit_save_m8n4(14,15) +#define SAVE_h_m8n20 SAVE_h_m8n16 unit_save_m8n4(16,17) +#define SAVE_h_m8n24 SAVE_h_m8n20 unit_save_m8n4(18,19) +#define SAVE_m8(ndim) SAVE_h_m8n##ndim "addq 
$32,%2;" +#define COMPUTE_m8(ndim) \ + INIT_m8n##ndim\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "testq %4,%4; jz "#ndim"008082f;"\ + #ndim"008081:\n\t"\ + KERNEL_k1m8n##ndim "decq %4; jnz "#ndim"008081b;"\ + #ndim"008082:\n\t"\ + SAVE_m8(ndim) + +/* m = 4 *//* zmm0 for alpha, zmm1-2 for perm words, zmm4-7 for temporary use, zmm8-15 for accumulators */ +#define KERNEL_k1m4n1 "vbroadcastss (%1),%%xmm4; addq $4,%1; vfmadd231ps (%0),%%xmm4,%%xmm8; addq $16,%0;" +#define KERNEL_k1m4n2 "vmovups (%0),%%xmm4; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm5; vfmadd231ps %%xmm5,%%xmm4,%%xmm8;"\ + "vbroadcastss 4(%1),%%xmm5; vfmadd231ps %%xmm5,%%xmm4,%%xmm9; addq $8,%1;" +#define unit_kernel_k1m4n4(c1,...) "vbroadcastf32x4 ("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm7,%%zmm4,"#c1";" +#define KERNEL_h_k1m4n4 "vbroadcastf32x4 (%0),%%zmm4; vpermilps %%zmm2,%%zmm4,%%zmm4; addq $16,%0;" unit_kernel_k1m4n4(%%zmm8,%1) +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%zmm9,%1,%%r12,1) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%zmm10,%1,%%r12,2) "addq $16,%1;" +#define KERNEL_h_k1m4n16 KERNEL_k1m4n12 unit_kernel_k1m4n4(%%zmm11,%%r15) +#define KERNEL_k1m4n16 KERNEL_h_k1m4n16 "addq $16,%%r15;" +#define KERNEL_h_k1m4n20 KERNEL_h_k1m4n16 unit_kernel_k1m4n4(%%zmm12,%%r15,%%r12,1) +#define KERNEL_k1m4n20 KERNEL_h_k1m4n20 "addq $16,%%r15;" +#define KERNEL_h_k1m4n24 KERNEL_h_k1m4n20 unit_kernel_k1m4n4(%%zmm13,%%r15,%%r12,2) +#define KERNEL_k1m4n24 KERNEL_h_k1m4n24 "addq $16,%%r15;" +#define INIT_m4n1 "vpxor %%xmm8,%%xmm8,%%xmm8;" +#define INIT_m4n2 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define INIT_m4n4 "vpxorq %%zmm8,%%zmm8,%%zmm8;" +#define INIT_m4n8 INIT_m4n4 "vpxorq %%zmm9,%%zmm9,%%zmm9;" +#define INIT_m4n12 INIT_m4n8 "vpxorq %%zmm10,%%zmm10,%%zmm10;" +#define INIT_m4n16 INIT_m4n12 "vpxorq 
%%zmm11,%%zmm11,%%zmm11;" +#define INIT_m4n20 INIT_m4n16 "vpxorq %%zmm12,%%zmm12,%%zmm12;" +#define INIT_m4n24 INIT_m4n20 "vpxorq %%zmm13,%%zmm13,%%zmm13;" +#define SAVE_h_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm8; vmovups %%xmm8,(%2);" +#define SAVE_h_m4n2 "vfmadd213ps (%2),%%xmm0,%%xmm8; vmovups %%xmm8,(%2); vfmadd213ps (%2,%3,1),%%xmm0,%%xmm9; vmovups %%xmm9,(%2,%3,1);" +#define unit_save_m4n4(c1_no)\ + "vpermps %%zmm"#c1_no",%%zmm1,%%zmm"#c1_no"; vextractf64x4 $1,%%zmm"#c1_no",%%ymm5;"\ + "vmovups (%5),%%xmm4; vinsertf128 $1,(%5,%3,1),%%ymm4,%%ymm4; vfmadd231ps %%ymm0,%%ymm"#c1_no",%%ymm4;"\ + "vmovups %%xmm4,(%5); vextractf128 $1,%%ymm4,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovups (%5),%%xmm4; vinsertf128 $1,(%5,%3,1),%%ymm4,%%ymm4; vfmadd231ps %%ymm0,%%ymm5,%%ymm4;"\ + "vmovups %%xmm4,(%5); vextractf128 $1,%%ymm4,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_h_m4n4 "movq %2,%5;" unit_save_m4n4(8) +#define SAVE_h_m4n8 SAVE_h_m4n4 unit_save_m4n4(9) +#define SAVE_h_m4n12 SAVE_h_m4n8 unit_save_m4n4(10) +#define SAVE_h_m4n16 SAVE_h_m4n12 unit_save_m4n4(11) +#define SAVE_h_m4n20 SAVE_h_m4n16 unit_save_m4n4(12) +#define SAVE_h_m4n24 SAVE_h_m4n20 unit_save_m4n4(13) +#define SAVE_m4(ndim) SAVE_h_m4n##ndim "addq $16,%2;" +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "testq %4,%4; jz "#ndim"004042f;"\ + #ndim"004041:\n\t"\ + KERNEL_k1m4n##ndim "decq %4; jnz "#ndim"004041b;"\ + #ndim"004042:\n\t"\ + SAVE_m4(ndim) + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq 
$8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define INIT_m2n16 INIT_m2n12 "vpxor %%xmm10,%%xmm10,%%xmm10; vpxor %%xmm11,%%xmm11,%%xmm11;" +#define INIT_m2n20 INIT_m2n16 "vpxor %%xmm12,%%xmm12,%%xmm12; vpxor %%xmm13,%%xmm13,%%xmm13;" +#define INIT_m2n24 INIT_m2n20 "vpxor %%xmm14,%%xmm14,%%xmm14; vpxor %%xmm15,%%xmm15,%%xmm15;" +#define KERNEL_h_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;"\ + "vmovups (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 "vmovups (%1,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%1;" +#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 \ + "vmovups (%1,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm8; vfmadd231ps %%xmm2,%%xmm3,%%xmm9; addq $16,%1;" +#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 "vmovups (%%r15),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm10; vfmadd231ps %%xmm2,%%xmm3,%%xmm11;" +#define KERNEL_k1m2n16 KERNEL_h_k1m2n16 "addq $16,%%r15;" +#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 "vmovups (%%r15,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm12; vfmadd231ps %%xmm2,%%xmm3,%%xmm13;" +#define KERNEL_k1m2n20 KERNEL_h_k1m2n20 "addq $16,%%r15;" +#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 "vmovups (%%r15,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm14; vfmadd231ps %%xmm2,%%xmm3,%%xmm15;" +#define KERNEL_k1m2n24 KERNEL_h_k1m2n24 "addq $16,%%r15;" +#define unit_save_m2n4(c1,c2) \ 
+ "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define SAVE_h_m2n16 SAVE_h_m2n12 unit_save_m2n4(%%xmm10,%%xmm11) +#define SAVE_h_m2n20 SAVE_h_m2n16 unit_save_m2n4(%%xmm12,%%xmm13) +#define SAVE_h_m2n24 SAVE_h_m2n20 unit_save_m2n4(%%xmm14,%%xmm15) +#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $8,%2;" +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "testq %4,%4; jz "#ndim"002022f;"\ + #ndim"002021:\n\t"\ + KERNEL_k1m2n##ndim "decq %4; jnz "#ndim"002021b;"\ + #ndim"002022:\n\t"\ + SAVE_m2(ndim) + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_h_m1n2 \ + "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define INIT_m1n16 INIT_m1n12 
"vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m1n20 INIT_m1n16 "vpxor %%xmm8,%%xmm8,%%xmm8;" +#define INIT_m1n24 INIT_m1n20 "vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_h_k1m1n4 \ + "vbroadcastss (%0),%%xmm1; addq $4,%0; vfmadd231ps (%1),%%xmm1,%%xmm4;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_h_k1m1n8 KERNEL_h_k1m1n4 "vfmadd231ps (%1,%%r12,1),%%xmm1,%%xmm5;" +#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%1;" +#define KERNEL_k1m1n12 KERNEL_h_k1m1n8 "vfmadd231ps (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;" +#define KERNEL_h_k1m1n16 KERNEL_k1m1n12 "vfmadd231ps (%%r15),%%xmm1,%%xmm7;" +#define KERNEL_k1m1n16 KERNEL_h_k1m1n16 "addq $16,%%r15;" +#define KERNEL_h_k1m1n20 KERNEL_h_k1m1n16 "vfmadd231ps (%%r15,%%r12,1),%%xmm1,%%xmm8;" +#define KERNEL_k1m1n20 KERNEL_h_k1m1n20 "addq $16,%%r15;" +#define KERNEL_h_k1m1n24 KERNEL_h_k1m1n20 "vfmadd231ps (%%r15,%%r12,2),%%xmm1,%%xmm9;" +#define KERNEL_k1m1n24 KERNEL_h_k1m1n24 "addq $16,%%r15;" +#define unit_save_m1n4(c1) \ + "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;" +#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%xmm6) +#define SAVE_h_m1n16 SAVE_h_m1n12 unit_save_m1n4(%%xmm7) +#define SAVE_h_m1n20 SAVE_h_m1n16 unit_save_m1n4(%%xmm8) +#define SAVE_h_m1n24 SAVE_h_m1n20 unit_save_m1n4(%%xmm9) +#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $4,%2;" +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\ + "testq %4,%4; jz 
"#ndim"001012f;"\ + #ndim"001011:\n\t"\ + KERNEL_k1m1n##ndim "decq %4; jnz "#ndim"001011b;"\ + #ndim"001012:\n\t"\ + SAVE_m1(ndim) + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ +/* %6 = "+r"(next_b), %7 = "m"(ALPHA), %8 = "m"(M) */ +/* r11 = m_counter, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ + +#define COMPUTE(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastss %7,%%zmm0; vmovups %9,%%zmm1; vmovups %10,%%zmm2;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\ + "cmpq $16,%%r11;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m16(ndim)\ + "subq $16,%%r11;cmpq $16,%%r11;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $8,%%r11;jb 33102"#ndim"f;"\ + COMPUTE_m8(ndim)\ + "subq $8,%%r11;"\ + "33102"#ndim":\n\t"\ + "cmpq $4,%%r11;jb 33103"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%%r11;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%%r11;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%%r11;"\ + "33104"#ndim":\n\t"\ + "testq %%r11,%%r11;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r13,%4; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b):"m"(ALPHA),"m"(M),"m"(perm[0]),"m"(permil[0])\ + :"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ + "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ + "cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M;\ +} + +#define COMPUTE_n24 {\ + next_b = b_pointer + 24 * K;\ + __asm__ __volatile__(\ + "vbroadcastss %8,%%zmm0; vmovups %10,%%zmm1; vmovups %11,%%zmm2;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; 
movq %9,%%r11;"\ + "cmpq $32,%%r11;jb 3310024f;"\ + COMPUTE_m16n24_LINIT "subq $16,%%r11; cmpq $32,%%r11;jb 3310724f;"\ + "3310924:\n\t"\ + COMPUTE_m16n24_RSAVE "subq $16,%%r11; cmpq $32,%%r11;jb 3310824f;"\ + COMPUTE_m16n24_LSAVE "subq $16,%%r11; cmpq $32,%%r11;jnb 3310924b;"\ + "3310724:\n\t"\ + COMPUTE_m16n24_RTAIL "subq $16,%%r11; jmp 3310124f;"\ + "3310824:\n\t"\ + COMPUTE_m16n24_LTAIL "subq $16,%%r11; jmp 3310124f;"\ + "3310024:\n\t"\ + "cmpq $16,%%r11;jb 3310124f;"\ + COMPUTE_m16(24)\ + "subq $16,%%r11;"\ + "3310124:\n\t"\ + "cmpq $8,%%r11;jb 3310224f;"\ + COMPUTE_m8(24)\ + "subq $8,%%r11;"\ + "3310224:\n\t"\ + "cmpq $4,%%r11;jb 3310324f;"\ + COMPUTE_m4(24)\ + "subq $4,%%r11;"\ + "3310324:\n\t"\ + "cmpq $2,%%r11;jb 3310424f;"\ + COMPUTE_m2(24)\ + "subq $2,%%r11;"\ + "3310424:\n\t"\ + "testq %%r11,%%r11;jz 3310524f;"\ + COMPUTE_m1(24)\ + "3310524:\n\t"\ + "movq %%r13,%4; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b),"+r"(wscr):"m"(ALPHA),"m"(M),"m"(perm[0]),"m"(permil[0])\ + :"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\ + "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\ + "cc","memory");\ + a_pointer -= M * K; b_pointer += 24 * K; c_pointer += LDC * 24 - M;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||alpha==(float)0.0) return 0; + float scr[192]; float *wscr = scr; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha; + int64_t M = (int64_t)m, K = (int64_t)k; + int32_t perm[16] = {0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15}; + int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; + BLASLONG n_count = n; + 
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>23;n_count-=24) COMPUTE_n24 + for(;n_count>19;n_count-=20) COMPUTE(20) + for(;n_count>15;n_count-=16) COMPUTE(16) + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} +#include +#include "sgemm_direct_skylakex.c" From 64daad436557418c0e8ad096fad1e3c4faeaa64e Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 20 Mar 2020 21:46:18 +0000 Subject: [PATCH 073/593] Update param.h --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index e479314d9..6e12bb37b 100644 --- a/param.h +++ b/param.h @@ -1700,12 +1700,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else -#define SGEMM_DEFAULT_P 640 +#define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 192 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 320 +#define SGEMM_DEFAULT_Q 448 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 From 1d9773b800f0184721cc14d15a9074083c1f07e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Mar 2020 23:05:53 +0100 Subject: [PATCH 074/593] Use proper extension on the avx512 testcase filename The need to call it .tmp existed only when it was generated by a tmpfile call, and the "-x c" option to tell the compiler it is actually a C source is not universally supported (this broke the test with clang-cl at least) --- cmake/system_check.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c4a553c5a..94eb0a9c6 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -109,10 +109,10 @@ else() endif() if (X86_64 OR X86) - file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include 
\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) + file(WRITE ${PROJECT_BINARY_DIR}/avx512.c "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o ${PROJECT_BINARY_DIR}/avx512.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() - file(REMOVE "avx512.tmp" "avx512.o") + file(REMOVE "avx512.c" "avx512.o") endif() From 71cf2acdef490bd7ad915d39097178c0fd23ad5e Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Sat, 21 Mar 2020 17:33:33 +0100 Subject: [PATCH 075/593] Fix ARCHCONFIG for Neoverse-N1 ../config_kernel.h:24:9: warning: missing whitespace after the macro name 24 | #define ARMV8-march armv8.2-a | ^~~~~ --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 30ca290e3..145753bcc 100644 --- a/getarch.c +++ b/getarch.c @@ -1038,7 +1038,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ "-march=armv8.2-a -mtune=cortex-a72" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" From 6a14b34c20f73c731a72ae05ca0776cf79d8a023 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Mar 2020 14:33:16 +0100 Subject: [PATCH 076/593] Avoid calling DIRECT codepath in DYNAMIC_ARCH on non-SKX --- interface/gemm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/interface/gemm.c b/interface/gemm.c index 97e71bc85..8a1d50f4e 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -272,6 +272,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; #if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#ifdef DYNAMIC_ARCH + if (gotoblas == &gotoblas_SKYLAKEX) +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); return; From 50f4fb2fbdc8fad3d7e2e376fbd8c6c80ec073c2 Mon Sep 17 00:00:00 2001 From: shengyang Date: Sat, 21 Mar 2020 15:58:21 +0800 Subject: [PATCH 077/593] add ctest for drotm and modified ctest for drot. make sure that test cases cover all code path when kernel uses looping unrolling. 
--- ctest/c_dblas1.c | 7 + ctest/c_dblat1.f | 364 +++++++++++++++++++++++++---------------------- 2 files changed, 201 insertions(+), 170 deletions(-) diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c index a288154c2..e49ae6007 100644 --- a/ctest/c_dblas1.c +++ b/ctest/c_dblas1.c @@ -53,6 +53,13 @@ void F77_drot( const int *N, double *X, const int *incX, double *Y, return; } +void F77_drotm(const int *N, double *X, const int *incX, double *Y, + const int *incY, const double *dparam) +{ + cblas_drotm(*N, X, *incX, Y, *incY, dparam); + return; +} + void F77_dscal(const int *N, const double *alpha, double *X, const int *incX) { diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index 4a71b4dcf..0139ede63 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -19,7 +19,7 @@ DATA SFAC/9.765625D-4/ * .. Executable Statements .. WRITE (NOUT,99999) - DO 20 IC = 1, 10 + DO 20 IC = 1, 11 ICASE = IC CALL HEADER * @@ -40,7 +40,7 @@ ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + ICASE.EQ.6) THEN CALL CHECK2(SFAC) - ELSE IF (ICASE.EQ.4) THEN + ELSE IF (ICASE.EQ.4 .OR. ICASE.EQ.11) THEN CALL CHECK3(SFAC) END IF * -- Print @@ -59,7 +59,7 @@ INTEGER ICASE, INCX, INCY, MODE, N LOGICAL PASS * .. Local Arrays .. - CHARACTER*15 L(10) + CHARACTER*15 L(11) * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. @@ -73,6 +73,7 @@ DATA L(8)/'CBLAS_DASUM '/ DATA L(9)/'CBLAS_DSCAL '/ DATA L(10)/'CBLAS_IDAMAX'/ + DATA L(11)/'CBLAS_DROTM'/ * .. Executable Statements .. WRITE (NOUT,99999) ICASE, L(ICASE) RETURN @@ -400,199 +401,81 @@ LOGICAL PASS * .. Local Scalars .. DOUBLE PRECISION SC, SS - INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY + INTEGER I, KI, KN, KSIZE, LEN * .. Local Arrays .. 
- DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), - + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), - + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), - + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), - + SY(7) - INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), - + MWPINY(11), MWPN(11), NS(4) + DOUBLE PRECISION DX(10), DY(10), SSIZE2(10,2), STX(10), + + STY(10), SX(10), SY(10), + + PARAM(5, 4), DPARAM(5) + INTEGER INCXS(7), INCYS(7), NS(5) * .. External Subroutines .. - EXTERNAL STEST,DROTTEST + EXTERNAL STEST, DROTTEST, DROT * .. Intrinsic Functions .. - INTRINSIC ABS, MIN + INTRINSIC MIN * .. Common blocks .. COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS * .. Data statements .. - DATA INCXS/1, 2, -2, -1/ - DATA INCYS/1, -2, 1, -2/ - DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ - DATA NS/0, 1, 2, 4/ - DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, - + -0.4D0/ - DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, - + 0.8D0/ + DATA INCXS/1, 1, 2, 2, -2, -1, -2/ + DATA INCYS/1, 2, 2, -2, 1, -2, -2/ + DATA NS/0, 1, 2, 4, 5/ + DATA DX/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0, 0.7D0, 0.5D0, 0.2D0/ + DATA DY/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0, -0.5D0, 0.1D0, -0.3D0/ DATA SC, SS/0.8D0, 0.6D0/ - DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, - + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, - + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, - + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, - + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, - + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 
0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, - + 0.0D0, 0.0D0, 0.0D0/ - DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, - + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, - + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, - + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, - + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, - + -0.18D0, 0.2D0, 0.16D0/ + DATA LEN/10/ + DATA PARAM/-2.0D0, 1.0D0, 0.0D0, 0.0D0, 1.0D0, + + -1.0D0, 0.2D0, 0.3D0, 0.4D0, 0.5D0, + + 0.0D0, 1.0D0, 0.3D0, 0.4D0, 1.0D0, + + 1.0D0, 0.2D0, -1.0D0, 1.0D0, 0.5D0/ DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, - + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 1.17D0, 1.17D0, + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, - + 1.17D0, 1.17D0, 1.17D0/ + + 1.17D0, 1.17D0/ * .. Executable Statements .. * - DO 60 KI = 1, 4 + DO 60 KI = 1, 7 INCX = INCXS(KI) INCY = INCYS(KI) - MX = ABS(INCX) - MY = ABS(INCY) * - DO 40 KN = 1, 4 + DO 40 KN = 1, 5 N = NS(KN) KSIZE = MIN(2,KN) - LENX = LENS(KN,MX) - LENY = LENS(KN,MY) * IF (ICASE.EQ.4) THEN * .. DROTTEST .. 
- DO 20 I = 1, 7 - SX(I) = DX1(I) - SY(I) = DY1(I) - STX(I) = DT9X(I,KN,KI) - STY(I) = DT9Y(I,KN,KI) + DO 20 I = 1, 10 + SX(I) = DX(I) + SY(I) = DY(I) + STX(I) = DX(I) + STY(I) = DY(I) 20 CONTINUE CALL DROTTEST(N,SX,INCX,SY,INCY,SC,SS) - CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) - CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) - ELSE + CALL DROT(N,STX,INCX,STY,INCY,SC,SS) + CALL STEST(LEN,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LEN,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.11) THEN +* .. DROTMTEST .. + DO 90 I = 1, 10 + SX(I) = DX(I) + SY(I) = DY(I) + STX(I) = DX(I) + STY(I) = DY(I) + 90 CONTINUE + DO 70 I = 1, 4 + DO 80 K = 1, 5 + DPARAM(K) = PARAM(K,I) + 80 CONTINUE + CALL DROTMTEST(N,SX,INCX,SY,INCY,DPARAM) + CALL DROTM(N,STX,INCX,STY,INCY,DPARAM) + CALL STEST(LEN,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LEN,SY,STY,SSIZE2(1,KSIZE),SFAC) + 70 CONTINUE + ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' STOP END IF 40 CONTINUE 60 CONTINUE -* - MWPC(1) = 1 - DO 80 I = 2, 11 - MWPC(I) = 0 - 80 CONTINUE - MWPS(1) = 0.0 - DO 100 I = 2, 6 - MWPS(I) = 1.0 - 100 CONTINUE - DO 120 I = 7, 11 - MWPS(I) = -1.0 - 120 CONTINUE - MWPINX(1) = 1 - MWPINX(2) = 1 - MWPINX(3) = 1 - MWPINX(4) = -1 - MWPINX(5) = 1 - MWPINX(6) = -1 - MWPINX(7) = 1 - MWPINX(8) = 1 - MWPINX(9) = -1 - MWPINX(10) = 1 - MWPINX(11) = -1 - MWPINY(1) = 1 - MWPINY(2) = 1 - MWPINY(3) = -1 - MWPINY(4) = -1 - MWPINY(5) = 2 - MWPINY(6) = 1 - MWPINY(7) = 1 - MWPINY(8) = -1 - MWPINY(9) = -1 - MWPINY(10) = 2 - MWPINY(11) = 1 - DO 140 I = 1, 11 - MWPN(I) = 5 - 140 CONTINUE - MWPN(5) = 3 - MWPN(10) = 3 - DO 160 I = 1, 5 - MWPX(I) = I - MWPY(I) = I - MWPTX(1,I) = I - MWPTY(1,I) = I - MWPTX(2,I) = I - MWPTY(2,I) = -I - MWPTX(3,I) = 6 - I - MWPTY(3,I) = I - 6 - MWPTX(4,I) = I - MWPTY(4,I) = -I - MWPTX(6,I) = 6 - I - MWPTY(6,I) = I - 6 - MWPTX(7,I) = -I - MWPTY(7,I) = I - MWPTX(8,I) = I - 6 - MWPTY(8,I) = 6 - I - MWPTX(9,I) = -I - MWPTY(9,I) = I - MWPTX(11,I) = I - 6 - MWPTY(11,I) = 6 - I - 160 CONTINUE - 
MWPTX(5,1) = 1 - MWPTX(5,2) = 3 - MWPTX(5,3) = 5 - MWPTX(5,4) = 4 - MWPTX(5,5) = 5 - MWPTY(5,1) = -1 - MWPTY(5,2) = 2 - MWPTY(5,3) = -2 - MWPTY(5,4) = 4 - MWPTY(5,5) = -3 - MWPTX(10,1) = -1 - MWPTX(10,2) = -3 - MWPTX(10,3) = -5 - MWPTX(10,4) = 4 - MWPTX(10,5) = 5 - MWPTY(10,1) = 1 - MWPTY(10,2) = 2 - MWPTY(10,3) = 2 - MWPTY(10,4) = 4 - MWPTY(10,5) = 3 - DO 200 I = 1, 11 - INCX = MWPINX(I) - INCY = MWPINY(I) - DO 180 K = 1, 5 - COPYX(K) = MWPX(K) - COPYY(K) = MWPY(K) - MWPSTX(K) = MWPTX(I,K) - MWPSTY(K) = MWPTY(I,K) - 180 CONTINUE - CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) - CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) - CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) - 200 CONTINUE RETURN END SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) @@ -726,3 +609,144 @@ + /1X) 99997 FORMAT (1X,I4,I3,3I5,2I36,I12) END + SUBROUTINE DROT(N,DX,INCX,DY,INCY,C,S) +* .. Scalar Arguments .. + DOUBLE PRECISION C,S + INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. + DOUBLE PRECISION DX(*),DY(*) +* .. +* applies a plane rotation. +* jack dongarra, linpack, 3/11/78. +* modified 12/3/93, array(1) declarations changed to array(*) +* +* .. Local Scalars .. + DOUBLE PRECISION DTEMP + INTEGER I,IX,IY +* .. + IF (N.LE.0) RETURN + IF (INCX.EQ.1 .AND. INCY.EQ.1) GO TO 20 + IX = 1 + IY = 1 + IF (INCX.LT.0) IX = (-N+1)*INCX + 1 + IF (INCY.LT.0) IY = (-N+1)*INCY + 1 + DO 10 I = 1,N + DTEMP = C*DX(IX) + S*DY(IY) + DY(IY) = C*DY(IY) - S*DX(IX) + DX(IX) = DTEMP + IX = IX + INCX + IY = IY + INCY + 10 CONTINUE + RETURN + 20 DO 30 I = 1,N + DTEMP = C*DX(I) + S*DY(I) + DY(I) = C*DY(I) - S*DX(I) + DX(I) = DTEMP + 30 CONTINUE + RETURN + END + SUBROUTINE drotm(N,DX,INCX,DY,INCY,DPARAM) +* +* -- Reference BLAS level1 routine (version 3.8.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* November 2017 +* +* .. Scalar Arguments .. + INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. 
+ DOUBLE PRECISION DPARAM(5),DX(*),DY(*) +* .. +* +* ===================================================================== +* +* .. Local Scalars .. + DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO + INTEGER I,KX,KY,NSTEPS +* .. +* .. Data statements .. + DATA zero,two/0.d0,2.d0/ +* .. +* + dflag = dparam(1) + IF (n.LE.0 .OR. (dflag+two.EQ.zero)) RETURN + IF (incx.EQ.incy.AND.incx.GT.0) THEN +* + nsteps = n*incx + IF (dflag.LT.zero) THEN + dh11 = dparam(2) + dh12 = dparam(4) + dh21 = dparam(3) + dh22 = dparam(5) + DO i = 1,nsteps,incx + w = dx(i) + z = dy(i) + dx(i) = w*dh11 + z*dh12 + dy(i) = w*dh21 + z*dh22 + END DO + ELSE IF (dflag.EQ.zero) THEN + dh12 = dparam(4) + dh21 = dparam(3) + DO i = 1,nsteps,incx + w = dx(i) + z = dy(i) + dx(i) = w + z*dh12 + dy(i) = w*dh21 + z + END DO + ELSE + dh11 = dparam(2) + dh22 = dparam(5) + DO i = 1,nsteps,incx + w = dx(i) + z = dy(i) + dx(i) = w*dh11 + z + dy(i) = -w + dh22*z + END DO + END IF + ELSE + kx = 1 + ky = 1 + IF (incx.LT.0) kx = 1 + (1-n)*incx + IF (incy.LT.0) ky = 1 + (1-n)*incy +* + IF (dflag.LT.zero) THEN + dh11 = dparam(2) + dh12 = dparam(4) + dh21 = dparam(3) + dh22 = dparam(5) + DO i = 1,n + w = dx(kx) + z = dy(ky) + dx(kx) = w*dh11 + z*dh12 + dy(ky) = w*dh21 + z*dh22 + kx = kx + incx + ky = ky + incy + END DO + ELSE IF (dflag.EQ.zero) THEN + dh12 = dparam(4) + dh21 = dparam(3) + DO i = 1,n + w = dx(kx) + z = dy(ky) + dx(kx) = w + z*dh12 + dy(ky) = w*dh21 + z + kx = kx + incx + ky = ky + incy + END DO + ELSE + dh11 = dparam(2) + dh22 = dparam(5) + DO i = 1,n + w = dx(kx) + z = dy(ky) + dx(kx) = w*dh11 + z + dy(ky) = -w + dh22*z + kx = kx + incx + ky = ky + incy + END DO + END IF + END IF + RETURN + END From fe47dc8673fd9e6933128fa6f4218fd0327d3522 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Mar 2020 19:35:51 +0100 Subject: [PATCH 078/593] Add message highlighting minimum target choice at end of DYNAMIC_ARCH builds related to #2526 --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) 
diff --git a/Makefile b/Makefile index a87ffdf2a..0b920cc9f 100644 --- a/Makefile +++ b/Makefile @@ -84,6 +84,10 @@ else @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" endif +ifeq ($(DYNAMIC_ARCH), 1) + @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)" +endif + ifeq ($(USE_OPENMP), 1) @echo @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " From 8229c163b7c0ce9ead73264b4f36487a91a3889d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 26 Mar 2020 21:12:56 +0100 Subject: [PATCH 079/593] Use runtime check for AVX512 (sgemm_direct) capability when using DYNAMIC_ARCH --- interface/gemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemm.c b/interface/gemm.c index 8a1d50f4e..0b18d9a8c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -273,7 +273,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH - if (gotoblas == &gotoblas_SKYLAKEX) + if (support_avx512() ) #endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); From 79fd006c58157351e847d9afa9efb52e76f00e43 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 26 Mar 2020 21:25:39 +0100 Subject: [PATCH 080/593] Expose the support_avx512 function provided in dynamic.c --- common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common.h b/common.h index a9fe8d911..762968e6f 100644 --- a/common.h +++ b/common.h @@ -657,6 +657,8 @@ void gotoblas_dynamic_init(void); void gotoblas_dynamic_quit(void); void gotoblas_profile_init(void); void gotoblas_profile_quit(void); + +int support_avx512(void); #ifdef USE_OPENMP From 07cdd5d05c824fbeb5fc27705d96580190a9e9e6 
Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 31 Mar 2020 00:21:02 +0200 Subject: [PATCH 081/593] Fix zero initialization for beta=0 case use immediate initialization instead of multiplication in case register content is a NaN --- kernel/arm64/dgemm_beta.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S index 20011c343..7d21525c2 100644 --- a/kernel/arm64/dgemm_beta.S +++ b/kernel/arm64/dgemm_beta.S @@ -81,14 +81,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro INIT_ZERO - fmul v0.2d, v0.2d, betaV0 - fmul v1.2d, v1.2d, betaV0 - fmul v2.2d, v2.2d, betaV0 - fmul v3.2d, v3.2d, betaV0 - fmul v4.2d, v4.2d, betaV0 - fmul v5.2d, v5.2d, betaV0 - fmul v6.2d, v6.2d, betaV0 - fmul v7.2d, v7.2d, betaV0 + movi v0.2d, #0000000000000000 + movi v1.2d, #0000000000000000 + movi v2.2d, #0000000000000000 + movi v3.2d, #0000000000000000 + movi v4.2d, #0000000000000000 + movi v5.2d, #0000000000000000 + movi v6.2d, #0000000000000000 + movi v7.2d, #0000000000000000 .endm /************************************************************************************** From 144be81ca1f268a616972ed0c42de59b07469fbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 31 Mar 2020 16:53:56 +0200 Subject: [PATCH 082/593] fix initialization to zero in the NEON SGEMM_BETA kernel as well --- kernel/arm64/sgemm_beta.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S index a3b97e231..574485bc4 100755 --- a/kernel/arm64/sgemm_beta.S +++ b/kernel/arm64/sgemm_beta.S @@ -81,14 +81,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro INIT_ZERO - fmul v0.4s, v0.4s, betaV0 - fmul v1.4s, v1.4s, betaV0 - fmul v2.4s, v2.4s, betaV0 - fmul v3.4s, v3.4s, betaV0 - fmul v4.4s, v4.4s, betaV0 - fmul v5.4s, v5.4s, betaV0 - fmul v6.4s, v6.4s, betaV0 - fmul v7.4s, v7.4s, betaV0 + movi v0.4s, #0x0 + movi v1.4s, #0x0 + movi v2.4s, #0x0 + movi v3.4s, #0x0 + movi v4.4s, #0x0 + movi v5.4s, #0x0 + movi v6.4s, #0x0 + movi v7.4s, #0x0 .endm /************************************************************************************** From a05243d0f2bbe0c753b0bcb2c6be8899fbf17808 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Apr 2020 15:38:07 +0200 Subject: [PATCH 083/593] ifort and pgfort need "recursive" for compiling LAPACK as well as shown in Reference-LAPACK issue 401 (their PR 403) --- cmake/fc.cmake | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index ff26ac06c..cc330ae2c 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -76,14 +76,12 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") endif () endif () -if (${F_COMPILER} STREQUAL "IFORT") +if (${F_COMPILER} STREQUAL "INTEL") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") - if (MSVC) - set(FCOMMON_OPT "${FCOMMON_OPT} -names:lowercase -assume:underscore") - endif () if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -recursive") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") endif () @@ -123,6 +121,7 @@ if (${F_COMPILER} STREQUAL "PGI") else () set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7") endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -mp") endif () From e13b6773ee622fe769ddda52c999686c0311e0ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Apr 2020 15:39:16 +0200 Subject: [PATCH 084/593] ifort and pgfort need "recursive" for safe compilation of LAPACK as well --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system 
index 11cb5b3a0..2998c0e6a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -850,6 +850,7 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif +FCOMMON_OPT += -recursive ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif @@ -893,6 +894,7 @@ FCOMMON_OPT += -tp p7-64 else FCOMMON_OPT += -tp p7 endif +FCOMMON_OPT += -Mrecursive ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif From 806f89166e97f2b0a3150f09f90d10a853a4ea20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 2 Apr 2020 10:30:37 +0200 Subject: [PATCH 085/593] Make ARMV7 compile with xcode and add a CI job for it (#2537) * Add an ARMV7 iOS build on Travis * thread_local appears to be unavailable on ARMV7 iOS * Add no-thumb option for ARMV7 IOS build to get it to accept DMB ISH * Make local labels in macros of nrm2_vfpv3.S compatible with the xcode assembler --- .travis.yml | 6 ++++ driver/level2/gemv_thread.c | 4 +-- kernel/arm/nrm2_vfpv3.S | 60 ++++++++++++++++++------------------- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0f20aef5c..2d82f8812 100644 --- a/.travis.yml +++ b/.travis.yml @@ -180,6 +180,12 @@ matrix: - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" + - <<: *test-macos + osx_image: xcode10.1 + env: + - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" # whitelist branches: only: diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index d57740314..0d8c6b005 100644 --- a/driver/level2/gemv_thread.c 
+++ b/driver/level2/gemv_thread.c @@ -72,9 +72,9 @@ defined __BORLANDC__ ) # define thread_local __declspec(thread) /* note that ICC (linux) and Clang are covered by __GNUC__ */ -# elif defined __GNUC__ || \ +# elif (defined __GNUC__ || \ defined __SUNPRO_C || \ - defined __xlC__ + defined __xlC__) && !defined(__APPLE__) # define thread_local __thread # else # define UNSAFE diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 7be1e977e..82ae5e8d4 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -61,20 +61,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vldmia.f64 X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ + beq 1f /* KERNEL_F1_NEXT_\@ */ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_F1_NEXT_\@ + bge 1f /* KERNEL_F1_NEXT_\@ */ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x -KERNEL_F1_NEXT_\@: +1: /* KERNEL_F1_NEXT_\@: */ .endm @@ -124,20 +124,20 @@ KERNEL_S1_NEXT: vldmia.f32 X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ + beq 1f /* KERNEL_F1_NEXT_\@ */ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_F1_NEXT_\@ + bge 1f /* KERNEL_F1_NEXT_\@ */ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x -KERNEL_F1_NEXT_\@: +1: /* KERNEL_F1_NEXT_\@: */ .endm @@ -195,37 +195,37 @@ KERNEL_S1_NEXT: vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ + beq 1f /* KERNEL_F1_NEXT_\@ */ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_F1_NEXT_\@ + bge 1f /* KERNEL_F1_NEXT_\@ */ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x -KERNEL_F1_NEXT_\@: +1: /* KERNEL_F1_NEXT_\@: */ vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_END_\@ + beq 2f /* KERNEL_F1_END_\@ */ vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_F1_END_\@ + bge 2f /* KERNEL_F1_END_\@ */ vdiv.f64 d2 , d0, d5 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d5 // scale = x -KERNEL_F1_END_\@: +2: /* KERNEL_F1_END_\@: */ .endm @@ -253,37 +253,37 @@ KERNEL_F1_END_\@: vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT_\@ + beq 1f /* KERNEL_S1_NEXT_\@ */ vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_S1_NEXT_\@ + bge 1f /* KERNEL_S1_NEXT_\@ */ vdiv.f64 d2 , d0, d4 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d4 // scale = x -KERNEL_S1_NEXT_\@: +1: /* KERNEL_S1_NEXT_\@: */ vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_END_\@ + beq 2f /* KERNEL_S1_END_\@ */ vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_S1_END_\@ + bge 2f /* KERNEL_S1_END_\@ */ vdiv.f64 d2 , d0, d5 // scale / x vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f64 d0 , d5 // scale = x -KERNEL_S1_END_\@: +2: /* KERNEL_S1_END_\@: */ add X, X, INC_X @@ -298,37 +298,37 @@ KERNEL_S1_END_\@: vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ + beq 1f /* KERNEL_F1_NEXT_\@ */ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_F1_NEXT_\@ + bge 1f /* KERNEL_F1_NEXT_\@ */ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x -KERNEL_F1_NEXT_\@: +1: /* KERNEL_F1_NEXT_\@: */ vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_END_\@ + beq 2f /* KERNEL_F1_END_\@ */ vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_F1_END_\@ + bge 2f /* KERNEL_F1_END_\@ */ vdiv.f32 s2 , s0, s5 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s5 // scale = x -KERNEL_F1_END_\@: +2: /* KERNEL_F1_END_\@: */ .endm @@ -354,37 +354,37 @@ KERNEL_F1_END_\@: vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT_\@ + beq 1f /* KERNEL_S1_NEXT_\@ */ vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_S1_NEXT_\@ + bge 1f /* KERNEL_S1_NEXT_\@ */ vdiv.f32 s2 , s0, s4 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s4 // scale = x -KERNEL_S1_NEXT_\@: +1: /* KERNEL_S1_NEXT_\@: */ vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_END_\@ + beq 2f /* KERNEL_S1_END_\@ */ vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) - bge KERNEL_S1_END_\@ + bge 2f /* KERNEL_S1_END_\@ */ vdiv.f32 s2 , s0, s5 // scale / x vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) vmov.f32 s0 , s5 // scale = x -KERNEL_S1_END_\@: +2: /* KERNEL_S1_END_\@: */ add X, X, INC_X From 41e802443a3ceb72f7c031a8b77fa45633c7885b Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Fri, 3 Apr 2020 06:20:42 +0200 Subject: [PATCH 086/593] libname: treat FreeBSD and DragonFly like linux and sunos There is no difference in the way libnames are handle between FreeBSD and linux or sunos. FreeBSD and DragonFly prefers having sonames as well --- Makefile | 4 ++-- Makefile.install | 4 ++-- exports/Makefile | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 0b920cc9f..18320e6a3 100644 --- a/Makefile +++ b/Makefile @@ -112,12 +112,12 @@ endif shared : ifneq ($(NO_SHARED), 1) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif diff --git a/Makefile.install b/Makefile.install index 2dc32c3d9..dad869f4c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -68,14 +68,14 @@ endif #for install shared library ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @install -pm755 $(LIBSONAME) 
"$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so diff --git a/exports/Makefile b/exports/Makefile index d32e449df..60291b1ff 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -126,7 +126,7 @@ endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) so : ../$(LIBSONAME) @@ -171,7 +171,7 @@ endif endif #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) so : ../$(LIBSONAME) From 7972beb3754409db9af3c22dbbe7bd8075c09f6e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Apr 2020 15:59:18 +0200 Subject: [PATCH 087/593] Add IBM Z to Travis configuration (#42) * Add IBM Z to Travis configuration --- .travis.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.travis.yml b/.travis.yml index 2d82f8812..c875572b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,16 @@ matrix: - TARGET_BOX=PPC64LE_LINUX - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu + os: linux + arch: s390x + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=IBMZ_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 4ae6d1a01b612234c94fc3a76ea705dc340e1d52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Apr 2020 16:02:11 +0200 Subject: [PATCH 088/593] Add a Z13 
build to the Travis configuration (#2542) * Add IBM Z to Travis configuration --- .travis.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.travis.yml b/.travis.yml index 2d82f8812..c875572b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,16 @@ matrix: - TARGET_BOX=PPC64LE_LINUX - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu + os: linux + arch: s390x + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=IBMZ_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From a56c9ec52a25ba0d72b6cc01f6312828113efec7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Apr 2020 22:45:01 +0200 Subject: [PATCH 089/593] Revert "Add IBM Z to Travis configuration (#42)" This reverts commit 7972beb3754409db9af3c22dbbe7bd8075c09f6e. --- .travis.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index c875572b2..2d82f8812 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,16 +34,6 @@ matrix: - TARGET_BOX=PPC64LE_LINUX - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-ubuntu - os: linux - arch: s390x - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" - env: - # for matrix annotation only - - TARGET_BOX=IBMZ_LINUX - - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 69f277f8eea068a7283543620d9f4c2402dbaa8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 Apr 2020 11:04:51 +0200 Subject: [PATCH 090/593] Add another memory barrier for ARM and a multicore test run on ThunderX to help detect such issues (#2544) * Add another memory barrier in memory.c to prevent races in memory slot allocation * Add an all-core test on Drone.io's ThunderX platform and modify dgemm_tester to use all 96 cores --- .drone.yml | 25 +++++++++++++++++++++++++ cpp_thread_test/dgemm_thread_safety.cpp | 2 +- driver/others/memory.c | 2 +- 3 files changed, 27 
insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 696c5a99d..3bbd8fc88 100644 --- a/.drone.yml +++ b/.drone.yml @@ -141,3 +141,28 @@ steps: - cmake $CMAKE_FLAGS .. - make -j - ctest -V + +--- +kind: pipeline +name: arm64_native_test + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:19.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index cecf794fa..1b6ad3826 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -12,7 +12,7 @@ void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMat int main(int argc, char* argv[]){ blasint randomMatSize = 1024; //dimension of the random square matrices used - uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested + uint32_t numConcurrentThreads = 96; //number of concurrent calls of the functions being tested uint32_t numTestRounds = 16; //number of testing rounds before success exit if (argc > 4){ diff --git a/driver/others/memory.c b/driver/others/memory.c index 62a5a0214..1af547fb2 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2740,7 +2740,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf(" Position -> %d\n", position); #endif - +WMB; memory[position].used = 1; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); From 7b4773b24d83ef81de6291da3d350ab98045d4a0 Mon Sep 17 00:00:00 2001 From: Sharvil Nanavati Date: Wed, 8 Apr 2020 12:47:41 -0700 Subject: [PATCH 
091/593] Add API to set thread affinity on Linux. Issue: #2545 --- cblas.h | 5 +++++ driver/others/blas_server.c | 18 ++++++++++++++++++ openblas_config_template.h | 5 +++++ 3 files changed, 28 insertions(+) diff --git a/cblas.h b/cblas.h index 1a87074d6..4bc5588d8 100644 --- a/cblas.h +++ b/cblas.h @@ -25,6 +25,11 @@ char* openblas_get_config(void); /*Get the CPU corename on runtime.*/ char* openblas_get_corename(void); +#ifdef OPENBLAS_OS_LINUX +/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */ +int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set); +#endif + /* Get the parallelization type which is used by OpenBLAS */ int openblas_get_parallel(void); /* OpenBLAS is compiled for sequential use */ diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index aa0644845..f13b83dd4 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -72,6 +72,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU) #include +#include #include #include #include @@ -279,6 +280,23 @@ int get_node(void); static int increased_threads = 0; +#ifdef OS_LINUX +int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { + const int active_threads = openblas_get_num_threads(); + + if (thread_idx < 0 || thread_idx >= active_threads) { + errno = EINVAL; + return -1; + } + + pthread_t thread = (thread_idx == active_threads - 1) + ? 
pthread_self() + : blas_threads[thread_idx]; + + return pthread_setaffinity_np(thread, cpusetsize, cpu_set); +} +#endif + static void* blas_thread_server(void *arg){ /* Thread identifier */ diff --git a/openblas_config_template.h b/openblas_config_template.h index 52dd49da2..49aea1cab 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -91,3 +91,8 @@ typedef int blasint; #define openblas_complex_xdouble_real(z) ((z).real) #define openblas_complex_xdouble_imag(z) ((z).imag) #endif + +/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ +#ifdef OPENBLAS_OS_LINUX +#include +#endif From 8d07cf9b67152486bdbe456e97b3a7b1377fe63b Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 9 Apr 2020 19:25:13 +0800 Subject: [PATCH 092/593] Fix compilation problem on loongson platform Using "make TARGET=GENERIC" on loongson platform will get the following error messages: "make[1]: *** No rule to make target 'sgemm_incopy.o', needed by 'libs'" Add kernel/mips64/KERNEL.generic to slove the problem. 
--- kernel/mips64/KERNEL.generic | 160 +++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 kernel/mips64/KERNEL.generic diff --git a/kernel/mips64/KERNEL.generic b/kernel/mips64/KERNEL.generic new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips64/KERNEL.generic @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c 
+DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From 66f89c0aaf5f3b179baba7c974afe14291c606c5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 10 Apr 2020 22:06:44 +0200 Subject: [PATCH 093/593] Match thread count to machine capability --- cpp_thread_test/dgemm_thread_safety.cpp | 6 +++++- cpp_thread_test/dgemv_thread_safety.cpp | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index cecf794fa..1c5287524 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -12,9 +12,13 @@ void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMat int main(int argc, char* argv[]){ blasint randomMatSize = 1024; //dimension of the random square matrices used - uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested + uint32_t numConcurrentThreads = 96; //number of concurrent calls of the functions being tested uint32_t 
numTestRounds = 16; //number of testing rounds before success exit + uint32_t maxHwThreads = omp_get_max_threads(); + if (maxHwThreads < 96) + numConcurrentThreads = maxHwThreads; + if (argc > 4){ std::cout<<"ERROR: too many arguments for thread safety tester"< 4){ std::cout<<"ERROR: too many arguments for thread safety tester"< Date: Sun, 12 Apr 2020 19:44:48 +0200 Subject: [PATCH 094/593] Increase default BUFFER_SIZE on ARM, ZARCH and newer x86_64, add GEMM_R for POWER8/9 As shown in #2538, default buffersizes on some platforms were smaller than required in memory.c and the requirement could never be fulfilled for a calculated GEMM_R on PPC given the fomula used --- common_arm.h | 2 +- common_arm64.h | 7 ++++++- common_x86_64.h | 6 ++++++ common_zarch.h | 6 +----- param.h | 29 ++++++++++++++++++++--------- 5 files changed, 34 insertions(+), 16 deletions(-) diff --git a/common_arm.h b/common_arm.h index 27fa76b76..8411e6dd6 100644 --- a/common_arm.h +++ b/common_arm.h @@ -121,7 +121,7 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) -#define BUFFER_SIZE (16 << 20) +#define BUFFER_SIZE (32 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_arm64.h b/common_arm64.h index a928dbe7b..99e0cee57 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -141,12 +141,17 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) +#ifndef BUFFERSIZE #if defined(CORTEXA57) #define BUFFER_SIZE (20 << 20) +#elif defined(TSV110) || defined(EMAG8180) +#define BUFFER_SIZE (32 << 20) #else #define BUFFER_SIZE (16 << 20) #endif - +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_x86_64.h b/common_x86_64.h index fe5539abe..958e9caed 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -226,7 +226,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define HUGE_PAGESIZE ( 2 << 20) #ifndef BUFFERSIZE +#if defined(SKYLAKEX) +#define 
BUFFER_SIZE (32 << 21) +#elif defined(HASWELL) || defined(ZEN) +#define BUFFER_SIZE (32 << 22) +#else #define BUFFER_SIZE (32 << 20) +#endif #else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif diff --git a/common_zarch.h b/common_zarch.h index e105574e0..b5503a7a4 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -123,11 +123,7 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) -#if defined(CORTEXA57) -#define BUFFER_SIZE (20 << 20) -#else -#define BUFFER_SIZE (16 << 20) -#endif +#define BUFFER_SIZE (32 << 22) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/param.h b/param.h index 410308524..d6cbe544a 100644 --- a/param.h +++ b/param.h @@ -2229,15 +2229,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 -#define DGEMM_DEFAULT_P 640 -#define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 320 - -#define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 720 -#define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_P 1280UL +#define DGEMM_DEFAULT_P 640UL +#define CGEMM_DEFAULT_P 640UL +#define ZGEMM_DEFAULT_P 320UL + +#define SGEMM_DEFAULT_Q 640UL +#define DGEMM_DEFAULT_Q 720UL +#define CGEMM_DEFAULT_Q 640UL +#define ZGEMM_DEFAULT_Q 640UL + +#if 0 +#define SGEMM_DEFAULT_R SGEMM_DEFAULT_P +#define DGEMM_DEFAULT_R DGEMM_DEFAULT_P +#define CGEMM_DEFAULT_R CGEMM_DEFAULT_P +#define ZGEMM_DEFAULT_R ZGEMM_DEFAULT_P +#endif +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 512 #define SYMV_P 8 From 2a28448a9650363ea9cd0cd237a4cbc7ff2f4425 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Apr 2020 19:45:36 +0200 Subject: [PATCH 095/593] Add safeguards for sufficient BUFFER_SIZE --- driver/others/memory.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c 
b/driver/others/memory.c index 1af547fb2..a49fb1fa1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -87,6 +87,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif +/* Memory buffer must fit two matrix subblocks of maximal size */ +#define XSTR(x) STR(x) +#define STR(x) #x +#if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \ + BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \ + BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2) +#warning BUFFER_SIZE is too small for P, Q, and R of SGEMM - large calculations may crash ! +#endif +#if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \ + BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \ + BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2) +#warning BUFFER_SIZE is too small for P, Q, and R of DGEMM - large calculations may crash ! +#endif +#if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \ + BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \ + BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2) +#warning BUFFER_SIZE is too small for P, Q, and R of CGEMM - large calculations may crash ! +#endif +#if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \ + BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \ + BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2) +#warning BUFFER_SIZE is too small for P, Q, and R of ZGEMM - large calculations may crash ! 
+#endif + #if defined(COMPILE_TLS) #include @@ -2740,7 +2764,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf(" Position -> %d\n", position); #endif -WMB; + memory[position].used = 1; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); From e9bfa2291a15974f94e2d322b860d47f40bcd1f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Apr 2020 19:47:02 +0200 Subject: [PATCH 096/593] Fix parameter overflow --- kernel/common_param.h | 1274 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1274 insertions(+) create mode 100644 kernel/common_param.h diff --git a/kernel/common_param.h b/kernel/common_param.h new file mode 100644 index 000000000..eab14b0a6 --- /dev/null +++ b/kernel/common_param.h @@ -0,0 +1,1274 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_PARAM_H +#define COMMON_PARAM_H + +#ifndef ASSEMBLER + +#ifdef DYNAMIC_ARCH + +typedef struct { + int dtb_entries; + int offsetA, offsetB, align; + + int sgemm_p, sgemm_q, sgemm_r; + int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; + + int exclusive_cache; + + float (*samax_k) (BLASLONG, float *, BLASLONG); + float (*samin_k) (BLASLONG, float *, BLASLONG); + float (*smax_k) (BLASLONG, float *, BLASLONG); + float (*smin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); + + float (*snrm2_k) (BLASLONG, float *, BLASLONG); + float (*sasum_k) (BLASLONG, float *, BLASLONG); + float (*ssum_k) (BLASLONG, float *, BLASLONG); + int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*srot_k) 
(BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); + int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*strsm_iunucopy)(BLASLONG, BLASLONG, float 
*, BLASLONG, BLASLONG, float *); + int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float 
*); + int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + + int dgemm_p, dgemm_q, dgemm_r; + int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; + + double (*damax_k) (BLASLONG, double *, BLASLONG); + double (*damin_k) (BLASLONG, double *, BLASLONG); + double (*dmax_k) (BLASLONG, double *, BLASLONG); + 
double (*dmin_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); + + double (*dnrm2_k) (BLASLONG, double *, BLASLONG); + double (*dasum_k) (BLASLONG, double *, BLASLONG); + double (*dsum_k) (BLASLONG, double *, BLASLONG); + int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + + int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); + int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int 
(*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + + int (*dtrmm_kernel_RN)(BLASLONG, 
BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, 
BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); + +#ifdef EXPRECISION + + int qgemm_p, qgemm_q, qgemm_r; + int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; + + xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); + + xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); + int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + + int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, 
BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int 
(*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + + int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, 
BLASLONG, xdouble *); + int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +#endif + + int cgemm_p, cgemm_q, cgemm_r; + int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; + + float (*camax_k) (BLASLONG, float *, BLASLONG); + float (*camin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); + + float (*cnrm2_k) (BLASLONG, float *, BLASLONG); + float (*casum_k) (BLASLONG, float *, BLASLONG); + float (*csum_k) (BLASLONG, float *, BLASLONG); + int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + 
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, 
float *, BLASLONG, float *); + int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LT)(BLASLONG, 
BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float 
*, BLASLONG, BLASLONG, float *); + int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int 
(*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int cgemm3m_p, cgemm3m_q, cgemm3m_r; + int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; + + int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + + int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, 
float *, BLASLONG, float, float, float *); + int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + + int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + + int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int 
(*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + + int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + + int zgemm_p, zgemm_q, zgemm_r; + int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; + + double (*zamax_k) (BLASLONG, double *, BLASLONG); + double (*zamin_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); + + double (*znrm2_k) (BLASLONG, double *, BLASLONG); + double (*zasum_k) (BLASLONG, double *, BLASLONG); + double (*zsum_k) (BLASLONG, double *, BLASLONG); + int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + + int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, 
BLASLONG); + int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, 
double *); + + int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, 
BLASLONG); + int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + + int 
(*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, 
BLASLONG, double *); + int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int zgemm3m_p, zgemm3m_q, zgemm3m_r; + int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; + + int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + + int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, 
double, double *); + int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + + int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + + int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, 
BLASLONG, double *); + int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + + int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); + +#ifdef EXPRECISION + + int xgemm_p, xgemm_q, xgemm_r; + int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; + + xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); + + xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); + int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + + int 
(*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble 
*, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble 
*, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int 
(*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + + int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, 
BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int xgemm3m_p, xgemm3m_q, xgemm3m_r; + int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; + + int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + + int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int 
(*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, 
xdouble, xdouble *); + int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +#endif + + + void (*init)(void); + + int snum_opt, dnum_opt, qnum_opt; + + int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); + int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); + int (*caxpby_k) (BLASLONG, float, float, 
float*, BLASLONG,float,float, float*, BLASLONG); + int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); + + int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + + int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + + int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + + int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + + int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, 
double, double, double*, BLASLONG, double*, BLASLONG); + + int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + + int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); + + int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); + + int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + + int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + + int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); 
+ + int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); + int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); + int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); + +} gotoblas_t; + +extern gotoblas_t *gotoblas; + +#define DTB_ENTRIES gotoblas -> dtb_entries +#define GEMM_OFFSET_A gotoblas -> offsetA +#define GEMM_OFFSET_B gotoblas -> offsetB +#define GEMM_ALIGN gotoblas -> align + +#define HAVE_EX_L2 gotoblas -> exclusive_cache + +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R gotoblas -> sgemm_r +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn + +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R gotoblas -> dgemm_r +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn + +#define QGEMM_P gotoblas -> qgemm_p +#define QGEMM_Q gotoblas -> qgemm_q +#define QGEMM_R gotoblas -> qgemm_r +#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m +#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n +#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn + +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N 
gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn + +#define ZGEMM_P gotoblas -> zgemm_p +#define ZGEMM_Q gotoblas -> zgemm_q +#define ZGEMM_R gotoblas -> zgemm_r +#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m +#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n +#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn + +#define XGEMM_P gotoblas -> xgemm_p +#define XGEMM_Q gotoblas -> xgemm_q +#define XGEMM_R gotoblas -> xgemm_r +#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m +#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n +#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn + +#define CGEMM3M_P gotoblas -> cgemm3m_p +#define CGEMM3M_Q gotoblas -> cgemm3m_q +#define CGEMM3M_R gotoblas -> cgemm3m_r +#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m +#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n +#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn + +#define ZGEMM3M_P gotoblas -> zgemm3m_p +#define ZGEMM3M_Q gotoblas -> zgemm3m_q +#define ZGEMM3M_R gotoblas -> zgemm3m_r +#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m +#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n +#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn + +#define XGEMM3M_P gotoblas -> xgemm3m_p +#define XGEMM3M_Q gotoblas -> xgemm3m_q +#define XGEMM3M_R gotoblas -> xgemm3m_r +#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m +#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n +#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn + +#else + +#define DTB_ENTRIES DTB_DEFAULT_ENTRIES + +#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A +#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B +#define GEMM_ALIGN GEMM_DEFAULT_ALIGN + +#ifdef HAVE_EXCLUSIVE_CACHE +#define HAVE_EX_L2 1 +#else +#define HAVE_EX_L2 0 +#endif + +#define SGEMM_P SGEMM_DEFAULT_P +#define SGEMM_Q SGEMM_DEFAULT_Q +#define SGEMM_R SGEMM_DEFAULT_R +#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#ifdef SGEMM_DEFAULT_UNROLL_MN 
+#define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN +#else +#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) +#endif + +#define DGEMM_P DGEMM_DEFAULT_P +#define DGEMM_Q DGEMM_DEFAULT_Q +#define DGEMM_R DGEMM_DEFAULT_R +#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#ifdef DGEMM_DEFAULT_UNROLL_MN +#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN +#else +#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) +#endif + +#define QGEMM_P QGEMM_DEFAULT_P +#define QGEMM_Q QGEMM_DEFAULT_Q +#define QGEMM_R QGEMM_DEFAULT_R +#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) + +#define CGEMM_P CGEMM_DEFAULT_P +#define CGEMM_Q CGEMM_DEFAULT_Q +#define CGEMM_R CGEMM_DEFAULT_R +#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M +#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#ifdef CGEMM_DEFAULT_UNROLL_MN +#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN +#else +#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) +#endif + +#define ZGEMM_P ZGEMM_DEFAULT_P +#define ZGEMM_Q ZGEMM_DEFAULT_Q +#define ZGEMM_R ZGEMM_DEFAULT_R +#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M +#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#ifdef ZGEMM_DEFAULT_UNROLL_MN +#define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN +#else +#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) +#endif + +#define XGEMM_P XGEMM_DEFAULT_P +#define XGEMM_Q XGEMM_DEFAULT_Q +#define XGEMM_R XGEMM_DEFAULT_R +#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M +#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N +#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) + +#ifdef CGEMM3M_DEFAULT_UNROLL_N + +#define CGEMM3M_P CGEMM3M_DEFAULT_P +#define CGEMM3M_Q CGEMM3M_DEFAULT_Q +#define CGEMM3M_R CGEMM3M_DEFAULT_R +#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M +#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N +#define CGEMM3M_UNROLL_MN 
MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) + +#else + +#define CGEMM3M_P SGEMM_DEFAULT_P +#define CGEMM3M_Q SGEMM_DEFAULT_Q +#define CGEMM3M_R SGEMM_DEFAULT_R +#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) + +#endif + + +#ifdef ZGEMM3M_DEFAULT_UNROLL_N + +#define ZGEMM3M_P ZGEMM3M_DEFAULT_P +#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q +#define ZGEMM3M_R ZGEMM3M_DEFAULT_R +#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M +#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N +#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) + +#else + +#define ZGEMM3M_P DGEMM_DEFAULT_P +#define ZGEMM3M_Q DGEMM_DEFAULT_Q +#define ZGEMM3M_R DGEMM_DEFAULT_R +#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) + +#endif + +#define XGEMM3M_P QGEMM_DEFAULT_P +#define XGEMM3M_Q QGEMM_DEFAULT_Q +#define XGEMM3M_R QGEMM_DEFAULT_R +#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) + + +#endif +#endif + +#ifndef COMPLEX +#if defined(XDOUBLE) +#define GEMM_P QGEMM_P +#define GEMM_Q QGEMM_Q +#define GEMM_R QGEMM_R +#define GEMM_UNROLL_M QGEMM_UNROLL_M +#define GEMM_UNROLL_N QGEMM_UNROLL_N +#define GEMM_UNROLL_MN QGEMM_UNROLL_MN +#define GEMM_DEFAULT_P QGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R QGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM_P DGEMM_P +#define GEMM_Q DGEMM_Q +#define GEMM_R DGEMM_R +#define GEMM_UNROLL_M DGEMM_UNROLL_M +#define GEMM_UNROLL_N DGEMM_UNROLL_N +#define GEMM_UNROLL_MN DGEMM_UNROLL_MN +#define GEMM_DEFAULT_P DGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q +#define 
GEMM_DEFAULT_R DGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define GEMM_P SGEMM_P +#define GEMM_Q SGEMM_Q +#define GEMM_R SGEMM_R +#define GEMM_UNROLL_M SGEMM_UNROLL_M +#define GEMM_UNROLL_N SGEMM_UNROLL_N +#define GEMM_UNROLL_MN SGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif +#else +#if defined(XDOUBLE) +#define GEMM_P XGEMM_P +#define GEMM_Q XGEMM_Q +#define GEMM_R XGEMM_R +#define GEMM_UNROLL_M XGEMM_UNROLL_M +#define GEMM_UNROLL_N XGEMM_UNROLL_N +#define GEMM_UNROLL_MN XGEMM_UNROLL_MN +#define GEMM_DEFAULT_P XGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R XGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM_P ZGEMM_P +#define GEMM_Q ZGEMM_Q +#define GEMM_R ZGEMM_R +#define GEMM_UNROLL_M ZGEMM_UNROLL_M +#define GEMM_UNROLL_N ZGEMM_UNROLL_N +#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN +#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#else +#define GEMM_P CGEMM_P +#define GEMM_Q CGEMM_Q +#define GEMM_R CGEMM_R +#define GEMM_UNROLL_M CGEMM_UNROLL_M +#define GEMM_UNROLL_N CGEMM_UNROLL_N +#define GEMM_UNROLL_MN CGEMM_UNROLL_MN +#define GEMM_DEFAULT_P CGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R CGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#endif +#endif + +#ifdef XDOUBLE +#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M +#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N 
+#elif defined(DOUBLE) +#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M +#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N +#else +#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M +#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N +#endif + + +#ifndef QGEMM_DEFAULT_UNROLL_M +#define QGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_N +#define QGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_M +#define XGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_N +#define XGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef GEMM_THREAD +#define GEMM_THREAD gemm_thread_n +#endif + +#ifndef SGEMM_DEFAULT_R +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#endif + +#ifndef DGEMM_DEFAULT_R +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) +#endif + +#ifndef QGEMM_DEFAULT_R +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) +#endif + +#ifndef CGEMM_DEFAULT_R +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) +#endif + +#ifndef ZGEMM_DEFAULT_R +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) +#endif + +#ifndef XGEMM_DEFAULT_R +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) +#endif + +#ifndef SNUMOPT +#define SNUMOPT 2 +#endif + +#ifndef 
DNUMOPT +#define DNUMOPT 2 +#endif + +#ifndef QNUMOPT +#define QNUMOPT 1 +#endif + +#ifndef GEMM3M_P +#ifdef XDOUBLE +#define GEMM3M_P XGEMM3M_P +#elif defined(DOUBLE) +#define GEMM3M_P ZGEMM3M_P +#else +#define GEMM3M_P CGEMM3M_P +#endif +#endif + +#ifndef GEMM3M_Q +#ifdef XDOUBLE +#define GEMM3M_Q XGEMM3M_Q +#elif defined(DOUBLE) +#define GEMM3M_Q ZGEMM3M_Q +#else +#define GEMM3M_Q CGEMM3M_Q +#endif +#endif + +#ifndef GEMM3M_R +#ifdef XDOUBLE +#define GEMM3M_R XGEMM3M_R +#elif defined(DOUBLE) +#define GEMM3M_R ZGEMM3M_R +#else +#define GEMM3M_R CGEMM3M_R +#endif +#endif + + +#endif From d1d69e1b9ac20866a10170e49c3de2bdae8676d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:09:24 +0200 Subject: [PATCH 097/593] Add read barrier definition --- common_alpha.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_alpha.h b/common_alpha.h index 9739c941d..f1ea8ff94 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -43,6 +43,7 @@ #define MB asm("mb") #define WMB asm("wmb") +#define RMB asm("rmb") static void __inline blas_lock(unsigned long *address){ #ifndef __DECC From 8692456226b084333c8708b2887de22435cf3166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:10:37 +0200 Subject: [PATCH 098/593] Add read barrier definition --- common_arm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_arm.h b/common_arm.h index 8411e6dd6..ee691ad75 100644 --- a/common_arm.h +++ b/common_arm.h @@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MB #define WMB +#define RMB #else #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") +#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") #endif From d237dc13601743dc9cb584d60a02ccdc797df3cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:11:58 +0200 Subject: [PATCH 099/593] Add read barrier definition --- common_arm64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_arm64.h b/common_arm64.h index 99e0cee57..314946282 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") - +#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") #define INLINE inline From 25e879fe92d598e9535e48cba18ec65c1e7d5211 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:12:54 +0200 Subject: [PATCH 100/593] Add (empty) read barrier definition --- common_ia64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_ia64.h b/common_ia64.h index 72b75fc4e..59aefbd6d 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -47,6 +47,7 @@ #define MB #define WMB +#define RMB #ifdef __ECC #include From ee6b3df02ca8594271417fc63029a898ec86feb7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:14:06 +0200 Subject: [PATCH 101/593] Add read barrier definition --- common_mips.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_mips.h b/common_mips.h index 35bff5083..2cc923043 100644 --- a/common_mips.h +++ b/common_mips.h @@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MB __sync_synchronize() #define WMB __sync_synchronize() +#define RMB __sync_synchronize() #define INLINE inline From 99dde1d2c9629324ceabb6e744d0f4845089e24f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:14:58 +0200 Subject: [PATCH 102/593] Add read barrier definition --- common_mips64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_mips64.h b/common_mips64.h index 1163413dc..af638d60c 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MB __sync_synchronize() #define WMB __sync_synchronize() +#define RMB __sync_synchronize() #define INLINE inline From 3d4db4d002afbd8ee970a5de840e075ccbae626a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:16:44 +0200 Subject: [PATCH 103/593] Add read barrier definition --- common_power.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_power.h b/common_power.h index e7caf9adf..e29d0f382 100644 --- a/common_power.h +++ b/common_power.h @@ -71,9 +71,11 @@ #if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") +#define RMB __asm__ __volatile__ ("eieio":::"memory") #else #define MB __asm__ __volatile__ ("sync") #define WMB __asm__ __volatile__ ("sync") +#define RMB __asm__ __volatile__ ("sync") #endif #define INLINE inline From 69b6e258d8d6fe6211a86b987204c350f4f62deb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:17:41 +0200 Subject: [PATCH 104/593] Add (empty) read barrier definition --- common_sparc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_sparc.h b/common_sparc.h index f99972db9..85e29fffa 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -41,6 +41,7 @@ #define MB __asm__ __volatile__ ("nop") #define WMB __asm__ __volatile__ ("nop") +#define RMB __asm__ __volatile__ ("nop") #ifndef ASSEMBLER From 
db3226a64681173d9d785cd71153a110b2b2dcee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:18:48 +0200 Subject: [PATCH 105/593] Add (empty) read barrier definition --- common_x86.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_x86.h b/common_x86.h index 99adc9f5b..ec928e236 100644 --- a/common_x86.h +++ b/common_x86.h @@ -47,6 +47,7 @@ #define MB #define WMB +#define RMB #ifdef C_SUN #define __asm__ __asm From a52bdd9d7b1b3e24d1eff9e52020c05cef6602dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:22:35 +0200 Subject: [PATCH 106/593] Add (empty) read barrier definition --- common_x86_64.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common_x86_64.h b/common_x86_64.h index 958e9caed..0247674cd 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -63,13 +63,16 @@ #ifdef __GNUC__ #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB #else #define MB do {} while (0) #define WMB do {} while (0) +#define RMB #endif static void __inline blas_lock(volatile BLASULONG *address){ + #ifndef C_MSVC int ret; #else From f5efecb7caf9bd438eeeb3b53ebf96f9d8c38b61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:24:10 +0200 Subject: [PATCH 107/593] Add (empty) read barrier definition --- common_zarch.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common_zarch.h b/common_zarch.h index b5503a7a4..442bae821 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -34,9 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define COMMON_ZARCH #define MB -//__asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB -//__asm__ __volatile__ ("dmb ishst" : : : "memory") +#define RMB #define INLINE inline From f41600e66fef4481ab82fbbad89144a8a8cc0599 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 12:34:02 +0200 Subject: [PATCH 108/593] Add a read barrier in the traversing of the buffer list Needed on systems with weak memory ordering - the inferior, partially working fix from #2544 was already removed in #2551 --- driver/others/memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index a49fb1fa1..5abcbf3a4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2741,6 +2741,7 @@ void *blas_memory_alloc(int procpos){ LOCK_COMMAND(&alloc_lock); #endif do { + RMB; #if defined(USE_OPENMP) if (!memory[position].used) { blas_lock(&memory[position].lock); From 5b0093b5fe21dbdea04e37a6b3f687282b7313fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 14:58:52 +0200 Subject: [PATCH 109/593] Convert aligned moves to unaligned should have no performance impact on reasonably modern cpus and fixes occasional crashes in actual user code. 
--- kernel/x86_64/copy_sse2.S | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/kernel/x86_64/copy_sse2.S b/kernel/x86_64/copy_sse2.S index 200daafd9..a5ab2ea91 100644 --- a/kernel/x86_64/copy_sse2.S +++ b/kernel/x86_64/copy_sse2.S @@ -54,7 +54,7 @@ #ifdef OPTERON #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG #else -#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#define LOAD(OFFSET, ADDR, REG) movups OFFSET(ADDR), REG #endif PROLOGUE @@ -104,14 +104,14 @@ sarq $4, %rax jle .L13 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 - movaps -12 * SIZE(X), %xmm2 - movaps -10 * SIZE(X), %xmm3 - movaps -8 * SIZE(X), %xmm4 - movaps -6 * SIZE(X), %xmm5 - movaps -4 * SIZE(X), %xmm6 - movaps -2 * SIZE(X), %xmm7 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 + movups -12 * SIZE(X), %xmm2 + movups -10 * SIZE(X), %xmm3 + movups -8 * SIZE(X), %xmm4 + movups -6 * SIZE(X), %xmm5 + movups -4 * SIZE(X), %xmm6 + movups -2 * SIZE(X), %xmm7 decq %rax jle .L12 @@ -122,36 +122,36 @@ PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) LOAD( 0 * SIZE, X, %xmm0) - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) LOAD( 2 * SIZE, X, %xmm1) #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) LOAD( 4 * SIZE, X, %xmm2) - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) LOAD( 6 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - movaps %xmm4, -8 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) LOAD( 8 * SIZE, X, %xmm4) - movaps %xmm5, -6 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) LOAD(10 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - movaps %xmm6, -4 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) LOAD(12 * SIZE, X, %xmm6) - 
movaps %xmm7, -2 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) LOAD(14 * SIZE, X, %xmm7) subq $-16 * SIZE, Y @@ -161,14 +161,14 @@ ALIGN_3 .L12: - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) - movaps %xmm2, -12 * SIZE(Y) - movaps %xmm3, -10 * SIZE(Y) - movaps %xmm4, -8 * SIZE(Y) - movaps %xmm5, -6 * SIZE(Y) - movaps %xmm6, -4 * SIZE(Y) - movaps %xmm7, -2 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) subq $-16 * SIZE, Y subq $-16 * SIZE, X @@ -179,15 +179,15 @@ jle .L14 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 - movaps -12 * SIZE(X), %xmm2 - movaps -10 * SIZE(X), %xmm3 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 + movups -12 * SIZE(X), %xmm2 + movups -10 * SIZE(X), %xmm3 - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) - movaps %xmm2, -12 * SIZE(Y) - movaps %xmm3, -10 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) addq $8 * SIZE, X addq $8 * SIZE, Y @@ -198,11 +198,11 @@ jle .L15 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) addq $4 * SIZE, X addq $4 * SIZE, Y @@ -213,8 +213,8 @@ jle .L16 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups -16 * SIZE(X), %xmm0 + movups %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y @@ -246,13 +246,13 @@ sarq $4, %rax jle .L23 - movaps -15 * SIZE(X), %xmm1 - movaps -13 * SIZE(X), %xmm2 - movaps -11 * SIZE(X), %xmm3 - movaps -9 * SIZE(X), %xmm4 - movaps -7 * SIZE(X), %xmm5 - movaps -5 * SIZE(X), %xmm6 - movaps -3 * SIZE(X), %xmm7 + movups -15 * SIZE(X), %xmm1 + 
movups -13 * SIZE(X), %xmm2 + movups -11 * SIZE(X), %xmm3 + movups -9 * SIZE(X), %xmm4 + movups -7 * SIZE(X), %xmm5 + movups -5 * SIZE(X), %xmm6 + movups -3 * SIZE(X), %xmm7 decq %rax jle .L22 @@ -264,11 +264,11 @@ #endif SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) LOAD( 1 * SIZE, X, %xmm1) #ifdef PREFETCH @@ -276,11 +276,11 @@ #endif SHUFPD_1 %xmm3, %xmm2 - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) LOAD( 3 * SIZE, X, %xmm2) SHUFPD_1 %xmm4, %xmm3 - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) LOAD( 5 * SIZE, X, %xmm3) #if defined(PREFETCHW) && !defined(FETCH128) @@ -288,11 +288,11 @@ #endif SHUFPD_1 %xmm5, %xmm4 - movaps %xmm4, -8 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) LOAD( 7 * SIZE, X, %xmm4) SHUFPD_1 %xmm6, %xmm5 - movaps %xmm5, -6 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) LOAD( 9 * SIZE, X, %xmm5) #if defined(PREFETCH) && !defined(FETCH128) @@ -300,11 +300,11 @@ #endif SHUFPD_1 %xmm7, %xmm6 - movaps %xmm6, -4 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) LOAD(11 * SIZE, X, %xmm6) SHUFPD_1 %xmm0, %xmm7 - movaps %xmm7, -2 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) LOAD(13 * SIZE, X, %xmm7) subq $-16 * SIZE, X @@ -315,26 +315,26 @@ .L22: SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) LOAD(-1 * SIZE, X, %xmm0) SHUFPD_1 %xmm2, %xmm1 - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm4, %xmm3 - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) SHUFPD_1 %xmm5, %xmm4 - movaps %xmm4, -8 * SIZE(Y) + movups %xmm4, -8 * SIZE(Y) SHUFPD_1 %xmm6, %xmm5 - movaps %xmm5, -6 * SIZE(Y) + movups %xmm5, -6 * SIZE(Y) SHUFPD_1 %xmm7, %xmm6 - movaps %xmm6, -4 * SIZE(Y) + movups %xmm6, -4 * SIZE(Y) SHUFPD_1 %xmm0, %xmm7 - movaps %xmm7, -2 * SIZE(Y) + movups %xmm7, -2 * SIZE(Y) subq 
$-16 * SIZE, X subq $-16 * SIZE, Y @@ -345,24 +345,24 @@ jle .L24 ALIGN_3 - movaps -15 * SIZE(X), %xmm1 - movaps -13 * SIZE(X), %xmm2 - movaps -11 * SIZE(X), %xmm3 - movaps -9 * SIZE(X), %xmm8 + movups -15 * SIZE(X), %xmm1 + movups -13 * SIZE(X), %xmm2 + movups -11 * SIZE(X), %xmm3 + movups -9 * SIZE(X), %xmm8 SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) SHUFPD_1 %xmm2, %xmm1 - movaps %xmm1, -14 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) SHUFPD_1 %xmm3, %xmm2 - movaps %xmm2, -12 * SIZE(Y) + movups %xmm2, -12 * SIZE(Y) SHUFPD_1 %xmm8, %xmm3 - movaps %xmm3, -10 * SIZE(Y) + movups %xmm3, -10 * SIZE(Y) - movaps %xmm8, %xmm0 + movups %xmm8, %xmm0 addq $8 * SIZE, X addq $8 * SIZE, Y @@ -373,15 +373,15 @@ jle .L25 ALIGN_3 - movaps -15 * SIZE(X), %xmm1 - movaps -13 * SIZE(X), %xmm2 + movups -15 * SIZE(X), %xmm1 + movups -13 * SIZE(X), %xmm2 SHUFPD_1 %xmm1, %xmm0 SHUFPD_1 %xmm2, %xmm1 - movaps %xmm0, -16 * SIZE(Y) - movaps %xmm1, -14 * SIZE(Y) - movaps %xmm2, %xmm0 + movups %xmm0, -16 * SIZE(Y) + movups %xmm1, -14 * SIZE(Y) + movups %xmm2, %xmm0 addq $4 * SIZE, X addq $4 * SIZE, Y @@ -392,10 +392,10 @@ jle .L26 ALIGN_3 - movaps -15 * SIZE(X), %xmm1 + movups -15 * SIZE(X), %xmm1 SHUFPD_1 %xmm1, %xmm0 - movaps %xmm0, -16 * SIZE(Y) + movups %xmm0, -16 * SIZE(Y) addq $2 * SIZE, X addq $2 * SIZE, Y @@ -424,14 +424,14 @@ sarq $4, %rax jle .L23 - movaps -16 * SIZE(X), %xmm0 - movaps -14 * SIZE(X), %xmm1 - movaps -12 * SIZE(X), %xmm2 - movaps -10 * SIZE(X), %xmm3 - movaps -8 * SIZE(X), %xmm4 - movaps -6 * SIZE(X), %xmm5 - movaps -4 * SIZE(X), %xmm6 - movaps -2 * SIZE(X), %xmm7 + movups -16 * SIZE(X), %xmm0 + movups -14 * SIZE(X), %xmm1 + movups -12 * SIZE(X), %xmm2 + movups -10 * SIZE(X), %xmm3 + movups -8 * SIZE(X), %xmm4 + movups -6 * SIZE(X), %xmm5 + movups -4 * SIZE(X), %xmm6 + movups -2 * SIZE(X), %xmm7 decq %rax jle .L22 @@ -515,16 +515,16 @@ jle .L24 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 + movups -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * 
SIZE(Y) movhps %xmm0, -15 * SIZE(Y) - movaps -14 * SIZE(X), %xmm1 + movups -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) - movaps -12 * SIZE(X), %xmm2 + movups -12 * SIZE(X), %xmm2 movlps %xmm2, -12 * SIZE(Y) movhps %xmm2, -11 * SIZE(Y) - movaps -10 * SIZE(X), %xmm3 + movups -10 * SIZE(X), %xmm3 movlps %xmm3, -10 * SIZE(Y) movhps %xmm3, -9 * SIZE(Y) @@ -537,10 +537,10 @@ jle .L25 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 + movups -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) - movaps -14 * SIZE(X), %xmm1 + movups -14 * SIZE(X), %xmm1 movlps %xmm1, -14 * SIZE(Y) movhps %xmm1, -13 * SIZE(Y) @@ -553,7 +553,7 @@ jle .L26 ALIGN_3 - movaps -16 * SIZE(X), %xmm0 + movups -16 * SIZE(X), %xmm0 movlps %xmm0, -16 * SIZE(Y) movhps %xmm0, -15 * SIZE(Y) From 3eec7d382c72a47df6ff687a7994f6f04b0c064d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 15:56:31 +0200 Subject: [PATCH 110/593] ARMV7 does not support DMB ISHLD, use DMB ISH --- common_arm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_arm.h b/common_arm.h index ee691ad75..682315de5 100644 --- a/common_arm.h +++ b/common_arm.h @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory") #define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory") -#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") +#define RMB __asm__ __volatile__ ("dmb ish" : : : "memory") #endif From 0f08f3efa62558e104ae3e71e0470c5cd286a1d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Apr 2020 22:46:12 +0200 Subject: [PATCH 111/593] Add a multithread test for x86_64 --- .drone.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.drone.yml b/.drone.yml index 3bbd8fc88..300cf3254 100644 --- a/.drone.yml +++ b/.drone.yml @@ -166,3 +166,27 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: epyc_native_test + +platform: + os: linux + arch: amd64 + +steps: +- name: Build and Test + image: ubuntu:19.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester From b969533703cc745f04e4fc99e7e80d181e7f24f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Apr 2020 10:53:28 +0200 Subject: [PATCH 112/593] Add drone.io badge, mention EMAG8180 support, reformat the DYNAMIC_ARCH paragraph --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 04f43f4c7..61393bd8f 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) +Drone CI: [![Build 
Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/) + [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) + ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. @@ -140,6 +143,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ThunderX**: Optimized some Level-1 functions - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 - **TSV110**: Optimized some Level-3 helper functions +- **EMAG 8180**: preliminary support based on A57 #### PPC/PPC64 @@ -154,11 +158,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake. + For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default. + DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. 
+ On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. + For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. + The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. From 84a9614345d0030275230083fe4bc38e4531652d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Apr 2020 19:18:35 +0200 Subject: [PATCH 113/593] try x86_64 test without openmp --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 300cf3254..5686c5e41 100644 --- a/.drone.yml +++ b/.drone.yml @@ -179,7 +179,7 @@ steps: image: ubuntu:19.04 environment: CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1' + COMMON_FLAGS: 'USE_THREAD=1' commands: - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y From 7eb55504b1727eebcb0f451fa5b148dbea303b69 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 14 Apr 2020 14:55:08 -0500 Subject: [PATCH 114/593] RFC : Add half precision gemm for bfloat16 in OpenBLAS This patch adds support for bfloat16 data type matrix multiplication kernel. For architectures that don't support bfloat16, it is defined as unsigned short (2 bytes). Default unroll sizes can be changed as per architecture as done for SGEMM and for now 8 and 4 are used for M and N. Size of ncopy/tcopy can be changed as per architecture requirement and for now, size 2 is used. Added shgemm in kernel/power/KERNEL.POWER9 and tested in powerpc64le and powerpc64. For reference, added a small test compare_sgemm_shgemm.c to compare sgemm and shgemm output. 
This patch does not cover OpenBLAS test, benchmark and lapack tests for shgemm. Complex type implementation can be discussed and added once this is approved. --- Makefile.system | 2 + Makefile.tail | 7 ++- cmake/prebuild.cmake | 4 ++ cmake/system.cmake | 2 + common.h | 15 ++++++ common_interface.h | 5 ++ common_level3.h | 20 +++++++ common_macro.h | 51 ++++++++++++++++++ common_param.h | 44 +++++++++++++++ common_sh.h | 65 ++++++++++++++++++++++ driver/level3/Makefile | 49 +++++++++++++++++ driver/level3/level3.c | 15 +++--- driver/level3/level3_thread.c | 27 +++++----- driver/others/parameter.c | 17 ++++++ getarch_2nd.c | 2 + interface/Makefile | 17 ++++-- interface/gemm.c | 10 ++-- kernel/Makefile.L3 | 73 +++++++++++++++++++++++++ kernel/generic/gemm_beta.c | 2 +- kernel/generic/gemm_ncopy_2.c | 6 +-- kernel/generic/gemm_tcopy_2.c | 6 +-- kernel/generic/gemmkernel_2x2.c | 75 ++++++++++++++++---------- kernel/power/KERNEL.POWER9 | 11 ++++ kernel/setparam-ref.c | 30 +++++++++++ lapack/getrf/potrf_parallel.c | 3 ++ param.h | 6 +++ test/compare_sgemm_shgemm.c | 95 +++++++++++++++++++++++++++++++++ 27 files changed, 594 insertions(+), 65 deletions(-) create mode 100644 common_sh.h create mode 100644 test/compare_sgemm_shgemm.c diff --git a/Makefile.system b/Makefile.system index 2998c0e6a..0e176987c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1390,6 +1390,8 @@ export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export SHGEMM_UNROLL_M +export SHGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 2adede1a5..39902982b 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -1,3 +1,4 @@ +SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) @@ -9,8 +10,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = 
$(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -22,6 +23,7 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif +$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX @@ -29,6 +31,7 @@ $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX +$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 44e1473d1..e0696093b 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -16,6 +16,8 @@ # HAVE_SSE2 # HAVE_SSE3 # MAKE +# SHGEMM_UNROLL_M +# SHGEMM_UNROLL_N # SGEMM_UNROLL_M # SGEMM_UNROLL_N # DGEMM_UNROLL_M @@ -437,6 +439,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) endif() + set(SHGEMM_UNROLL_M 8) + set(SHGEMM_UNROLL_N 4) # Or should this actually be NUM_CORES? 
if (${NUM_THREADS} GREATER 0) diff --git a/cmake/system.cmake b/cmake/system.cmake index ce980a7b9..65e5aa508 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -530,6 +530,8 @@ endif () #export FUNCTION_PROFILE #export TARGET_CORE # +#export SHGEMM_UNROLL_M +#export SHGEMM_UNROLL_N #export SGEMM_UNROLL_M #export SGEMM_UNROLL_N #export DGEMM_UNROLL_M diff --git a/common.h b/common.h index 762968e6f..1d8bf07e5 100644 --- a/common.h +++ b/common.h @@ -297,6 +297,17 @@ typedef int blasint; #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 +#elif defined(HALF) +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#define HALFCONVERSION 1 +#endif +#define IFLOAT bfloat16 +#define XFLOAT IFLOAT +#define FLOAT float +#define SIZE 2 +#define BASE_SHIFT 1 +#define ZBASE_SHIFT 2 #else #define FLOAT float #define SIZE 4 @@ -308,6 +319,10 @@ typedef int blasint; #define XFLOAT FLOAT #endif +#ifndef IFLOAT +#define IFLOAT FLOAT +#endif + #ifndef COMPLEX #define COMPSIZE 1 #else diff --git a/common_interface.h b/common_interface.h index c350ac8ec..081043af1 100644 --- a/common_interface.h +++ b/common_interface.h @@ -37,6 +37,9 @@ /*********************************************************************/ #ifndef ASSEMBLER +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#endif #ifdef __cplusplus extern "C" { @@ -469,6 +472,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ +void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *, diff --git a/common_level3.h b/common_level3.h index 6fa902be8..8194ba6ce 100644 --- a/common_level3.h +++ b/common_level3.h @@ -37,6 +37,9 @@ 
/*********************************************************************/ #ifndef ASSEMBLER +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#endif #ifdef __CUDACC__ __global__ void cuda_sgemm_kernel(int, int, int, float *, float *, float *); @@ -55,6 +58,8 @@ extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, @@ -76,6 +81,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif +int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); @@ -499,6 +508,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, 
float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -527,6 +537,11 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); + int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); @@ -619,6 +634,11 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif +int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); + int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 13bb85794..b438c83ba 100644 --- a/common_macro.h +++ 
b/common_macro.h @@ -39,6 +39,7 @@ #ifndef COMMON_MACRO #define COMMON_MACRO +#include "common_sh.h" #include "common_s.h" #include "common_d.h" #include "common_q.h" @@ -642,6 +643,53 @@ #define IMATCOPY_K_RT DIMATCOPY_K_RT #define GEADD_K DGEADD_K + +#elif defined(HALF) + +#define GEMM_BETA SHGEMM_BETA +#define GEMM_KERNEL_N SHGEMM_KERNEL +#define GEMM_KERNEL_L SHGEMM_KERNEL +#define GEMM_KERNEL_R SHGEMM_KERNEL +#define GEMM_KERNEL_B SHGEMM_KERNEL + +#define GEMM_NN SHGEMM_NN +#define GEMM_CN SHGEMM_TN +#define GEMM_TN SHGEMM_TN +#define GEMM_NC SHGEMM_NT +#define GEMM_NT SHGEMM_NT +#define GEMM_CC SHGEMM_TT +#define GEMM_CT SHGEMM_TT +#define GEMM_TC SHGEMM_TT +#define GEMM_TT SHGEMM_TT +#define GEMM_NR SHGEMM_NN +#define GEMM_TR SHGEMM_TN +#define GEMM_CR SHGEMM_TN +#define GEMM_RN SHGEMM_NN +#define GEMM_RT SHGEMM_NT +#define GEMM_RC SHGEMM_NT +#define GEMM_RR SHGEMM_NN +#define GEMM_ONCOPY SHGEMM_ONCOPY +#define GEMM_OTCOPY SHGEMM_OTCOPY +#define GEMM_INCOPY SHGEMM_INCOPY +#define GEMM_ITCOPY SHGEMM_ITCOPY + +#define GEMM_THREAD_NN SHGEMM_THREAD_NN +#define GEMM_THREAD_CN SHGEMM_THREAD_TN +#define GEMM_THREAD_TN SHGEMM_THREAD_TN +#define GEMM_THREAD_NC SHGEMM_THREAD_NT +#define GEMM_THREAD_NT SHGEMM_THREAD_NT +#define GEMM_THREAD_CC SHGEMM_THREAD_TT +#define GEMM_THREAD_CT SHGEMM_THREAD_TT +#define GEMM_THREAD_TC SHGEMM_THREAD_TT +#define GEMM_THREAD_TT SHGEMM_THREAD_TT +#define GEMM_THREAD_NR SHGEMM_THREAD_NN +#define GEMM_THREAD_TR SHGEMM_THREAD_TN +#define GEMM_THREAD_CR SHGEMM_THREAD_TN +#define GEMM_THREAD_RN SHGEMM_THREAD_NN +#define GEMM_THREAD_RT SHGEMM_THREAD_NT +#define GEMM_THREAD_RC SHGEMM_THREAD_NT +#define GEMM_THREAD_RR SHGEMM_THREAD_NN + #else #define AMAX_K SAMAX_K @@ -2202,6 +2250,9 @@ #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; +extern BLASLONG shgemm_p; +extern BLASLONG shgemm_q; +extern BLASLONG shgemm_r; 
extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_param.h b/common_param.h index 574d5e176..f1cac38d1 100644 --- a/common_param.h +++ b/common_param.h @@ -84,6 +84,16 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int shgemm_p, shgemm_q, shgemm_r; + int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + + int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -907,6 +917,13 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#define SHGEMM_P gotoblas -> shgemm_p +#define SHGEMM_Q gotoblas -> shgemm_q +#define SHGEMM_R gotoblas -> shgemm_r +#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m +#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n +#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn + #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -984,6 +1001,17 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif +#define SHGEMM_P SHGEMM_DEFAULT_P +#define SHGEMM_Q SHGEMM_DEFAULT_Q +#define SHGEMM_R SHGEMM_DEFAULT_R 
+#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#ifdef SHGEMM_DEFAULT_UNROLL_MN +#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#else +#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#endif + #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q #define SGEMM_R SGEMM_DEFAULT_R @@ -1119,6 +1147,18 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#elif defined(HALF) +#define GEMM_P SHGEMM_P +#define GEMM_Q SHGEMM_Q +#define GEMM_R SHGEMM_R +#define GEMM_UNROLL_M SHGEMM_UNROLL_M +#define GEMM_UNROLL_N SHGEMM_UNROLL_N +#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1204,6 +1244,10 @@ extern gotoblas_t *gotoblas; #define GEMM_THREAD gemm_thread_n #endif +#ifndef SHGEMM_DEFAULT_R +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) +#endif + #ifndef SGEMM_DEFAULT_R #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) #endif diff --git a/common_sh.h b/common_sh.h new file mode 100644 index 000000000..8859694f1 --- /dev/null +++ b/common_sh.h @@ -0,0 +1,65 @@ +#ifndef COMMON_H_H +#define COMMON_H_H + +#ifndef DYNAMIC_ARCH + +#define SHGEMM_ONCOPY shgemm_oncopy +#define SHGEMM_OTCOPY shgemm_otcopy + +#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N +#define SHGEMM_INCOPY shgemm_oncopy +#define SHGEMM_ITCOPY 
shgemm_otcopy +#else +#define SHGEMM_INCOPY shgemm_incopy +#define SHGEMM_ITCOPY shgemm_itcopy +#endif +#define SHGEMM_BETA shgemm_beta +#define SHGEMM_KERNEL shgemm_kernel + +#else + +#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy +#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy +#define SHGEMM_INCOPY gotoblas -> shgemm_incopy +#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy +#define SHGEMM_BETA gotoblas -> shgemm_beta +#define SHGEMM_KERNEL gotoblas -> shgemm_kernel + +#endif + +#define SHGEMM_NN shgemm_nn +#define SHGEMM_CN shgemm_tn +#define SHGEMM_TN shgemm_tn +#define SHGEMM_NC shgemm_nt +#define SHGEMM_NT shgemm_nt +#define SHGEMM_CC shgemm_tt +#define SHGEMM_CT shgemm_tt +#define SHGEMM_TC shgemm_tt +#define SHGEMM_TT shgemm_tt +#define SHGEMM_NR shgemm_nn +#define SHGEMM_TR shgemm_tn +#define SHGEMM_CR shgemm_tn +#define SHGEMM_RN shgemm_nn +#define SHGEMM_RT shgemm_nt +#define SHGEMM_RC shgemm_nt +#define SHGEMM_RR shgemm_nn + +#define SHGEMM_THREAD_NN shgemm_thread_nn +#define SHGEMM_THREAD_CN shgemm_thread_tn +#define SHGEMM_THREAD_TN shgemm_thread_tn +#define SHGEMM_THREAD_NC shgemm_thread_nt +#define SHGEMM_THREAD_NT shgemm_thread_nt +#define SHGEMM_THREAD_CC shgemm_thread_tt +#define SHGEMM_THREAD_CT shgemm_thread_tt +#define SHGEMM_THREAD_TC shgemm_thread_tt +#define SHGEMM_THREAD_TT shgemm_thread_tt +#define SHGEMM_THREAD_NR shgemm_thread_nn +#define SHGEMM_THREAD_TR shgemm_thread_tn +#define SHGEMM_THREAD_CR shgemm_thread_tn +#define SHGEMM_THREAD_RN shgemm_thread_nn +#define SHGEMM_THREAD_RT shgemm_thread_nt +#define SHGEMM_THREAD_RC shgemm_thread_nt +#define SHGEMM_THREAD_RR shgemm_thread_nn + +#endif + diff --git a/driver/level3/Makefile b/driver/level3/Makefile index e320092e3..881b4ee35 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,6 +19,7 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) SBLASOBJS += \ sgemm_nn.$(SUFFIX) 
sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -204,6 +205,7 @@ COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 +SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) @@ -283,6 +285,18 @@ endif all :: +shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -478,6 +492,17 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) +shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF 
-UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2652,6 +2677,18 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) +shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2848,6 +2885,18 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) +shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) 
-c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 9aa67286f..c6bbb9ca9 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -62,18 +62,18 @@ #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif @@ -173,7 +173,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ BLASLONG k, lda, ldb, ldc; FLOAT *alpha, *beta; - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; BLASLONG m_from, m_to, n_from, n_to; BLASLONG ls, is, js; @@ -198,8 +199,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, k = K; 
- a = (FLOAT *)A; - b = (FLOAT *)B; + a = (IFLOAT *)A; + b = (IFLOAT *)B; c = (FLOAT *)C; lda = LDA; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ca0085e71..5a8d497d2 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -117,18 +117,18 @@ typedef struct { #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ defined(RN) || defined(RT) || defined(RC) || defined(RR) -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ defined(NR) || defined(TR) || defined(CR) || defined(RR) -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #endif #endif @@ -219,15 +219,16 @@ typedef struct { #define STOP_RPCC(COUNTER) #endif -static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT 
*sa, IFLOAT *sb, BLASLONG mypos){ - FLOAT *buffer[DIVIDE_RATE]; + IFLOAT *buffer[DIVIDE_RATE]; BLASLONG k, lda, ldb, ldc; BLASLONG m_from, m_to, n_from, n_to; FLOAT *alpha, *beta; - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; job_t *job = (job_t *)args -> common; BLASLONG nthreads_m; @@ -255,8 +256,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, k = K; - a = (FLOAT *)A; - b = (FLOAT *)B; + a = (IFLOAT *)A; + b = (IFLOAT *)B; c = (FLOAT *)C; lda = LDA; @@ -425,7 +426,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Apply kernel with local region of A and part of other region of B */ START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, js); STOP_RPCC(kernel); @@ -469,7 +470,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Apply kernel with local region of A and part of region of B */ START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, js); STOP_RPCC(kernel); @@ -532,7 +533,7 @@ static int round_up(int remainder, int width, int multiple) static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG - *range_n, FLOAT *sa, FLOAT *sb, + *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG nthreads_m, BLASLONG nthreads_n) { #ifndef USE_OPENMP @@ -728,7 +729,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); return 0; } -int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ BLASLONG m = args -> m; 
BLASLONG n = args -> n; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 8bf7da78b..b1f3befae 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -62,6 +62,11 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; BLASLONG gemm_offset_b = GEMM_OFFSET_B; #endif +#if SHGEMM_P == shgemm_p +BLASLONG shgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG shgemm_p = SHGEMM_P; +#endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; #else @@ -83,6 +88,11 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; BLASLONG zgemm_p = ZGEMM_P; #endif +#if SHGEMM_Q == shgemm_q +BLASLONG shgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG shgemm_q = SHGEMM_Q; +#endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; #else @@ -104,6 +114,11 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; BLASLONG zgemm_q = ZGEMM_Q; #endif +#if SHGEMM_R == shgemm_r +BLASLONG shgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG shgemm_r = SHGEMM_R; +#endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; #else @@ -597,6 +612,7 @@ void blas_set_parameter(void){ size = BITMASK(cpuid3, 16, 0xff); + shgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); @@ -610,6 +626,7 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif + shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/getarch_2nd.c b/getarch_2nd.c index cf9c578cb..a1d0ccac8 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -9,6 +9,8 @@ int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) 
&& (*argv[1] == '0'))) { + printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); + printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/interface/Makefile b/interface/Makefile index 3f0dcca28..741f6bac0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,6 +46,7 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) +SHBLAS3OBJS = shgemm.$(SUFFIX) DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ @@ -277,6 +278,8 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) +CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) + CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ @@ -367,6 +370,7 @@ override CFLAGS += -I. 
SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) +SHBLAS3OBJS += $(CSHBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) @@ -380,6 +384,7 @@ ZBLAS3OBJS += $(CZBLAS3OBJS) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +SHBLASOBJS = $(SHBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -454,7 +459,7 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -488,10 +493,10 @@ level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c @@ -1209,6 +1214,9 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) +shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1770,6 +1778,9 @@ cblas_zhemv.$(SUFFIX) 
cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemm.c b/interface/gemm.c index 0b18d9a8c..99388e7d9 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -77,7 +77,7 @@ #define GEMM_MULTITHREAD_THRESHOLD 4 #endif -static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { #ifndef GEMM3M GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, @@ -108,8 +108,8 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA void NAME(char *TRANSA, char *TRANSB, blasint *M, blasint *N, blasint *K, FLOAT *alpha, - FLOAT *a, blasint *ldA, - FLOAT *b, blasint *ldB, + IFLOAT *a, blasint *ldA, + IFLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ @@ -119,8 +119,8 @@ void NAME(char *TRANSA, char *TRANSB, blasint info; char transA, transB; - FLOAT *buffer; - FLOAT *sa, *sb; + IFLOAT *buffer; + IFLOAT *sa, *sb; #ifdef SMP double MNK; diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 6d96abb2e..aee610efb 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -59,6 +59,10 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +SHKERNELOBJS += \ + shgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ + $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ @@ -93,6 +97,7 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) +SHBLASOBJS += $(SHKERNELOBJS) SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -100,6 +105,7 @@ CBLASOBJS += $(CKERNELOBJS) 
ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) +SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -390,6 +396,10 @@ ZBLASOBJS += \ zgeadd_k$(TSUFFIX).$(SUFFIX) +SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -415,6 +425,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -433,6 +446,36 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s + m4 shgemmotcopy.s > shgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ + rm shgemmotcopy.s shgemmotcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) + +$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) + $(CC) $(CFLAGS) -c 
-DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s + m4 shgemmitcopy.s > shgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ + rm shgemmitcopy.s shgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + +endif + $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -590,6 +633,16 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s + m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s @@ -2206,6 +2259,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2221,6 +2277,20 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) + $(CC) 
$(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + +endif $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2325,6 +2395,9 @@ endif endif +$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) + $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c index fa9d7680d..ccb772cc7 100644 --- a/kernel/generic/gemm_beta.c +++ b/kernel/generic/gemm_beta.c @@ -39,7 +39,7 @@ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ diff --git a/kernel/generic/gemm_ncopy_2.c b/kernel/generic/gemm_ncopy_2.c index b728c713f..415860f81 100644 --- a/kernel/generic/gemm_ncopy_2.c +++ b/kernel/generic/gemm_ncopy_2.c @@ -39,10 +39,10 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *a_offset, *a_offset1, *a_offset2; - FLOAT *b_offset; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset; a_offset = a; b_offset = b; diff --git a/kernel/generic/gemm_tcopy_2.c b/kernel/generic/gemm_tcopy_2.c index 5695b13c2..b4aa4de57 100644 --- a/kernel/generic/gemm_tcopy_2.c +++ 
b/kernel/generic/gemm_tcopy_2.c @@ -39,11 +39,11 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *a_offset, *a_offset1, *a_offset2; - FLOAT *b_offset, *b_offset1, *b_offset2; + IFLOAT *a_offset, *a_offset1, *a_offset2; + IFLOAT *b_offset, *b_offset1, *b_offset2; a_offset = a; b_offset = b; diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index 01f1c67b5..26a88db6d 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,13 +1,32 @@ #include "common.h" -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#if defined(HALF) && defined(HALFCONVERSION) +float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short* q = (unsigned short*)(&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc #ifdef TRMMKERNEL ,BLASLONG offset #endif ) { BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; + FLOAT *C0,*C1; + IFLOAT *ptrba,*ptrbb; + FLOAT res0,res1,res2,res3; + IFLOAT load0,load1,load2,load3,load4,load5,load6,load7; for (j=0; j +#include +#include "common.h" +#define SGEMM BLASFUNC(sgemm) +#define SHGEMM BLASFUNC(shgemm) +typedef union +{ + unsigned short v; + struct + { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + unsigned short s:1; + unsigned short e:8; + unsigned short m:7; +#else + unsigned short m:7; + unsigned short e:8; + unsigned short s:1; +#endif + } bits; +} bfloat16_bits; + +int +main (int argc, char *argv[]) +{ + int m, n, k; + int i, j, l; + int ret = 0; + int loop = 20; + char 
transA = 'N', transB = 'N'; + float alpha = 1.0, beta = 0.0; + char transa = 'N'; + char transb = 'N'; + + for (int x = 0; x <= loop; x++) + { + m = k = n = x; + float A[m * k]; + float B[k * n]; + float C[m * n]; + bfloat16_bits AA[m * k], BB[k * n]; + float CC[m * n]; + + for (int j = 0; j < m; j++) + { + for (int i = 0; i < m; i++) + { + A[j * k + i] = j * 9.0; + B[j * k + i] = i * 2.0; + C[j * k + i] = 0; + AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; + BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + CC[j * k + i] = 0; + } + } + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, + &m, B, &k, &beta, C, &m); + SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, + &m, BB, &k, &beta, CC, &m); + + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (CC[i * m + j] != C[i * m + j]) + ret++; + } + fprintf (stderr, "Return code: %d\n", ret); + return ret; +} From ff010f496e255de706067ff54b57e38b69f33c0d Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 14 Apr 2020 20:38:53 -0500 Subject: [PATCH 115/593] Build shgemm for all architecture --- kernel/Makefile.L3 | 13 +++++++++++++ kernel/power/KERNEL.POWER9 | 11 ----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index aee610efb..baf0c1c8a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -59,6 +59,19 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +#ifndef SHGEMMKERNEL +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = ../generic/gemmkernel_2x2.c +SHGEMMINCOPY = ../generic/gemm_ncopy_2.c +SHGEMMITCOPY = ../generic/gemm_tcopy_2.c +SHGEMMONCOPY = ../generic/gemm_ncopy_2.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +#endif + SHKERNELOBJS += \ shgemm_kernel$(TSUFFIX).$(SUFFIX) \ 
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index dedb015e8..aabb5d976 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -12,17 +12,6 @@ DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = ../generic/gemmkernel_2x2.c -SHGEMMINCOPY = ../generic/gemm_ncopy_2.c -SHGEMMITCOPY = ../generic/gemm_tcopy_2.c -SHGEMMONCOPY = ../generic/gemm_ncopy_2.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) - SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S From ac6a22ae7801888df527eb426647b0b55e79f60c Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 14 Apr 2020 22:58:39 -0500 Subject: [PATCH 116/593] Update header --- common_param.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common_param.h b/common_param.h index f1cac38d1..6276f7f51 100644 --- a/common_param.h +++ b/common_param.h @@ -41,6 +41,9 @@ #ifndef ASSEMBLER +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#endif #ifdef DYNAMIC_ARCH typedef struct { From a87793e03c4a073b533ceadafa54cf6c01a66f18 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 15 Apr 2020 09:09:50 -0500 Subject: [PATCH 117/593] Fix DYNAMIC_ARCH compilation errors --- common_param.h | 106 +++++++++++++++++++++++++++++--- kernel/generic/gemmkernel_2x2.c | 2 +- kernel/setparam-ref.c | 46 +++++++++++++- 3 files changed, 142 insertions(+), 12 deletions(-) diff --git a/common_param.h b/common_param.h index 6276f7f51..446d42452 100644 --- a/common_param.h +++ b/common_param.h @@ -41,15 +41,110 @@ #ifndef ASSEMBLER +#ifdef DYNAMIC_ARCH + #ifndef 
BFLOAT16 typedef unsigned short bfloat16; #endif -#ifdef DYNAMIC_ARCH typedef struct { int dtb_entries; int offsetA, offsetB, align; +#if 1 + int shgemm_p, shgemm_q, shgemm_r; + int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + + float (*shamax_k) (BLASLONG, float *, BLASLONG); + float (*shamin_k) (BLASLONG, float *, BLASLONG); + float (*shmax_k) (BLASLONG, float *, BLASLONG); + float (*shmin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); + + float (*shnrm2_k) (BLASLONG, float *, BLASLONG); + float (*shasum_k) (BLASLONG, float *, BLASLONG); + float (*shsum_k) (BLASLONG, float *, BLASLONG); + int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, 
BLASLONG, float *, BLASLONG, float *); + + int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + + int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + + int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_outncopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_olnucopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + +#endif int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; @@ -87,15 +182,6 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_itcopy )(BLASLONG, 
BLASLONG, float *, BLASLONG, float *); diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index 26a88db6d..cc7bb8e48 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,6 +1,6 @@ #include "common.h" #if defined(HALF) && defined(HALFCONVERSION) -float +static float bfloat16tof32 (bfloat16 f16) { float result = 0; diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 12d038901..79cd151f6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -60,6 +60,15 @@ gotoblas_t TABLE_NAME = { #else MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), #endif + + samax_kTS, samin_kTS, smax_kTS, smin_kTS, + isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, + dsdot_kTS, + srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, + sgemv_nTS, sgemv_tTS, sger_kTS, + ssymv_LTS, ssymv_UTS, + shgemm_kernelTS, shgemm_betaTS, #if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N shgemm_incopyTS, shgemm_itcopyTS, @@ -67,7 +76,42 @@ gotoblas_t TABLE_NAME = { shgemm_oncopyTS, shgemm_otcopyTS, #endif shgemm_oncopyTS, shgemm_otcopyTS, - sgemm_kernelTS, sgemm_betaTS, + + strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, + strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS, +#else + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, +#endif + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, + strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, + strmm_ilnucopyTS, 
strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS, +#else + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#endif + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + ssymm_iutcopyTS, ssymm_iltcopyTS, +#else + ssymm_outcopyTS, ssymm_oltcopyTS, +#endif + ssymm_outcopyTS, ssymm_oltcopyTS, + +#ifndef NO_LAPACK + sneg_tcopyTS, slaswp_ncopyTS, +#else + NULL,NULL, +#endif + + + 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, #ifdef SGEMM_DEFAULT_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN, From 579811fb6ae33e9b82b970300e1a1481985b6105 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Apr 2020 17:38:33 +0200 Subject: [PATCH 118/593] Move all 19.04-based jobs back to ubuntu 18.04 --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 5686c5e41..8b7ac3011 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,7 +8,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' @@ -32,7 +32,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' @@ -152,7 +152,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'USE_OPENMP=1' @@ -176,7 +176,7 @@ platform: steps: - name: Build and Test - image: ubuntu:19.04 + image: ubuntu:18.04 environment: CC: gcc COMMON_FLAGS: 'USE_THREAD=1' From e8e8a6e60802596d1d9a037062ac40f4b1cad356 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Apr 2020 19:26:12 +0200 Subject: [PATCH 119/593] Restore USE_OPENMP in the x86 thread test --- 
.drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 8b7ac3011..b1c211d14 100644 --- a/.drone.yml +++ b/.drone.yml @@ -179,7 +179,7 @@ steps: image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'USE_THREAD=1' + COMMON_FLAGS: 'USE_OPENMP=1' commands: - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y From 67cc4b9e16d2e8c017731d2b9eabb5c6b45a9ad5 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 15 Apr 2020 19:15:23 -0500 Subject: [PATCH 120/593] Fix warnings in clang and export symbol --- common.h | 9 +-- common_interface.h | 3 - common_level3.h | 3 - common_param.h | 4 -- common_sh.h | 4 +- exports/gensymbol | 4 +- kernel/common_param.h | 129 ++++++++++++++++++++++++++++++++++++++++++ kernel/setparam-ref.c | 8 +-- 8 files changed, 140 insertions(+), 24 deletions(-) diff --git a/common.h b/common.h index 1d8bf07e5..e2c8cdee5 100644 --- a/common.h +++ b/common.h @@ -257,6 +257,11 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#define HALFCONVERSION 1 +#endif + #ifdef USE64BITINT typedef BLASLONG blasint; #if defined(OS_WINDOWS) && defined(__64BIT__) @@ -298,10 +303,6 @@ typedef int blasint; #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 #elif defined(HALF) -#ifndef BFLOAT16 -typedef unsigned short bfloat16; -#define HALFCONVERSION 1 -#endif #define IFLOAT bfloat16 #define XFLOAT IFLOAT #define FLOAT float diff --git a/common_interface.h b/common_interface.h index 081043af1..78f5be6b0 100644 --- a/common_interface.h +++ b/common_interface.h @@ -37,9 +37,6 @@ /*********************************************************************/ #ifndef ASSEMBLER -#ifndef BFLOAT16 -typedef unsigned short bfloat16; -#endif #ifdef __cplusplus extern "C" { diff --git a/common_level3.h b/common_level3.h index 8194ba6ce..4e44a5e73 100644 --- a/common_level3.h +++ b/common_level3.h @@ -37,9 +37,6 @@ 
/*********************************************************************/ #ifndef ASSEMBLER -#ifndef BFLOAT16 -typedef unsigned short bfloat16; -#endif #ifdef __CUDACC__ __global__ void cuda_sgemm_kernel(int, int, int, float *, float *, float *); diff --git a/common_param.h b/common_param.h index 446d42452..19a34fa3d 100644 --- a/common_param.h +++ b/common_param.h @@ -43,10 +43,6 @@ #ifdef DYNAMIC_ARCH -#ifndef BFLOAT16 -typedef unsigned short bfloat16; -#endif - typedef struct { int dtb_entries; int offsetA, offsetB, align; diff --git a/common_sh.h b/common_sh.h index 8859694f1..7a0045762 100644 --- a/common_sh.h +++ b/common_sh.h @@ -1,5 +1,5 @@ -#ifndef COMMON_H_H -#define COMMON_H_H +#ifndef COMMON_SH_H +#define COMMON_SH_H #ifndef DYNAMIC_ARCH diff --git a/exports/gensymbol b/exports/gensymbol index d2894e6c8..235446f14 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -30,7 +30,7 @@ icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - smax,smin,snrm2, + shgemm, smax,smin,snrm2, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, @@ -67,7 +67,7 @@ cblas_isamax, cblas_izamax, cblas_sasum, cblas_saxpy, cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, - cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, diff --git a/kernel/common_param.h b/kernel/common_param.h index eab14b0a6..29bb65e5c 100644 --- 
a/kernel/common_param.h +++ b/kernel/common_param.h @@ -47,6 +47,100 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; +#if 1 + int shgemm_p, shgemm_q, shgemm_r; + int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + + float (*shamax_k) (BLASLONG, float *, BLASLONG); + float (*shamin_k) (BLASLONG, float *, BLASLONG); + float (*shmax_k) (BLASLONG, float *, BLASLONG); + float (*shmin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); + + float (*shnrm2_k) (BLASLONG, float *, BLASLONG); + float (*shasum_k) (BLASLONG, float *, BLASLONG); + float (*shsum_k) (BLASLONG, float *, BLASLONG); + int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, 
BLASLONG, float *, BLASLONG, float *); + + int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + + int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + + int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_outncopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_olnucopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + +#endif int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; @@ -84,6 +178,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -907,6 +1002,13 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#define SHGEMM_P gotoblas -> shgemm_p +#define SHGEMM_Q gotoblas -> shgemm_q +#define SHGEMM_R gotoblas -> shgemm_r +#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m +#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n +#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn + #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -984,6 +1086,17 @@ extern gotoblas_t 
*gotoblas; #define HAVE_EX_L2 0 #endif +#define SHGEMM_P SHGEMM_DEFAULT_P +#define SHGEMM_Q SHGEMM_DEFAULT_Q +#define SHGEMM_R SHGEMM_DEFAULT_R +#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#ifdef SHGEMM_DEFAULT_UNROLL_MN +#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#else +#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#endif + #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q #define SGEMM_R SGEMM_DEFAULT_R @@ -1119,6 +1232,18 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#elif defined(HALF) +#define GEMM_P SHGEMM_P +#define GEMM_Q SHGEMM_Q +#define GEMM_R SHGEMM_R +#define GEMM_UNROLL_M SHGEMM_UNROLL_M +#define GEMM_UNROLL_N SHGEMM_UNROLL_N +#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1204,6 +1329,10 @@ extern gotoblas_t *gotoblas; #define GEMM_THREAD gemm_thread_n #endif +#ifndef SHGEMM_DEFAULT_R +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#endif + #ifndef SGEMM_DEFAULT_R #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 79cd151f6..b7cf0f112 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -958,6 +958,8 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused 
variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ + TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; + TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -1329,7 +1331,6 @@ static void init_parameter(void) { - TABLE_NAME.shgemm_p = ((TABLE_NAME.shgemm_p + SHGEMM_DEFAULT_UNROLL_M - 1)/SHGEMM_DEFAULT_UNROLL_M) * SHGEMM_DEFAULT_UNROLL_M; TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M; @@ -1357,11 +1358,6 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); #endif - TABLE_NAME.shgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.shgemm_p * TABLE_NAME.shgemm_q * 4 + TABLE_NAME.offsetA - + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.shgemm_q * 4) - 15) & ~15); - TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) From 6b7ef6543a998ea9f1063873d04469503be38766 Mon Sep 17 00:00:00 2001 From: l00536773 Date: Thu, 16 Apr 2020 10:55:10 +0800 Subject: [PATCH 121/593] [OpenBLAS]: benchmark error of potrf [description]: when the matrix size goes higher than 5800 during the cpotrf test, error info, such as "Potrf info = 5679", will be returned on ARM64 and x86 machines. Uplo = L & F. [solution]: changed the func for building the matrix so that the complex Hermitian matrix can stay positive definite during the computation. 
[dts]: --- benchmark/potrf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 580e46072..cb4c23bab 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -193,14 +193,14 @@ int main(int argc, char *argv[]){ a[((long)j + (long)j * (long)m) * 2 + 1] = 0.; for(i = j + 1; i < m; i++) { - a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0; a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) { - a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[((long)i + (long)j * (long)m) * 2 + 0] = 0.; a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; } From 22bb50fb8115909ab8ba4a977913cd6adc1b3290 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 17 Apr 2020 13:35:17 -0500 Subject: [PATCH 122/593] cmake fixes --- CMakeLists.txt | 6 ++ cmake/kernel.cmake | 39 +++++++- cmake/utils.cmake | 7 ++ common_macro.h | 213 +++++++++++++++++++++++++++++++++++++++++- ctest/CMakeLists.txt | 3 + kernel/CMakeLists.txt | 21 ++++- lapack/CMakeLists.txt | 4 + 7 files changed, 287 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 951271717..20cf741c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all + set(BUILD_HALF true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -120,6 +121,11 @@ if (BUILD_COMPLEX16) list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () +if (BUILD_SINGLE OR BUILD_HALF) + message(STATUS "Building Half Precision") + list(APPEND 
FLOAT_TYPES "HALF") # defines nothing +endif () + if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.") endif () diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 9b238f004..7b64a03fc 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,11 +113,29 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) + set(SHAMINKERNEL ../arm/amin.c) + set(SHAMAXKERNEL amax.S) + set(SHMAXKERNEL ../arm/max.c) + set(SHMINKERNEL ../arm/min.c) + set(ISHAMAXKERNEL iamax.S) + set(ISHAMINKERNEL ../arm/iamin.c) + set(ISHMAXKERNEL ../arm/imax.c) + set(ISHMINKERNEL ../arm/imin.c) + set(SHASUMKERNEL asum.S) + set(SHAXPYKERNEL axpy.S) + set(SHAXPBYKERNEL ../arm/axpby.c) + set(SHCOPYKERNEL copy.S) + set(SHDOTKERNEL dot.S) + set(SHROTKERNEL rot.S) + set(SHSCALKERNEL scal.S) + set(SHNRM2KERNEL nrm2.S) + set(SHSUMKERNEL sum.S) + set(SHSWAPKERNEL swap.S) endmacro () macro(SetDefaultL2) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) + set(SGEMVNKERNEL ../arm/gemv_n.c) + set(SGEMVTKERNEL ../arm/gemv_t.c) set(DGEMVNKERNEL gemv_n.S) set(DGEMVTKERNEL gemv_t.S) set(CGEMVNKERNEL zgemv_n.S) @@ -161,6 +179,10 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) + set(SHGEMVNKERNEL ../arm/gemv_n.c) + set(SHGEMVTKERNEL ../arm/gemv_t.c) + set(SHGERKERNEL ../generic/ger.c) + endmacro () macro(SetDefaultL3) @@ -168,4 +190,17 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) + set(SHGEADD_KERNEL ../generic/geadd.c) + set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) + set(SHGEMM_BETA ../generic/gemm_beta.c) + set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) + set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) + set(SHGEMMONCOPY 
../generic/gemm_ncopy_2.c) + set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) + set(SHGEMMINCOPYOBJ shgemm_incopy.o) + set(SHGEMMITCOPYOBJ shgemm_itcopy.o) + set(SHGEMMONCOPYOBJ shgemm_oncopy.o) + set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) + + endmacro () diff --git a/cmake/utils.cmake b/cmake/utils.cmake index fd93f8a70..831ddffe6 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -163,6 +163,7 @@ function(GenerateNamedObjects sources_in) if (complex_only) list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "DOUBLE") + list(REMOVE_ITEM float_list "HALF") elseif (real_only) list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX") @@ -176,6 +177,9 @@ function(GenerateNamedObjects sources_in) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "sh") + endif () endif () if (NOT name_in) @@ -210,6 +214,9 @@ function(GenerateNamedObjects sources_in) if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () + if (${float_type} STREQUAL "HALF") + list(APPEND obj_defines "HALF") + endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") if (mangle_complex_sources) diff --git a/common_macro.h b/common_macro.h index b438c83ba..2166e62a2 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,6 +646,19 @@ #elif defined(HALF) +#define AXPYU_K SAXPYU_K +#define AXPYC_K SAXPYC_K +#define SCAL_K SSCAL_K +#define GEMV_N SGEMV_N +#define GEMV_T SGEMV_T +#define SYMV_U SSYMV_U +#define SYMV_L SSYMV_L +#define GERU_K SGERU_K +#define GERC_K SGERC_K +#define GERV_K SGERV_K +#define GERD_K SGERD_K +#define SYMV_THREAD_U SSYMV_THREAD_U +#define SYMV_THREAD_L SSYMV_THREAD_L #define GEMM_BETA SHGEMM_BETA #define GEMM_KERNEL_N SHGEMM_KERNEL #define GEMM_KERNEL_L SHGEMM_KERNEL @@ -672,6 +685,20 @@ #define GEMM_OTCOPY 
SHGEMM_OTCOPY #define GEMM_INCOPY SHGEMM_INCOPY #define GEMM_ITCOPY SHGEMM_ITCOPY +#define SYMM_THREAD_LU SSYMM_THREAD_LU +#define SYMM_THREAD_LL SSYMM_THREAD_LL +#define SYMM_THREAD_RU SSYMM_THREAD_RU +#define SYMM_THREAD_RL SSYMM_THREAD_RL +#define SYMM_LU SSYMM_LU +#define SYMM_LL SSYMM_LL +#define SYMM_RU SSYMM_RU +#define SYMM_RL SSYMM_RL + + +#define HEMM_THREAD_LU SHEMM_THREAD_LU +#define HEMM_THREAD_LL SHEMM_THREAD_LL +#define HEMM_THREAD_RU SHEMM_THREAD_RU +#define HEMM_THREAD_RL SHEMM_THREAD_RL #define GEMM_THREAD_NN SHGEMM_THREAD_NN #define GEMM_THREAD_CN SHGEMM_THREAD_TN @@ -690,6 +717,186 @@ #define GEMM_THREAD_RC SHGEMM_THREAD_NT #define GEMM_THREAD_RR SHGEMM_THREAD_NN +#ifdef UNIT + +#define TRMM_OUNCOPY STRMM_OUNUCOPY +#define TRMM_OUTCOPY STRMM_OUTUCOPY +#define TRMM_OLNCOPY STRMM_OLNUCOPY +#define TRMM_OLTCOPY STRMM_OLTUCOPY +#define TRSM_OUNCOPY STRSM_OUNUCOPY +#define TRSM_OUTCOPY STRSM_OUTUCOPY +#define TRSM_OLNCOPY STRSM_OLNUCOPY +#define TRSM_OLTCOPY STRSM_OLTUCOPY + +#define TRMM_IUNCOPY STRMM_IUNUCOPY +#define TRMM_IUTCOPY STRMM_IUTUCOPY +#define TRMM_ILNCOPY STRMM_ILNUCOPY +#define TRMM_ILTCOPY STRMM_ILTUCOPY +#define TRSM_IUNCOPY STRSM_IUNUCOPY +#define TRSM_IUTCOPY STRSM_IUTUCOPY +#define TRSM_ILNCOPY STRSM_ILNUCOPY +#define TRSM_ILTCOPY STRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY STRMM_OUNNCOPY +#define TRMM_OUTCOPY STRMM_OUTNCOPY +#define TRMM_OLNCOPY STRMM_OLNNCOPY +#define TRMM_OLTCOPY STRMM_OLTNCOPY +#define TRSM_OUNCOPY STRSM_OUNNCOPY +#define TRSM_OUTCOPY STRSM_OUTNCOPY +#define TRSM_OLNCOPY STRSM_OLNNCOPY +#define TRSM_OLTCOPY STRSM_OLTNCOPY + +#define TRMM_IUNCOPY STRMM_IUNNCOPY +#define TRMM_IUTCOPY STRMM_IUTNCOPY +#define TRMM_ILNCOPY STRMM_ILNNCOPY +#define TRMM_ILTCOPY STRMM_ILTNCOPY +#define TRSM_IUNCOPY STRSM_IUNNCOPY +#define TRSM_IUTCOPY STRSM_IUTNCOPY +#define TRSM_ILNCOPY STRSM_ILNNCOPY +#define TRSM_ILTCOPY STRSM_ILTNCOPY + +#define TRMM_KERNEL_LN STRMM_KERNEL_LN +#define TRMM_KERNEL_LT STRMM_KERNEL_LT +#define 
TRMM_KERNEL_LR STRMM_KERNEL_LN +#define TRMM_KERNEL_LC STRMM_KERNEL_LT +#define TRMM_KERNEL_RN STRMM_KERNEL_RN +#define TRMM_KERNEL_RT STRMM_KERNEL_RT +#define TRMM_KERNEL_RR STRMM_KERNEL_RN +#define TRMM_KERNEL_RC STRMM_KERNEL_RT + +#define TRSM_KERNEL_LN STRSM_KERNEL_LN +#define TRSM_KERNEL_LT STRSM_KERNEL_LT +#define TRSM_KERNEL_LR STRSM_KERNEL_LN +#define TRSM_KERNEL_LC STRSM_KERNEL_LT +#define TRSM_KERNEL_RN STRSM_KERNEL_RN +#define TRSM_KERNEL_RT STRSM_KERNEL_RT +#define TRSM_KERNEL_RR STRSM_KERNEL_RN +#define TRSM_KERNEL_RC STRSM_KERNEL_RT + +#define SYMM_IUTCOPY SSYMM_IUTCOPY +#define SYMM_ILTCOPY SSYMM_ILTCOPY +#define SYMM_OUTCOPY SSYMM_OUTCOPY +#define SYMM_OLTCOPY SSYMM_OLTCOPY +#define TRMM_LNUU STRMM_LNUU +#define TRMM_LNUN STRMM_LNUN +#define TRMM_LNLU STRMM_LNLU +#define TRMM_LNLN STRMM_LNLN +#define TRMM_LTUU STRMM_LTUU +#define TRMM_LTUN STRMM_LTUN +#define TRMM_LTLU STRMM_LTLU +#define TRMM_LTLN STRMM_LTLN +#define TRMM_LRUU STRMM_LNUU +#define TRMM_LRUN STRMM_LNUN +#define TRMM_LRLU STRMM_LNLU +#define TRMM_LRLN STRMM_LNLN +#define TRMM_LCUU STRMM_LTUU +#define TRMM_LCUN STRMM_LTUN +#define TRMM_LCLU STRMM_LTLU +#define TRMM_LCLN STRMM_LTLN +#define TRMM_RNUU STRMM_RNUU +#define TRMM_RNUN STRMM_RNUN +#define TRMM_RNLU STRMM_RNLU +#define TRMM_RNLN STRMM_RNLN +#define TRMM_RTUU STRMM_RTUU +#define TRMM_RTUN STRMM_RTUN +#define TRMM_RTLU STRMM_RTLU +#define TRMM_RTLN STRMM_RTLN +#define TRMM_RRUU STRMM_RNUU +#define TRMM_RRUN STRMM_RNUN +#define TRMM_RRLU STRMM_RNLU +#define TRMM_RRLN STRMM_RNLN +#define TRMM_RCUU STRMM_RTUU +#define TRMM_RCUN STRMM_RTUN +#define TRMM_RCLU STRMM_RTLU +#define TRMM_RCLN STRMM_RTLN + +#define TRSM_LNUU STRSM_LNUU +#define TRSM_LNUN STRSM_LNUN +#define TRSM_LNLU STRSM_LNLU +#define TRSM_LNLN STRSM_LNLN +#define TRSM_LTUU STRSM_LTUU +#define TRSM_LTUN STRSM_LTUN +#define TRSM_LTLU STRSM_LTLU +#define TRSM_LTLN STRSM_LTLN +#define TRSM_LRUU STRSM_LNUU +#define TRSM_LRUN STRSM_LNUN +#define TRSM_LRLU STRSM_LNLU +#define 
TRSM_LRLN STRSM_LNLN +#define TRSM_LCUU STRSM_LTUU +#define TRSM_LCUN STRSM_LTUN +#define TRSM_LCLU STRSM_LTLU +#define TRSM_LCLN STRSM_LTLN +#define TRSM_RNUU STRSM_RNUU +#define TRSM_RNUN STRSM_RNUN +#define TRSM_RNLU STRSM_RNLU +#define TRSM_RNLN STRSM_RNLN +#define TRSM_RTUU STRSM_RTUU +#define TRSM_RTUN STRSM_RTUN +#define TRSM_RTLU STRSM_RTLU +#define TRSM_RTLN STRSM_RTLN +#define TRSM_RRUU STRSM_RNUU +#define TRSM_RRUN STRSM_RNUN +#define TRSM_RRLU STRSM_RNLU +#define TRSM_RRLN STRSM_RNLN +#define TRSM_RCUU STRSM_RTUU +#define TRSM_RCUN STRSM_RTUN +#define TRSM_RCLU STRSM_RTLU +#define TRSM_RCLN STRSM_RTLN +#define SYRK_UN SSYRK_UN +#define SYRK_UT SSYRK_UT +#define SYRK_LN SSYRK_LN +#define SYRK_LT SSYRK_LT +#define SYRK_UR SSYRK_UN +#define SYRK_UC SSYRK_UT +#define SYRK_LR SSYRK_LN +#define SYRK_LC SSYRK_LT + +#define SYRK_KERNEL_U SSYRK_KERNEL_U +#define SYRK_KERNEL_L SSYRK_KERNEL_L + +#define HERK_UN SSYRK_UN +#define HERK_LN SSYRK_LN +#define HERK_UC SSYRK_UT +#define HERK_LC SSYRK_LT + +#define HER2K_UN SSYR2K_UN +#define HER2K_LN SSYR2K_LN +#define HER2K_UC SSYR2K_UT +#define HER2K_LC SSYR2K_LT + +#define SYR2K_UN SSYR2K_UN +#define SYR2K_UT SSYR2K_UT +#define SYR2K_LN SSYR2K_LN +#define SYR2K_LT SSYR2K_LT +#define SYR2K_UR SSYR2K_UN +#define SYR2K_UC SSYR2K_UT +#define SYR2K_LR SSYR2K_LN +#define SYR2K_LC SSYR2K_LT + +#define SYR2K_KERNEL_U SSYR2K_KERNEL_U +#define SYR2K_KERNEL_L SSYR2K_KERNEL_L +#define SYRK_THREAD_UN SSYRK_THREAD_UN +#define SYRK_THREAD_UT SSYRK_THREAD_UT +#define SYRK_THREAD_LN SSYRK_THREAD_LN +#define SYRK_THREAD_LT SSYRK_THREAD_LT +#define SYRK_THREAD_UR SSYRK_THREAD_UR +#define SYRK_THREAD_UC SSYRK_THREAD_UC +#define SYRK_THREAD_LR SSYRK_THREAD_LN +#define SYRK_THREAD_LC SSYRK_THREAD_LT + +#define HERK_THREAD_UN SSYRK_THREAD_UN +#define HERK_THREAD_UT SSYRK_THREAD_UT +#define HERK_THREAD_LN SSYRK_THREAD_LN +#define HERK_THREAD_LT SSYRK_THREAD_LT +#define HERK_THREAD_UR SSYRK_THREAD_UR +#define HERK_THREAD_UC SSYRK_THREAD_UC 
+#define HERK_THREAD_LR SSYRK_THREAD_LN +#define HERK_THREAD_LC SSYRK_THREAD_LT + +#endif + #else #define AMAX_K SAMAX_K @@ -721,14 +928,14 @@ #define GEMV_S SGEMV_S #define GEMV_D SGEMV_D + +#define SYMV_U SSYMV_U +#define SYMV_L SSYMV_L #define GERU_K SGERU_K #define GERC_K SGERC_K #define GERV_K SGERV_K #define GERD_K SGERD_K -#define SYMV_U SSYMV_U -#define SYMV_L SSYMV_L - #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 14c9d1944..8d301c239 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -12,6 +12,9 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) string(TOLOWER ${float_char_upper} float_char) + if (${float_char} STREQUAL "h") + continue() + endif() #level1 add_executable(x${float_char}cblat1 c_${float_char}blat1.f diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 35e0fff25..4113a1647 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -41,6 +41,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) if (DEFINED ${float_char}MAXKERNEL) @@ -93,6 +96,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () if (${float_type} 
STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -128,13 +134,19 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () - foreach (float_type SINGLE DOUBLE) + foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -470,9 +482,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + # Makefile.LA if(NOT NO_LAPACK) foreach (float_type ${FLOAT_TYPES}) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () if (NOT DEFINED ${float_char}NEG_TCOPY) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c) @@ -516,6 +532,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) + if (${float_type} STREQUAL "HALF") + set (float_char "SH") + endif () GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" 
${TSUFFIX} false ${float_type}) GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) endforeach () diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index e21a9aabb..778e6f8fa 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -2,6 +2,7 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) +list (REMOVE_ITEM FLOAT_TYPES "HALF") set(LAPACK_SOURCES potrf/potrf_U_single.c @@ -45,6 +46,9 @@ GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" fa GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) foreach (float_type ${FLOAT_TYPES}) +if (${float_type} STREQUAL "HALF") + continue() +endif() GenerateNamedObjects("getrf/getrf_single.c" "UNIT" "getrf_single" false "" "" false ${float_type}) endforeach () From 9f6d6f6cb69ba871a887ecc9751fbc2d529e1b98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Apr 2020 22:27:58 +0200 Subject: [PATCH 123/593] use saxpy.c instead of axpy.S for SHAXPY --- cmake/kernel.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 7b64a03fc..c8244d833 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -122,7 +122,7 @@ macro(SetDefaultL1) set(ISHMAXKERNEL ../arm/imax.c) set(ISHMINKERNEL ../arm/imin.c) set(SHASUMKERNEL asum.S) - set(SHAXPYKERNEL axpy.S) + set(SHAXPYKERNEL saxpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL copy.S) set(SHDOTKERNEL dot.S) From f361de30a363d9f262daa9272525468c3b884e27 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 11:07:16 +0200 Subject: [PATCH 124/593] Use generic axpy.c for SHAXPY as x86 lacks saxpy.c --- cmake/kernel.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index c8244d833..38096ad18 100644 --- a/cmake/kernel.cmake +++ 
b/cmake/kernel.cmake @@ -122,7 +122,7 @@ macro(SetDefaultL1) set(ISHMAXKERNEL ../arm/imax.c) set(ISHMINKERNEL ../arm/imin.c) set(SHASUMKERNEL asum.S) - set(SHAXPYKERNEL saxpy.c) + set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL copy.S) set(SHDOTKERNEL dot.S) From e7afe8a969af29e2f25e3d3349c03c9c912b669e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 11:10:15 +0200 Subject: [PATCH 125/593] Define AXPBY_K fallback for float16 --- common_macro.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_macro.h b/common_macro.h index 2166e62a2..95e5b1061 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,6 +648,7 @@ #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K +#define AXPBY_K SAXPBY_K #define SCAL_K SSCAL_K #define GEMV_N SGEMV_N #define GEMV_T SGEMV_T From 0a19bd813cad97a5adc8577d1b103afadfbd911c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 12:52:51 +0200 Subject: [PATCH 126/593] Use generic codes for shamax and shcopy --- cmake/kernel.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 38096ad18..27d1ad630 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -114,7 +114,7 @@ macro(SetDefaultL1) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) set(SHAMINKERNEL ../arm/amin.c) - set(SHAMAXKERNEL amax.S) + set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) set(SHMINKERNEL ../arm/min.c) set(ISHAMAXKERNEL iamax.S) @@ -124,7 +124,7 @@ macro(SetDefaultL1) set(SHASUMKERNEL asum.S) set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) - set(SHCOPYKERNEL copy.S) + set(SHCOPYKERNEL ../arm/copy.c) set(SHDOTKERNEL dot.S) set(SHROTKERNEL rot.S) set(SHSCALKERNEL scal.S) From a83a59b0381e719011685cda3081e20aa59eaaee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 15:53:51 +0200 Subject: [PATCH 127/593] Use generic kernels for ishama,shasum,shdot,shrot --- cmake/kernel.cmake | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 27d1ad630..f50244e7d 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -117,16 +117,16 @@ macro(SetDefaultL1) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) set(SHMINKERNEL ../arm/min.c) - set(ISHAMAXKERNEL iamax.S) + set(ISHAMAXKERNEL ../arm/iamax.c) set(ISHAMINKERNEL ../arm/iamin.c) set(ISHMAXKERNEL ../arm/imax.c) set(ISHMINKERNEL ../arm/imin.c) - set(SHASUMKERNEL asum.S) + set(SHASUMKERNEL ../arm/asum.c) set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL ../arm/copy.c) - set(SHDOTKERNEL dot.S) - set(SHROTKERNEL rot.S) + set(SHDOTKERNEL ../arm/dot.c) + set(SHROTKERNEL ../arm/rot.c) set(SHSCALKERNEL scal.S) set(SHNRM2KERNEL nrm2.S) set(SHSUMKERNEL sum.S) From c7d668c2481303e2fab76d86e9b47fe40b361c22 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 16:04:38 +0200 Subject: [PATCH 128/593] Update common_macro.h --- common_macro.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/common_macro.h b/common_macro.h index 95e5b1061..9eff94e8e 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,6 +646,17 @@ #elif defined(HALF) +#define AMAX_K SAMAX_K +#define AMIN_K SAMIN_K +#define MAX_K SMAX_K +#define MIN_K SMIN_K +#define IAMAX_K ISAMAX_K +#define IAMIN_K ISAMIN_K +#define IMAX_K ISMAX_K +#define IMIN_K ISMIN_K +#define ASUM_K SASUM_K +#define DOTU_K SDOTU_K +#define DOTC_K SDOTC_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K @@ -658,6 +669,10 @@ #define GERC_K SGERC_K #define GERV_K SGERV_K #define GERD_K SGERD_K +#define SUM_K SSUM_K +#define SWAP_K SSWAP_K +#define ROT_K SROT_K +#define COPY_K SCOPY_K #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L #define GEMM_BETA SHGEMM_BETA From 61bbae3ac1a9e8c4399c9889af6b5533636c03c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 21:09:32 
+0200 Subject: [PATCH 129/593] Handle MIPS24K like P5600 and allow enforcing TARGET=1004K as well (omission from earlier 1004K merge and later introduction of TARGET check) --- cpuid_mips.c | 20 +++++++++++++++++--- getarch.c | 28 ++++++++++++++++++++++++++++ param.h | 8 +++++++- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 6f2932c94..df3541536 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_UNKNOWN 0 #define CPU_P5600 1 #define CPU_1004K 2 +#define CPU_24K 3 static char *cpuname[] = { "UNKNOWN", "P5600", - "1004K" + "1004K", + "24K" }; int detect(void){ @@ -105,6 +107,8 @@ int detect(void){ return CPU_P5600; } else if (strstr(p, "1004K")) { return CPU_1004K; + } else if (strstr(p, " 24K")) { + return CPU_24K; } else return CPU_UNKNOWN; } @@ -121,7 +125,7 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_P5600|| detect()==CPU_1004K){ + if(detect()==CPU_P5600|| detect()==CPU_1004K|| detect()==CPU_24K){ printf("P5600"); }else{ printf("UNKNOWN"); @@ -146,7 +150,15 @@ void get_cpuconfig(void){ printf("#define MIPS1004K\n"); printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 32\n"); - printf("#define L2_SIZE 26144\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define DTB_DEFAULT_ENTRIES 8\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + } else if (detect()==CPU_24K) { + printf("#define MIPS24K\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 32768\n"); printf("#define DTB_DEFAULT_ENTRIES 8\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); @@ -160,6 +172,8 @@ void get_libname(void){ printf("p5600\n"); } else if (detect()==CPU_1004K) { printf("1004K\n"); + } else if (detect()==CPU_24K) { + printf("24K\n"); }else{ printf("mips\n"); 
} diff --git a/getarch.c b/getarch.c index 145753bcc..8a6684975 100644 --- a/getarch.c +++ b/getarch.c @@ -812,6 +812,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_1004K +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "1004K" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-D1004K " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "1004K" +#define CORENAME "1004K" +#else +#endif + +#ifdef FORCE_24K +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "24K" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-D24K " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "24K" +#define CORENAME "24K" +#else +#endif + #ifdef FORCE_I6500 #define FORCE #define ARCHITECTURE "MIPS" diff --git a/param.h b/param.h index d6cbe544a..2795947c5 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H +#define SHGEMM_DEFAULT_UNROLL_N 4 +#define SHGEMM_DEFAULT_UNROLL_M 8 +#define SHGEMM_DEFAULT_UNROLL_MN 32 +#define SHGEMM_DEFAULT_P 256 +#define SHGEMM_DEFAULT_R 256 +#define SHGEMM_DEFAULT_Q 256 #ifdef OPTERON #define SNUMOPT 4 @@ -2468,7 +2474,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif -#if defined(P5600) || defined(MIPS1004K) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From d712ea724cbf517fb8a40607cf5381dd453dbf92 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 21:10:18 +0200 Subject: [PATCH 130/593] Add MIPS24K support --- kernel/mips/KERNEL.24K | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/mips/KERNEL.24K diff --git a/kernel/mips/KERNEL.24K b/kernel/mips/KERNEL.24K new file mode 100644 index 000000000..67135356e --- /dev/null +++ b/kernel/mips/KERNEL.24K @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.P5600 From 00172d440bfc7dedc8523a4cdad58b685801bb76 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 21:16:49 +0200 Subject: [PATCH 131/593] Typo fix in MIPS24K addition --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 2795947c5..9fdf40fe2 100644 --- a/param.h +++ b/param.h @@ -2474,7 +2474,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif -#if defined(P5600) || defined(MIPS1004K) defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From 7dbb59b256d47507fa8a11c03b98857b957e42d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 21:34:14 +0200 Subject: [PATCH 132/593] Update common_macro.h --- common_macro.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/common_macro.h b/common_macro.h index 9eff94e8e..8fe1f156f 100644 --- a/common_macro.h +++ b/common_macro.h @@ -673,6 +673,7 @@ #define SWAP_K SSWAP_K #define ROT_K SROT_K #define COPY_K SCOPY_K +#define NRM2_K SNRM2_K #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L #define GEMM_BETA SHGEMM_BETA @@ -911,6 +912,17 @@ #define HERK_THREAD_LR SSYRK_THREAD_LN #define HERK_THREAD_LC SSYRK_THREAD_LT +#define OMATCOPY_K_CN SOMATCOPY_K_CN +#define OMATCOPY_K_RN SOMATCOPY_K_RN +#define OMATCOPY_K_CT SOMATCOPY_K_CT +#define OMATCOPY_K_RT SOMATCOPY_K_RT +#define IMATCOPY_K_CN SIMATCOPY_K_CN +#define IMATCOPY_K_RN SIMATCOPY_K_RN +#define IMATCOPY_K_CT SIMATCOPY_K_CT +#define IMATCOPY_K_RT SIMATCOPY_K_RT + +#define GEADD_K SGEADD_K + #endif #else From d0737b014288c2808ab679c0a609a37a5f5be286 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 21:36:28 +0200 Subject: [PATCH 133/593] Update kernel.cmake --- cmake/kernel.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index f50244e7d..19e760c56 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -128,7 +128,7 @@ macro(SetDefaultL1) set(SHDOTKERNEL ../arm/dot.c) set(SHROTKERNEL ../arm/rot.c) set(SHSCALKERNEL scal.S) - set(SHNRM2KERNEL nrm2.S) + set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL sum.S) set(SHSWAPKERNEL swap.S) endmacro () From a1fc98dc57f896450c3a807814ad36f541eb112f 
Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Apr 2020 23:50:23 +0200 Subject: [PATCH 134/593] rename 1004K, 24K to MIPS1004K, MIPS24K to avoid identifier naming problem --- cpuid_mips.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index df3541536..3a2e12393 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -78,8 +78,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static char *cpuname[] = { "UNKNOWN", "P5600", - "1004K", - "24K" + "MIPS1004K", + "MIPS24K" }; int detect(void){ @@ -171,9 +171,9 @@ void get_libname(void){ if(detect()==CPU_P5600) { printf("p5600\n"); } else if (detect()==CPU_1004K) { - printf("1004K\n"); + printf("mips1004K\n"); } else if (detect()==CPU_24K) { - printf("24K\n"); + printf("mips24K\n"); }else{ printf("mips\n"); } From b0b02a080d06f41d4132c75876c073fade5feb8b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 06:50:51 +0200 Subject: [PATCH 135/593] Add compiler options for MIPS32 24K/1004K --- Makefile.prebuild | 6 +++++- Makefile.system | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index b00f13368..48fb5e991 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -17,7 +17,11 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif -ifeq ($(TARGET), 1004K) +ifeq ($(TARGET), MIPS24K) +TARGET_FLAGS = -mips32r2 +endif + +ifeq ($(TARGET), MIPS1004K) TARGET_FLAGS = -mips32r2 endif diff --git a/Makefile.system b/Makefile.system index 2998c0e6a..51bd1c4bd 100644 --- a/Makefile.system +++ b/Makefile.system @@ -690,7 +690,12 @@ CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif -ifeq ($(CORE), 1004K) +ifeq ($(CORE), MIPS24K) +CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) +FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) +endif + +ifeq ($(CORE), MIPS1004K) CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) endif @@ 
-1390,6 +1395,8 @@ export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export SHGEMM_UNROLL_M +export SHGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M From 6721f2750eabc6b9839e1b4d1aef39fa46810bc7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 06:51:57 +0200 Subject: [PATCH 136/593] Update TargetList.txt --- TargetList.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index f4a40ed02..e2d2f4026 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -58,7 +58,8 @@ CELL 3.MIPS CPU: P5600 -1004K +MIPS1004K +MIPS24K 4.MIPS64 CPU: SICORTEX From 577c5d9f8fc3b1776bbaf5d6f15eabc3c0b8d170 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 06:54:52 +0200 Subject: [PATCH 137/593] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 61393bd8f..6dc3c7b42 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. - **AMD ZEN**: Uses Haswell codes with some optimizations. +#### MIPS32 + +- **MIPS 1004K**: uses P5600 codes +- **MIPS 24K**: uses P5600 codes + #### MIPS64 - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. From 8792fc4d5f0dd69de1024963611304dd291e9792 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 07:21:48 +0200 Subject: [PATCH 138/593] Disable RPCC macro on MIPS24K --- common_mips.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_mips.h b/common_mips.h index 2cc923043..dd2f8d558 100644 --- a/common_mips.h +++ b/common_mips.h @@ -43,6 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef ASSEMBLER +#if !defined(MIPS24K) static inline unsigned int rpcc(void){ unsigned long ret; @@ -53,6 +54,7 @@ static inline unsigned int rpcc(void){ return ret; } #define RPCC_DEFINED +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; From 4f70512b978c39237d6e7e17bfeaa336b69f957d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 08:10:26 +0200 Subject: [PATCH 139/593] Update kernel.cmake --- cmake/kernel.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 19e760c56..1c1fed571 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -127,10 +127,10 @@ macro(SetDefaultL1) set(SHCOPYKERNEL ../arm/copy.c) set(SHDOTKERNEL ../arm/dot.c) set(SHROTKERNEL ../arm/rot.c) - set(SHSCALKERNEL scal.S) + set(SHSCALKERNEL ../arm/scal.c) set(SHNRM2KERNEL ../arm/nrm2.c) - set(SHSUMKERNEL sum.S) - set(SHSWAPKERNEL swap.S) + set(SHSUMKERNEL ../arm/sum.c) + set(SHSWAPKERNEL ../arm/swap.c) endmacro () macro(SetDefaultL2) From 2f4a8e5bc4504c0ba9faca82e0ebfb4d50120a48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 13:22:19 +0200 Subject: [PATCH 140/593] Rename the FORCE entries for 24K and 1004K to include the MIPS prefix --- getarch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index 8a6684975..6be006aee 100644 --- a/getarch.c +++ b/getarch.c @@ -812,7 +812,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#ifdef FORCE_1004K +#ifdef FORCE_MIPS1004K #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "1004K" @@ -826,7 +826,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif -#ifdef FORCE_24K +#ifdef FORCE_MIPS24K #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "24K" From 0d18f231fc4d5591a30e995c695c3b729b4178a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 13:52:58 +0200 Subject: [PATCH 141/593] Update getarch.c --- getarch.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/getarch.c b/getarch.c index 6be006aee..0bd5bc601 100644 --- a/getarch.c +++ b/getarch.c @@ -815,28 +815,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef FORCE_MIPS1004K #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "1004K" +#define SUBARCHITECTURE "MIPS1004K" #define SUBDIRNAME "mips" #define ARCHCONFIG "-D1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " -#define LIBNAME "1004K" -#define CORENAME "1004K" +#define LIBNAME "mips1004K" +#define CORENAME "MIPS1004K" #else #endif #ifdef FORCE_MIPS24K #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "24K" +#define SUBARCHITECTURE "MIPS24K" #define SUBDIRNAME "mips" #define ARCHCONFIG "-D24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " -#define LIBNAME "24K" -#define CORENAME "24K" +#define LIBNAME "mips24K" +#define CORENAME "MIPS24K" #else #endif From 5afb66812f21406b290878388bc3a74dde455910 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 14:55:31 +0200 Subject: [PATCH 142/593] Update getarch.c --- getarch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index 0bd5bc601..e739d2de9 100644 --- a/getarch.c +++ b/getarch.c @@ -817,7 +817,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "MIPS1004K" #define SUBDIRNAME "mips" -#define ARCHCONFIG "-D1004K " \ +#define ARCHCONFIG "-DMIPS1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " @@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "MIPS24K" #define SUBDIRNAME "mips" -#define ARCHCONFIG "-D24K " \ +#define ARCHCONFIG "-DMIPS24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " From 6a04efb1227fc6afd2ba72987b1780344da9eae6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 15:43:54 +0200 Subject: [PATCH 143/593] Rename KERNEL files to include MIPS prefix --- kernel/mips/KERNEL.MIPS1004K | 1 + kernel/mips/KERNEL.MIPS24K | 1 + 2 files changed, 2 insertions(+) create mode 100644 kernel/mips/KERNEL.MIPS1004K create mode 100644 kernel/mips/KERNEL.MIPS24K diff --git a/kernel/mips/KERNEL.MIPS1004K b/kernel/mips/KERNEL.MIPS1004K new file mode 100644 index 000000000..67135356e --- /dev/null +++ b/kernel/mips/KERNEL.MIPS1004K @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.P5600 diff --git a/kernel/mips/KERNEL.MIPS24K b/kernel/mips/KERNEL.MIPS24K new file mode 100644 index 000000000..67135356e --- /dev/null +++ b/kernel/mips/KERNEL.MIPS24K @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.P5600 From 7353ea5afc682d12944cfd97ca8a24daa83304a3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 15:44:19 +0200 Subject: [PATCH 144/593] Delete KERNEL.24K --- kernel/mips/KERNEL.24K | 1 - 1 file changed, 1 deletion(-) delete mode 100644 kernel/mips/KERNEL.24K diff --git a/kernel/mips/KERNEL.24K b/kernel/mips/KERNEL.24K deleted file mode 100644 index 67135356e..000000000 --- a/kernel/mips/KERNEL.24K +++ /dev/null @@ -1 +0,0 @@ -include 
$(KERNELDIR)/KERNEL.P5600 From e55ec82bb92338d09ecd77357da3fcdfac0a7902 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 15:44:30 +0200 Subject: [PATCH 145/593] Delete KERNEL.1004K --- kernel/mips/KERNEL.1004K | 1 - 1 file changed, 1 deletion(-) delete mode 100644 kernel/mips/KERNEL.1004K diff --git a/kernel/mips/KERNEL.1004K b/kernel/mips/KERNEL.1004K deleted file mode 100644 index 67135356e..000000000 --- a/kernel/mips/KERNEL.1004K +++ /dev/null @@ -1 +0,0 @@ -include $(KERNELDIR)/KERNEL.P5600 From e1e543b145ce6bef55e7a8ee4e3efff3e3d31cb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 16:16:15 +0200 Subject: [PATCH 146/593] Add Windows build job on Azure CI (#2566) * Add Windows-CL build job on Azure --- .drone.yml | 192 ---------------------------------------- .travis.yml | 211 -------------------------------------------- appveyor.yml | 82 ----------------- azure-pipelines.yml | 20 +++++ 4 files changed, 20 insertions(+), 485 deletions(-) delete mode 100644 .drone.yml delete mode 100644 .travis.yml delete mode 100644 appveyor.yml diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index b1c211d14..000000000 --- a/.drone.yml +++ /dev/null @@ -1,192 +0,0 @@ ---- -kind: pipeline -name: arm64_gcc_make - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - ---- -kind: pipeline -name: arm32_gcc_make - -platform: - os: linux - arch: arm - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - 
apt-get update -y - - apt-get install -y make $CC gfortran perl - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - ---- -kind: pipeline -name: arm64_clang_make - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - ---- -kind: pipeline -name: arm32_clang_cmake - -platform: - os: linux - arch: arm - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' - commands: - - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - - apt-get update -y - - apt-get install -y make $CC g++ perl cmake - - $CC --version - - mkdir build && cd build - - cmake $CMAKE_FLAGS .. - - make -j - - ctest -V - ---- -kind: pipeline -name: arm64_gcc_cmake - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' - commands: - - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - - apt-get update -y - - apt-get install -y make $CC g++ perl cmake - - $CC --version - - mkdir build && cd build - - cmake $CMAKE_FLAGS .. 
- - make -j - - ctest -V - ---- -kind: pipeline -name: arm64_clang_cmake - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' - commands: - - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - - apt-get update -y - - apt-get install -y make $CC g++ perl cmake - - $CC --version - - mkdir build && cd build - - cmake $CMAKE_FLAGS .. - - make -j - - ctest -V - ---- -kind: pipeline -name: arm64_native_test - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl python g++ - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - - make -C cpp_thread_test dgemm_tester ---- -kind: pipeline -name: epyc_native_test - -platform: - os: linux - arch: amd64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl python g++ - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - - make -C cpp_thread_test dgemm_tester diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c875572b2..000000000 --- a/.travis.yml +++ /dev/null @@ -1,211 +0,0 @@ -# XXX: Precise is already deprecated, new default is Trusty. 
-# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise -sudo: true -language: c - -matrix: - include: - - &test-ubuntu - os: linux - compiler: gcc - addons: - apt: - packages: - - gfortran - before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - script: - - set -e - - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - make -C test $COMMON_FLAGS $BTYPE - - make -C ctest $COMMON_FLAGS $BTYPE - - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" - env: - # for matrix annotation only - - TARGET_BOX=PPC64LE_LINUX - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - os: linux - arch: s390x - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" - env: - # for matrix annotation only - - TARGET_BOX=IBMZ_LINUX - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 CC=clang" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" - - - <<: *test-ubuntu - addons: - apt: - packages: - - gcc-multilib - - gfortran-multilib - env: - - TARGET_BOX=LINUX32 - - BTYPE="BINARY=32" - - - os: linux - compiler: gcc - addons: - apt: - packages: - - binutils-mingw-w64-x86-64 - - gcc-mingw-w64-x86-64 - - gfortran-mingw-w64-x86-64 - before_script: *common-before - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=WIN64 - - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - - # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. 
- # These jobs needs sudo, so Travis runs them on VM-based infrastructure - # which is slower than container-based infrastructure used for jobs - # that don't require sudo. - - &test-alpine - os: linux - dist: trusty - sudo: true - language: minimal - before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - install: - - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - before_script: *common-before - script: - - set -e - # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - - alpine make -C test $COMMON_FLAGS $BTYPE - - alpine make -C ctest $COMMON_FLAGS $BTYPE - - alpine make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64" - - # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, - # but only on Travis CI, cannot reproduce it elsewhere. - #- &test-alpine-openmp - # <<: *test-alpine - # env: - # - TARGET_BOX=LINUX64_MUSL - # - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 INTERFACE64=1" - - # Build with the same flags as Alpine do in OpenBLAS package. - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - - - &test-cmake - os: linux - compiler: clang - addons: - apt: - packages: - - gfortran - - cmake - dist: trusty - sudo: true - before_script: - - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" - script: - - set -e - - mkdir build - - CONFIG=Release - - cmake -Bbuild -H. 
$CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG - - cmake --build build --config $CONFIG -- -j2 - env: - - CMAKE=1 - - <<: *test-cmake - env: - - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" - - <<: *test-cmake - compiler: gcc - env: - - CMAKE=1 - - - &test-macos - os: osx - osx_image: xcode10.1 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@8 # for gfortran - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" - - - <<: *test-macos - osx_image: xcode10.0 - env: - - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - - - <<: *test-macos - osx_image: xcode10.1 - env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - - <<: *test-macos - osx_image: xcode10.1 - env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" -# whitelist -branches: - only: - - master - - develop - -notifications: - webhooks: - urls: - - https://webhooks.gitter.im/e/8a6e4470a0cebd090344 - on_success: change # options: [always|never|change] default: always - on_failure: always # options: [always|never|change] default: always - on_start: never # options: [always|never|change] default: always diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 1936059d5..000000000 --- a/appveyor.yml +++ /dev/null @@ -1,82 +0,0 @@ -version: 
0.2.19.{build} - -#environment: - -platform: - - x64 - -os: Visual Studio 2017 - -configuration: Release - -clone_folder: c:\projects\OpenBLAS - -init: - - git config --global core.autocrlf input - -clone_depth: 5 - -skip_tags: true - -matrix: - fast_finish: false - -skip_commits: -# Add [av skip] to commit messages - message: /\[av skip\]/ - -environment: - global: - CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 - matrix: - - COMPILER: clang-cl - WITH_FORTRAN: yes - - COMPILER: clang-cl - DYNAMIC_ARCH: ON - WITH_FORTRAN: no - - COMPILER: cl - - COMPILER: MinGW64-gcc-7.2.0-mingw - DYNAMIC_ARCH: OFF - WITH_FORTRAN: ignore - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - COMPILER: MinGW-gcc-6.3.0-32 - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - COMPILER: MinGW-gcc-5.3.0 - WITH_FORTRAN: ignore - -install: - - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake - - - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang - - - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" - - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" - -before_build: - - ps: if (-Not (Test-Path .\build)) { mkdir build } - - cd build - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% - - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - - 
if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH% - - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. - -build_script: - - cmake --build . - -test_script: - - echo Running Test - - cd utest - - openblas_utest - diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b4c85367..639cb3558 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -49,3 +49,23 @@ jobs: # we need a privileged docker run for sde process attachment docker run --privileged intel_sde displayName: 'Run AVX512 SkylakeX docker build / test' + +- job: Windows_cl + pool: + vmImage: 'windows-latest' + steps: + - task: CMake@1 + inputs: + workingDirectory: 'build' # Optional + cmakeArgs: '-G "Visual Studio 16 2019" ..' + - task: CMake@1 + inputs: + cmakeArgs: '--build . --config Release' + workingDirectory: 'build' + - script: | + cd build + cd utest + dir + openblas_utest.exe + + From 04706e760d19305d22458eaa21abf0ad7bc415c5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 19:00:37 +0200 Subject: [PATCH 147/593] Revert "Add Windows build job on Azure CI (#2566)" This reverts commit e1e543b145ce6bef55e7a8ee4e3efff3e3d31cb2. 
--- .drone.yml | 192 ++++++++++++++++++++++++++++++++++++++++ .travis.yml | 211 ++++++++++++++++++++++++++++++++++++++++++++ appveyor.yml | 82 +++++++++++++++++ azure-pipelines.yml | 20 ----- 4 files changed, 485 insertions(+), 20 deletions(-) create mode 100644 .drone.yml create mode 100644 .travis.yml create mode 100644 appveyor.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..b1c211d14 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,192 @@ +--- +kind: pipeline +name: arm64_gcc_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm32_gcc_make + +platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm64_clang_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm32_clang_cmake + 
+platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest -V + +--- +kind: pipeline +name: arm64_gcc_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest -V + +--- +kind: pipeline +name: arm64_clang_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. 
+ - make -j + - ctest -V + +--- +kind: pipeline +name: arm64_native_test + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: epyc_native_test + +platform: + os: linux + arch: amd64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..c875572b2 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,211 @@ +# XXX: Precise is already deprecated, new default is Trusty. 
+# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming +dist: precise +sudo: true +language: c + +matrix: + include: + - &test-ubuntu + os: linux + compiler: gcc + addons: + apt: + packages: + - gfortran + before_script: &common-before + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + script: + - set -e + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64" + + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-ubuntu + os: linux + arch: s390x + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=IBMZ_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-ubuntu + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-ubuntu + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 INTERFACE64=1" + + - <<: *test-ubuntu + compiler: clang + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 CC=clang" + + - <<: *test-ubuntu + compiler: clang + env: + - TARGET_BOX=LINUX64 + - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" + + - <<: *test-ubuntu + addons: + apt: + packages: + - gcc-multilib + - gfortran-multilib + env: + - TARGET_BOX=LINUX32 + - BTYPE="BINARY=32" + + - os: linux + compiler: gcc + addons: + apt: + packages: + - binutils-mingw-w64-x86-64 + - gcc-mingw-w64-x86-64 + - gfortran-mingw-w64-x86-64 + before_script: *common-before + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=WIN64 + - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" + + # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. 
+ # These jobs needs sudo, so Travis runs them on VM-based infrastructure + # which is slower than container-based infrastructure used for jobs + # that don't require sudo. + - &test-alpine + os: linux + dist: trusty + sudo: true + language: minimal + before_install: + - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + install: + - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + before_script: *common-before + script: + - set -e + # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. + - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" + - alpine make -C test $COMMON_FLAGS $BTYPE + - alpine make -C ctest $COMMON_FLAGS $BTYPE + - alpine make -C utest $COMMON_FLAGS $BTYPE + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64" + + # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, + # but only on Travis CI, cannot reproduce it elsewhere. + #- &test-alpine-openmp + # <<: *test-alpine + # env: + # - TARGET_BOX=LINUX64_MUSL + # - BTYPE="BINARY=64 USE_OPENMP=1" + + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 INTERFACE64=1" + + # Build with the same flags as Alpine do in OpenBLAS package. + - <<: *test-alpine + env: + - TARGET_BOX=LINUX64_MUSL + - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" + + - &test-cmake + os: linux + compiler: clang + addons: + apt: + packages: + - gfortran + - cmake + dist: trusty + sudo: true + before_script: + - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" + script: + - set -e + - mkdir build + - CONFIG=Release + - cmake -Bbuild -H. 
$CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG + - cmake --build build --config $CONFIG -- -j2 + env: + - CMAKE=1 + - <<: *test-cmake + env: + - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" + - <<: *test-cmake + compiler: gcc + env: + - CMAKE=1 + + - &test-macos + os: osx + osx_image: xcode10.1 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@8 # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + + - <<: *test-macos + osx_image: xcode10.0 + env: + - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" + + - <<: *test-macos + osx_image: xcode10.1 + env: + - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" + + - <<: *test-macos + osx_image: xcode10.1 + env: + - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" +# whitelist +branches: + only: + - master + - develop + +notifications: + webhooks: + urls: + - https://webhooks.gitter.im/e/8a6e4470a0cebd090344 + on_success: change # options: [always|never|change] default: always + on_failure: always # options: [always|never|change] default: always + on_start: never # options: [always|never|change] default: always diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 000000000..1936059d5 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,82 @@ +version: 0.2.19.{build} 
+ +#environment: + +platform: + - x64 + +os: Visual Studio 2017 + +configuration: Release + +clone_folder: c:\projects\OpenBLAS + +init: + - git config --global core.autocrlf input + +clone_depth: 5 + +skip_tags: true + +matrix: + fast_finish: false + +skip_commits: +# Add [av skip] to commit messages + message: /\[av skip\]/ + +environment: + global: + CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 + matrix: + - COMPILER: clang-cl + WITH_FORTRAN: yes + - COMPILER: clang-cl + DYNAMIC_ARCH: ON + WITH_FORTRAN: no + - COMPILER: cl + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-6.3.0-32 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + +install: + - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat + - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force + - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake + + - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja + - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja + - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang + + - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 + - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" + - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" + +before_build: + - ps: if (-Not (Test-Path .\build)) { mkdir build } + - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if 
[%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. + +build_script: + - cmake --build . + +test_script: + - echo Running Test + - cd utest + - openblas_utest + diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..9b4c85367 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -49,23 +49,3 @@ jobs: # we need a privileged docker run for sde process attachment docker run --privileged intel_sde displayName: 'Run AVX512 SkylakeX docker build / test' - -- job: Windows_cl - pool: - vmImage: 'windows-latest' - steps: - - task: CMake@1 - inputs: - workingDirectory: 'build' # Optional - cmakeArgs: '-G "Visual Studio 16 2019" ..' - - task: CMake@1 - inputs: - cmakeArgs: '--build . 
--config Release' - workingDirectory: 'build' - - script: | - cd build - cd utest - dir - openblas_utest.exe - - From aec353b5a7f1da5b0cbaf3d8a150aeb456e47e7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Apr 2020 19:04:33 +0200 Subject: [PATCH 148/593] Add a Windows/CL build to the Azure Ci configuration --- azure-pipelines.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9b4c85367..639cb3558 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -49,3 +49,23 @@ jobs: # we need a privileged docker run for sde process attachment docker run --privileged intel_sde displayName: 'Run AVX512 SkylakeX docker build / test' + +- job: Windows_cl + pool: + vmImage: 'windows-latest' + steps: + - task: CMake@1 + inputs: + workingDirectory: 'build' # Optional + cmakeArgs: '-G "Visual Studio 16 2019" ..' + - task: CMake@1 + inputs: + cmakeArgs: '--build . --config Release' + workingDirectory: 'build' + - script: | + cd build + cd utest + dir + openblas_utest.exe + + From 239282d5e26156e8e5d3ef53b1bb595ea726be6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 20 Apr 2020 22:30:51 +0200 Subject: [PATCH 149/593] Use CMAKE_SHARED_LINKER_FLAGS to pass MSVC linker option target_link_libraries does not work here according to issue 2472 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20cf741c4..c2b9ae7ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,7 +240,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (NOT MSVC) target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") else() - target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") endif() endif() From f5c4c28b989ee3b10c2ba89b7a5179cbfce38001 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Apr 2020 17:17:17 +0200 Subject: [PATCH 
150/593] Work around POWER8BE bugs on FreeBSD (ELFv2) for #2299 --- kernel/power/KERNEL.POWER8 | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c7867012b..03a4d90b8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -232,3 +232,11 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +IDAMAXKERNEL = ../arm/iamax.c +IDAMINKERNEL = ../arm/iamin.c +IZAMAXKERNEL = ../arm/izamax.c +IZAMINKERNEL = ../arm/izamin.c +endif + From 2db5178e2d3c6f010df51d1a7a4e62c2bd7b407f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Apr 2020 11:01:28 +0200 Subject: [PATCH 151/593] enable cblas interfaces to GEMM3M in CMAKE builds --- interface/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5ea39f864..7a8fc6698 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) if (USE_GEMM3M) - GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) + GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type}) endif() endif () if (${float_type} STREQUAL "COMPLEX") From 6275b43918e54fc19294860f990cec6002de5816 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Apr 2020 14:12:27 +0200 Subject: [PATCH 152/593] Avoid duplicate printout of byte order and report ELF_VERSION --- getarch.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index e739d2de9..c173d58b8 100644 --- a/getarch.c +++ b/getarch.c @@ -1362,10 +1362,12 @@ int main(int argc, char *argv[]){ #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == 
__ORDER_BIG_ENDIAN__ printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); -#endif -#if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 +#elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); #endif +#if defined(_CALL_ELF) && (_CALL_ELF == 2) +printf("ELF_VERSION=2\n"); +#endif #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 From c90b28dee64244ee5038620a5889ef993bd8bea8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Apr 2020 14:14:20 +0200 Subject: [PATCH 153/593] Export ELF_VERSION for use in powerpc kernel configurations --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 51bd1c4bd..ce071133d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1349,6 +1349,7 @@ export ARCH export CORE export LIBCORE export __BYTE_ORDER__ +export ELF_VERSION export PGCPATH export CONFIG export CC From 06208c8d015d2429645b26c19e74909a861fbfd2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Apr 2020 14:16:40 +0200 Subject: [PATCH 154/593] Limit this fix to ELFv2 builds --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 03a4d90b8..b2a43d4c4 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -233,7 +233,7 @@ QCABS_KERNEL = ../generic/cabs.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) IDAMAXKERNEL = ../arm/iamax.c IDAMINKERNEL = ../arm/iamin.c IZAMAXKERNEL = ../arm/izamax.c From 4412ee1754161ea98c91ee81c261abf16129e156 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Apr 2020 10:54:46 +0200 Subject: [PATCH 155/593] Switch homebrew build env to new xcode 11.4 default 11.3.1 in the github image is causing brew to fail with "outdated xcode" message --- 
.github/workflows/nightly-Homebrew-build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index f55e73d23..ed00f87c3 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -21,6 +21,7 @@ jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest env: + DEVELOPER_DIR: /Applications/Xcode_11.4.app/Contents/Developer HOMEBREW_DEVELOPER: "ON" HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" HOMEBREW_NO_ANALYTICS: "ON" From f80dd2151ebde13d024efa0ee333d3e15dbc046c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Apr 2020 14:31:09 +0200 Subject: [PATCH 156/593] xcode 11.4.1 for homebrew ? --- .github/workflows/nightly-Homebrew-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index ed00f87c3..8d7cfea2d 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -21,7 +21,7 @@ jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest env: - DEVELOPER_DIR: /Applications/Xcode_11.4.app/Contents/Developer + DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer HOMEBREW_DEVELOPER: "ON" HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" HOMEBREW_NO_ANALYTICS: "ON" From 70869d571fa209c7ca5d95ad37678916db9cae24 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Apr 2020 10:30:44 +0200 Subject: [PATCH 157/593] Quote include paths for getarch to protect any embedded spaces --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index e0696093b..067b97b4b 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -492,7 +492,7 @@ else(NOT CMAKE_CROSSCOMPILING) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH_RESULT ${GETARCH_DIR} SOURCES 
${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} ) @@ -520,7 +520,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") try_compile(GETARCH2_RESULT ${GETARCH2_DIR} SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH2_LOG COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} ) From 03ff213c51892d96674140d7e5009d8b06810563 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Apr 2020 21:46:54 +0200 Subject: [PATCH 158/593] Increase POWER8 ZGEMM_R and use same R values for POWER9 fixes lapack-test zger failures seen in #2299 after application of my PR #2551 --- param.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/param.h b/param.h index 9fdf40fe2..7094249e8 100644 --- a/param.h +++ b/param.h @@ -2254,7 +2254,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 #define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 512 +#define ZGEMM_DEFAULT_R 4096 #define SYMV_P 8 @@ -2288,6 +2288,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #define SYMV_P 8 #endif From 3e28db7f380b64566727d843b8df34a58bc3227b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Apr 2020 13:51:44 +0200 Subject: [PATCH 159/593] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index df497c1d2..6d18047fb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -180,3 +180,7 @@ In chronological order: * [2019-12-23] optimize AVX2 CGEMM and ZGEMM * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels * [2020-01-07] optimize AVX2 SGEMM and STRMM + +* Rajalakshmi Srinivasaraghavan + * [2020-04-15] Half-precision GEMM for bfloat16 + From e43b49e0643a1a793c745ce1436a25466857f7af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Apr 2020 16:18:54 +0200 Subject: [PATCH 160/593] Drop the set -e from travis scripts --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index c875572b2..101147353 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,6 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - - set -e - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE @@ -108,7 +107,6 @@ matrix: - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' before_script: *common-before script: - - set -e # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" @@ -151,7 +149,6 @@ matrix: before_script: - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" script: - - set -e - mkdir build - CONFIG=Release - cmake -Bbuild -H. 
$CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG From 5e0dbf8dfeaf6d1a629363204c25b4037e53f906 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 26 Apr 2020 22:21:05 +0200 Subject: [PATCH 161/593] Increase default BUFFER_SIZE to accomodate SGEMM parameters in response to compile-time warning from #2551 --- common_mips.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_mips.h b/common_mips.h index dd2f8d558..7dc3ba246 100644 --- a/common_mips.h +++ b/common_mips.h @@ -94,7 +94,7 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) -#define BUFFER_SIZE (16 << 20) +#define BUFFER_SIZE (16 << 21) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) From e7bbdfdf84419254743d2f4e7c09d1333e5b38ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Apr 2020 15:20:03 +0200 Subject: [PATCH 162/593] Have CMAKE parse conditional lines in KERNEL files Supports ifeq and ifneq, but requires both to have an else branch --- cmake/utils.cmake | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 831ddffe6..695723a66 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,10 +15,33 @@ endfunction () # Reads a Makefile into CMake vars. 
macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") + set (IfElse 0) + set (ElseSeen 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) +#message(STATUS "parsing ${makefile_line}") + if (${IfElse} GREATER 0) + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + continue () + endif () + string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + continue () + endif() + if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) +# message(STATUS "skipping ${makefile_line}") + continue () + endif () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) set(var_value ${CMAKE_MATCH_2}) # check for Makefile variables in the string, e.g. 
$(TSUFFIX) @@ -33,7 +56,31 @@ macro(ParseMakefileVars MAKEFILE_IN) else () string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") +#message(STATUS "match on include ${line_match}") ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + else () +# message(STATUS "unmatched line ${line_match}") + string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) +# message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + else () + string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") +# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) + message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () + endif () + endif () endif () endif () endforeach () From 3bd56846bb7c32d5d8507a66c00bf9dac6ece56b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Apr 2020 16:27:09 +0200 Subject: [PATCH 163/593] Silence a debug message --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 695723a66..7a125ec55 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -74,7 +74,7 @@ macro(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) - message (STATUS "condition is true") +# message (STATUS "condition is true") set (IfElse 1) else () set (IfElse 2) From 2d89603e9dca0fa80a371757f75e289ba7a495a5 Mon Sep 17 00:00:00 
2001 From: Martin Kroeker Date: Tue, 28 Apr 2020 10:40:40 +0200 Subject: [PATCH 164/593] Increase BUFFER_SIZE on mips64 to match SGEMM parameters --- common_mips64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_mips64.h b/common_mips64.h index af638d60c..a06edfe08 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -227,7 +227,7 @@ REALNAME: ;\ #define SEEK_ADDRESS -#define BUFFER_SIZE ( 32 << 20) +#define BUFFER_SIZE ( 32 << 21) #if defined(LOONGSON3A) #define PAGESIZE (16UL << 10) From f4248af26edbefe7ab21f4b46d8840fb8f810052 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Apr 2020 10:43:12 +0200 Subject: [PATCH 165/593] Fix compiler warnings --- driver/others/blas_server.c | 6 ++++-- driver/others/memory.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index f13b83dd4..04b614a6e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -#if defined(OS_LINUX) && !defined(NO_AFFINITY) +#if defined(OS_LINUX) && !defined(NO_AFFINITY) int gotoblas_set_affinity(int); int gotoblas_set_affinity2(int); int get_node(void); @@ -281,6 +281,8 @@ int get_node(void); static int increased_threads = 0; #ifdef OS_LINUX +extern int openblas_get_num_threads(void); + int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { const int active_threads = openblas_get_num_threads(); @@ -602,7 +604,7 @@ int blas_thread_init(void){ if(ret!=0){ struct rlimit rlim; const char *msg = strerror(ret); - fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); + fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); #ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS 
blas_thread_init: RLIMIT_NPROC " diff --git a/driver/others/memory.c b/driver/others/memory.c index 5abcbf3a4..a5595aed4 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2070,7 +2070,7 @@ if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; perror("OpenBLAS : munmap failed:"); - printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); + printf("error code=%d,\trelease->address=%p\n",errsv,release->address); } } From 564b0d39efd1193a92d071994dfda21e2c1fba7d Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 29 Apr 2020 13:40:34 -0500 Subject: [PATCH 166/593] Add test for shgemm This patch has Makefile changes to add test for shgemm which compares sgemm and shgemm result. --- .gitignore | 2 ++ test/Makefile | 23 ++++++++++++++++++++++- test/compare_sgemm_shgemm.c | 19 ++++++++++--------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 6803a919e..bca79f043 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ test/SBLAT2.SUMM test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM +test/SHBLAT3.SUMM test/cblat1 test/cblat2 test/cblat3 @@ -79,6 +80,7 @@ test/dblat3 test/sblat1 test/sblat2 test/sblat3 +test/test_shgemm test/zblat1 test/zblat2 test/zblat3 diff --git a/test/Makefile b/test/Makefile index 7a873b7e5..45f9821ec 100644 --- a/test/Makefile +++ b/test/Makefile @@ -64,9 +64,17 @@ endif endif endif +ifeq ($(BUILD_HALF),1) +level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +else level3 : sblat3 dblat3 cblat3 zblat3 +endif ifndef CROSS rm -f ?BLAT3.SUMM +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @@ -78,6 +86,10 @@ 
ifndef CROSS ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_HALF),1) + OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -87,6 +99,10 @@ ifeq ($(USE_OPENMP), 1) OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 else +ifeq ($(BUILD_HALF),1) + OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM + @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 +endif OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @@ -165,6 +181,11 @@ zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_HALF),1) +test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) @@ -187,7 +208,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - sblat3 dblat3 cblat3 zblat3 \ + test_shgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index 978972b24..d5bd84b91 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include -#include "common.h" +#include "../common.h" #define SGEMM BLASFUNC(sgemm) #define SHGEMM BLASFUNC(shgemm) typedef union @@ -52,7 +52,7 @@ main (int argc, char *argv[]) int m, n, k; int i, j, l; int ret = 0; - int loop = 20; + int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; char transa = 'N'; @@ -71,8 +71,8 @@ main (int argc, char *argv[]) { for (int i = 0; i < m; i++) { - A[j * k + i] = j * 9.0; - B[j * k + i] = i * 2.0; + A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; @@ -85,11 +85,12 @@ main (int argc, char *argv[]) &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (CC[i * m + j] != C[i * m + j]) - ret++; + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) + ret++; } - fprintf (stderr, "Return code: %d\n", ret); + if (ret != 0) + fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); return ret; } From 5dd14e3d48e30eababebc7e4534330fff5c2b904 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 May 2020 09:58:30 +0200 Subject: [PATCH 167/593] Make building the bfloat16 functions conditional on option BUILD_HALF (#2590) * make building the bfloat16 BLAS functions conditional on BUILD_HALF * pass the BUILD_HALF option to gensymbol * Pass BUILD_HALF as a compiler define for dynamic_arch builds --- CMakeLists.txt | 7 +++++-- Makefile.rule | 3 +++ Makefile.system | 5 +++++ cmake/kernel.cmake | 8 ++++++-- common_param.h | 20 ++++++++++++-------- driver/level3/Makefile | 6 +++++- exports/Makefile | 14 +++++++++----- exports/gensymbol | 11 +++++++++-- interface/Makefile | 8 ++++++++ kernel/CMakeLists.txt | 6 +++++- 
kernel/Makefile.L3 | 34 +++++++++++++++++++++++++++++++--- kernel/setparam-ref.c | 23 ++++++++++++++++++++++- 12 files changed, 120 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2b9ae7ad..70760d64d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,10 +86,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () +if (NOT DEFINED BUILD_HALF) + set (BUILD_HALF false) +endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all - set(BUILD_HALF true) +# set(BUILD_HALF true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -121,7 +124,7 @@ if (BUILD_COMPLEX16) list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_SINGLE OR BUILD_HALF) +if (BUILD_HALF) message(STATUS "Building Half Precision") list(APPEND FLOAT_TYPES "HALF") # defines nothing endif () diff --git a/Makefile.rule b/Makefile.rule index 724a60ec4..8549e6394 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -273,6 +273,9 @@ COMMON_PROF = -pg # # CPP_THREAD_SAFETY_TEST = 1 + +# If you want to enable the experimental BFLOAT16 support +# BUILD_HALF = 1 # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index ce071133d..76d755ec2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,6 +1124,10 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif +ifeq ($(BUILD_HALF), 1) +CCOMMON_OPT += -DBUILD_HALF +endif + CCOMMON_OPT += -DVERSION=\"$(VERSION)\" ifndef SYMBOLPREFIX @@ -1395,6 +1399,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export BUILD_HALF export SHGEMM_UNROLL_M export SHGEMM_UNROLL_N diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 1c1fed571..4b505a102 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,6 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) 
set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) +if (BUILD_HALF) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -131,6 +132,7 @@ macro(SetDefaultL1) set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL ../arm/sum.c) set(SHSWAPKERNEL ../arm/swap.c) +endif () endmacro () macro(SetDefaultL2) @@ -179,10 +181,11 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +if (BUILD_HALF) set(SHGEMVNKERNEL ../arm/gemv_n.c) set(SHGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) - +endif () endmacro () macro(SetDefaultL3) @@ -190,6 +193,7 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) +if (BUILD_HALF) set(SHGEADD_KERNEL ../generic/geadd.c) set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) set(SHGEMM_BETA ../generic/gemm_beta.c) @@ -201,6 +205,6 @@ macro(SetDefaultL3) set(SHGEMMITCOPYOBJ shgemm_itcopy.o) set(SHGEMMONCOPYOBJ shgemm_oncopy.o) set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) - +endif () endmacro () diff --git a/common_param.h b/common_param.h index 19a34fa3d..c92609a76 100644 --- a/common_param.h +++ b/common_param.h @@ -47,7 +47,7 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#if 1 +#ifdef BUILD_HALF int shgemm_p, shgemm_q, shgemm_r; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; @@ -1002,12 +1002,14 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#ifdef BUILD_HALF #define SHGEMM_P gotoblas -> shgemm_p #define SHGEMM_Q gotoblas -> shgemm_q #define SHGEMM_R gotoblas -> shgemm_r #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#endif #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q @@ -1086,6 +1088,7 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 
#endif +#ifdef BUILD_HALF #define SHGEMM_P SHGEMM_DEFAULT_P #define SHGEMM_Q SHGEMM_DEFAULT_Q #define SHGEMM_R SHGEMM_DEFAULT_R @@ -1096,6 +1099,7 @@ extern gotoblas_t *gotoblas; #else #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) #endif +#endif #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q @@ -1330,31 +1334,31 @@ extern gotoblas_t *gotoblas; #endif #ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & 
~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) #endif #ifndef SNUMOPT diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 881b4ee35..09a62d9bf 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +endif + SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef 
USE_SIMPLE_THREADED_LEVEL3 - +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) diff --git a/exports/Makefile b/exports/Makefile index 60291b1ff..c92d6e996 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif +ifndef BUILD_HALF +BUILD_HALF = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS @@ -234,23 +238,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) 
$(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 235446f14..0a68a3572 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -30,7 +30,7 @@ icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - shgemm, smax,smin,snrm2, + smax,smin,snrm2, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, 
strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, @@ -51,6 +51,7 @@ zimatcopy, ); +@halfblasobjs = (shgemm); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -67,7 +68,7 @@ cblas_isamax, cblas_izamax, cblas_sasum, cblas_saxpy, cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, - cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, @@ -83,6 +84,8 @@ cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd ); +@halfcblasobjs = (cblas_shgemm); + @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, @@ -3454,6 +3457,10 @@ use File::Spec; use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); +if ($ARGV[12] == 1) { + @blasobjs = (@blasobjs, @halfblasobjs); + @cblasobjs = (@cblasobjs, @halfcblasobjs); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); diff --git a/interface/Makefile b/interface/Makefile index 741f6bac0..44a9fdcf0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,7 +46,9 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) SHBLAS3OBJS = shgemm.$(SUFFIX) +endif DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ @@ -278,7 +280,9 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) CSHBLAS3OBJS = 
cblas_shgemm.$(SUFFIX) +endif CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 4113a1647..b114c6a33 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -137,7 +137,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (NOT ${BUILD_HALF}) + continue () + else () + set (float_char "SH") + endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index baf0c1c8a..da6c5fd57 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -59,7 +59,8 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -#ifndef SHGEMMKERNEL +ifeq ($(BUILD_HALF), 1) +ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c SHGEMMKERNEL = ../generic/gemmkernel_2x2.c SHGEMMINCOPY = ../generic/gemm_ncopy_2.c @@ -70,12 +71,13 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) SHGEMMITCOPYOBJ = 
shgemm_itcopy$(TSUFFIX).$(SUFFIX) SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) -#endif +endif SHKERNELOBJS += \ shgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) +endif SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ @@ -110,7 +112,9 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += $(SHKERNELOBJS) +endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -118,7 +122,10 @@ CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -408,11 +415,13 @@ ZBLASOBJS += \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) - +ifeq ($(BUILD_HALF), 1) SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif + SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -438,8 +447,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -459,10 +470,14 
@@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) + $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) + ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s @@ -487,6 +502,7 @@ else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif endif $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) @@ -646,6 +662,8 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HALF), 1) + $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s @@ -655,6 +673,7 @@ ifeq ($(OS), AIX) else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) @@ -2272,8 +2291,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2290,6 +2311,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ @@ 
-2304,6 +2327,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif + $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2408,8 +2433,11 @@ endif endif + +ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b7cf0f112..d3aa030c1 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,6 +53,7 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, +#ifdef BUILD_HALF 0, 0, 0, SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, #ifdef SHGEMM_DEFAULT_UNROLL_MN @@ -109,7 +110,7 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif - +#endif 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, @@ -706,19 +707,25 @@ gotoblas_t TABLE_NAME = { #if defined(ARCH_ARM64) static void init_parameter(void) { +#if defined(BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; @@ -782,20 +789,26 @@ static void init_parameter(void) { #if defined(ARCH_POWER) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = 
SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -805,20 +818,26 @@ static void init_parameter(void) { #if defined(ARCH_ZARCH) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -958,9 +977,11 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; From 90dba9f71668c0de77b77f32462c78fbbd424db1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 May 2020 10:44:50 +0200 Subject: [PATCH 168/593] Duplicate earlier Clang 9.0.0 workaround for corresponding Apple Clang 
version As discussed on the original PR #2329, the "Apple Clang 11.0.3" that appears to be based on the same LLVM release produces the same miscompilation of this file. --- kernel/x86_64/dsymv_L_microk_skylakex-2.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index bdcd914fb..f0df5aaa8 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -36,7 +36,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize off #endif - +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize off +#endif static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) { @@ -164,6 +166,9 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL #if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0 #pragma clang optimize on #endif +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize on +#endif #else #include "dsymv_L_microk_haswell-2.c" From 4e82eb9f8ae2898195442af215103b89332833fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 May 2020 00:31:32 +0200 Subject: [PATCH 169/593] Undefine ASMNAME/NAME/CNAME before defining them to avoid redefinition warning when environment variables like CFLAGS are being used (fixes #818) --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 76d755ec2..023546009 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1154,6 +1154,7 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include
$(TOPDIR)/Makefile.$(ARCH) +CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) From 8353cb245a5ad5095c5e78582d4be597d8075973 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 7 May 2020 09:14:05 -0700 Subject: [PATCH 170/593] ARM64: Improve DAXPY for ThunderX2 Improve performance of DAXPY for ThunderX2 when the vector fits in L1 Cache. --- kernel/arm64/daxpy_thunderx2t99.S | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index b8d0af5c2..baf39150f 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add X, X, #128 .endm +/* + * No need to do software prefetches if the vector fits + * into L1 cache + */ +.macro KERNEL_F16_L1CACHE + ldp q4, q5, [X] + ldp q16, q17, [Y] + + ldp q6, q7, [X, #32] + ldp q18, q19, [Y, #32] + + fmla v16.2d, v4.2d, v0.d[0] + fmla v17.2d, v5.2d, v0.d[0] + + stp q16, q17, [Y] + + ldp q20, q21, [X, #64] + ldp q24, q25, [Y, #64] + + fmla v18.2d, v6.2d, v0.d[0] + fmla v19.2d, v7.2d, v0.d[0] + + stp q18, q19, [Y, #32] + + ldp q22, q23, [X, #96] + ldp q26, q27, [Y, #96] + + fmla v24.2d, v20.2d, v0.d[0] + fmla v25.2d, v21.2d, v0.d[0] + + stp q24, q25, [Y, #64] + + fmla v26.2d, v22.2d, v0.d[0] + fmla v27.2d, v23.2d, v0.d[0] + + stp q26, q27, [Y, #96] + + add Y, Y, #128 + add X, X, #128 +.endm + .macro KERNEL_F32 KERNEL_F16 KERNEL_F16 .endm + +.macro KERNEL_F32_L1CACHE + KERNEL_F16_L1CACHE + KERNEL_F16_L1CACHE +.endm + .macro INIT_S lsl INC_X, INC_X, #3 lsl INC_Y, INC_Y, #3 @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp I, xzr beq .Ldaxpy_kernel_F1 + cmp N, #2048 + ble .Ldaxpy_kernel_F32_L1CACHE + .align 5 .Ldaxpy_kernel_F32: @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. subs I, I, #1 bne .Ldaxpy_kernel_F32 + b .Ldaxpy_kernel_F1 + + .align 5 +.Ldaxpy_kernel_F32_L1CACHE: + + KERNEL_F32_L1CACHE + + subs I, I, #1 + bne .Ldaxpy_kernel_F32_L1CACHE .Ldaxpy_kernel_F1: From ec0f2286321f79f7e5d9bfe867e3664fb94f3967 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 8 May 2020 18:06:12 +0200 Subject: [PATCH 171/593] Add FFLAGS_DRV to the generated make.inc to fix lapack-test on x86_64 with icc/ifort fixes #2552 --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 18320e6a3..e113026dd 100644 --- a/Makefile +++ b/Makefile @@ -264,6 +264,7 @@ lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc From bd9ff820bcd4b99b7ce5054268b0437adfea4a05 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 8 May 2020 20:31:56 -0500 Subject: [PATCH 172/593] Fix cmake compilation issue - POWER9 This patch removes extra space in the sgemmotcopy filename thereby allowing it to create entry in kernel/Makefile created by cmake. 
--- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/KERNEL.POWER9 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index b2a43d4c4..7fba5b4d6 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index aabb5d976..ab8fbfcd9 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -16,7 +16,7 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) From cd10b35fe9133e44c3aa3a2c6d5712b10bf046bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 May 2020 13:42:33 +0200 Subject: [PATCH 173/593] Handle trailing spaces and empty condition variables --- cmake/utils.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 7a125ec55..1c21e776e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -43,7 +43,8 @@ macro(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") #message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) - set(var_value ${CMAKE_MATCH_2}) +# set(var_value ${CMAKE_MATCH_2}) + string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the 
string, e.g. $(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) foreach (make_var ${make_var_matches}) @@ -63,7 +64,7 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) + if (DEFINED ${CMAKE_MATCH_1} AND "${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}") # message (STATUS "condition is true") set (IfElse 1) else () From 58d26b4448a22cd1447d11c6fb746e2a28f8b573 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 May 2020 17:15:36 +0200 Subject: [PATCH 174/593] Correct ifort options to same as suggested by reference-lapack --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 023546009..1f1ae8353 100644 --- a/Makefile.system +++ b/Makefile.system @@ -855,7 +855,7 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif -FCOMMON_OPT += -recursive +FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif From 2271c3506b32f866eeffc3d46008fba68844fc72 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 May 2020 23:49:18 +0200 Subject: [PATCH 175/593] Work around excessive LAPACK test failures on Skylake-X Something in the plain C parts of x86_64 cscal.c and zscal.c appears to be miscompiled by both gfortran9 and ifort when compiling for skylakex-avx512, even when the optimized Haswell microkernel is not in use.
--- kernel/x86_64/KERNEL.SKYLAKEX | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 65f031d03..448aee074 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c From ce90e2bd3f6e6e0bb338472d69fad47633639505 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 11 May 2020 09:57:46 -0500 Subject: [PATCH 176/593] Include shgemm in benchtest This patch is to enable benchtest for half precision gemm when BUILD_HALF is set during make. --- benchmark/Makefile | 20 ++++++++++++++++++-- benchmark/gemm.c | 13 ++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 90d903ad7..53f422be4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,6 +49,12 @@ else GOTO_LAPACK_TARGETS= endif +ifeq ($(BUILD_HALF),1) +GOTO_HALF_TARGETS=shgemm.goto +else +GOTO_HALF_TARGETS= +endif + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) + 
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### +ifeq ($(BUILD_HALF),1) +shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ifeq ($(BUILD_HALF),1) +shgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + sgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/gemm.c b/benchmark/gemm.c index dd016a7c3..d2235330b 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef DOUBLE #define GEMM BLASFUNC(dgemm) +#elif defined(HALF) +#define GEMM BLASFUNC(shgemm) #else #define GEMM BLASFUNC(sgemm) #endif @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ int main(int argc, char *argv[]){ - FLOAT *a, *b, *c; + IFLOAT *a, *b; + FLOAT *c; FLOAT alpha[] = {1.0, 0.0}; FLOAT beta [] = {0.0, 0.0}; char transa = 'N'; @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ k = to; } - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { + if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { + if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ #endif for (i = 0; i < m * k * COMPSIZE; i++) { - a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < k * n * COMPSIZE; i++) { - b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; } for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; From 8efba9b7c036783e0c2449ab58c50739381746d5 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 11 May 2020 17:15:10 -0500 Subject: [PATCH 177/593] Improve shgemm test This patch adds another check to test shgemm results. 
--- test/compare_sgemm_shgemm.c | 58 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index d5bd84b91..7e254f844 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -46,6 +46,27 @@ typedef union } bits; } bfloat16_bits; +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -55,8 +76,6 @@ main (int argc, char *argv[]) int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - char transa = 'N'; - char transb = 'N'; for (int x = 0; x <= loop; x++) { @@ -65,30 +84,45 @@ main (int argc, char *argv[]) float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float CC[m * n]; + float DD[m * n], CC[m * n]; for (int j = 0; j < m; j++) { for (int i = 0; i < m; i++) { - A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; + DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); - + &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) - ret++; + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; + if 
(transA == 'N' && transB == 'N') + { + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (CC[i * m + j] != DD[i * m + j]) + ret++; + } } if (ret != 0) fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); From 8c338616f907b0592f0f59f1e4a365c7b000bc9d Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 11 May 2020 12:37:21 +0200 Subject: [PATCH 178/593] s390x: gate dynamic arch detection on gcc version and add generic When building OpenBLAS with DYNAMIC_ARCH=1 on s390x (aka zarch), make sure to include support for systems without the facilities introduced with z13 (i.e., zarch_generic). Adjust runtime detection to fall back to that generic code when running on an unknown platform other than Z13 through Z15. When detecting a Z13 or newer system, add a check for gcc support for the architecture-specific features before selecting the respective kernel. Fall back to Z13 or generic code otherwise.
Signed-off-by: Marius Hillenbrand --- Makefile.system | 3 +- driver/others/dynamic_zarch.c | 70 +++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1f1ae8353..111fc717b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -563,7 +563,8 @@ DYNAMIC_CORE += EMAG8180 endif ifeq ($(ARCH), zarch) -DYNAMIC_CORE = Z13 +DYNAMIC_CORE = ZARCH_GENERIC +DYNAMIC_CORE += Z13 DYNAMIC_CORE += Z14 endif diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 90d3051b1..8bcfcd004 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,12 +1,25 @@ - #include "common.h" +#include + +// Gate kernels for z13 and z14 on gcc version +#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ + /* RHEL 7 since 7.3: */ \ + (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ + __GNUC_RH_RELEASE__ >= 11) +#define HAVE_Z13_SUPPORT +#endif + +#if __GNUC__ >= 7 +#define HAVE_Z14_SUPPORT +#endif +extern gotoblas_t gotoblas_ZARCH_GENERIC; +#ifdef HAVE_Z13_SUPPORT extern gotoblas_t gotoblas_Z13; +#endif +#ifdef HAVE_Z14_SUPPORT extern gotoblas_t gotoblas_Z14; -//extern gotoblas_t gotoblas_Z15; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -//extern gotoblas_t gotoblas_Z14; -//#endif +#endif #define NUM_CORETYPES 4 @@ -16,18 +29,19 @@ static char* corename[] = { "unknown", "Z13", "Z14", -// "Z15", "ZARCH_GENERIC", }; char* gotoblas_corename(void) { +#ifdef HAVE_Z13_SUPPORT if (gotoblas == &gotoblas_Z13) return corename[1]; +#endif +#ifdef HAVE_Z14_SUPPORT if (gotoblas == &gotoblas_Z14) return corename[2]; -// if (gotoblas == &gotoblas_Z15) return corename[3]; -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// if (gotoblas == &gotoblas_POWER9) return corename[3]; -//#endif - return corename[0]; // try generic? 
+#endif + if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + + return corename[0]; } // __builtin_cpu_is is not supported by zarch @@ -49,14 +63,21 @@ static gotoblas_t* get_coretype(void) { fclose(infile); - if (strstr(p, "2964")) return &gotoblas_Z13; - if (strstr(p, "2965")) return &gotoblas_Z13; - if (strstr(p, "3906")) return &gotoblas_Z14; - if (strstr(p, "3907")) return &gotoblas_Z14; - if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 +#ifdef HAVE_Z13_SUPPORT + if (strstr(p, "2964") || strstr(p, "2965")) return &gotoblas_Z13; +#endif - return NULL; // should be ZARCH_GENERIC + // Z14 and Z15 systems + if (strstr(p, "3906") || strstr(p, "3907") || strstr(p, "8561") || + strstr(p, "8562")) +#ifdef HAVE_Z14_SUPPORT + return &gotoblas_Z14; +#else + return &gotoblas_Z13; +#endif + + // unknown system or compiler too old? use generic code for z architecture + return &gotoblas_ZARCH_GENERIC; } static gotoblas_t* force_coretype(char* coretype) { @@ -76,12 +97,13 @@ static gotoblas_t* force_coretype(char* coretype) { switch (found) { +#ifdef HAVE_Z13_SUPPORT case 1: return (&gotoblas_Z13); +#endif +#ifdef HAVE_Z14_SUPPORT case 2: return (&gotoblas_Z14); -// case 3: return (&gotoblas_Z15); -//#if (!defined C_GCC) || (GCC_VERSION >= 60000) -// case 3: return (&gotoblas_POWER9); -//#endif +#endif + case 3: return (&gotoblas_ZARCH_GENERIC); default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); @@ -109,9 +131,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) { - snprintf(coremsg, 128, "Falling back to Z14 core\n"); + snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); openblas_warning(1, coremsg); - gotoblas = &gotoblas_Z14; + gotoblas = &gotoblas_ZARCH_GENERIC; } if (gotoblas && gotoblas->init) { From 62cf391cbbf5ebdec5dc44e814797c6298e626bc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: 
Mon, 11 May 2020 18:37:04 +0200 Subject: [PATCH 179/593] s390x: only build kernels supported by gcc with dynamic arch support When building with dynamic arch support, only build kernels for architectures that are supported by the gcc we are building with. Signed-off-by: Marius Hillenbrand --- Makefile.system | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Makefile.system b/Makefile.system index 111fc717b..98d9ae313 100644 --- a/Makefile.system +++ b/Makefile.system @@ -564,8 +564,26 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC + +# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer +GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2")) + +ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) +RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3")) +endif + +ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1) DYNAMIC_CORE += Z13 +else +$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +endif + +GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +ifeq ($(GCC_MAJOR_GE_7), 1) DYNAMIC_CORE += Z14 +else +$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) +endif endif ifeq ($(ARCH), power) From 0dbe61a612708c1a689835dcf5fdb76b166e7729 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 11 May 2020 13:00:10 +0200 Subject: [PATCH 180/593] s390x: choose SIMD kernels at run-time based on OS and compiler support Extend and simplify the run-time detection for dynamic architecture support for z to check HW_CAP and only use SIMD features if advertised by the OS. While at it, also honor the env variable LD_HWCAP_MASK and do not use the CPU features masked there. Note that we can only use the SIMD features on z13 or newer (i.e., Vector Facility or Vector-Enhancements Facilities) when the operating system supports properly context-switching the vector registers. 
The OS advertises that support as a bit in the HW_CAP value in the auxiliary vector. While all recent Linux kernels have that support, we should maintain compatibility with older versions that may still be in use. Signed-off-by: Marius Hillenbrand --- driver/others/dynamic_zarch.c | 78 ++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 8bcfcd004..403b34111 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -13,6 +13,39 @@ #define HAVE_Z14_SUPPORT #endif +// Guard the use of getauxval() on glibc version >= 2.16 +#ifdef __GLIBC__ +#include <features.h> +#if __GLIBC_PREREQ(2, 16) +#include <sys/auxv.h> +#define HAVE_GETAUXVAL 1 + +static unsigned long get_hwcap(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + char *maskenv; + + // honor requests for not using specific CPU features in LD_HWCAP_MASK + maskenv = getenv("LD_HWCAP_MASK"); + if (maskenv) + hwcap &= strtoul(maskenv, NULL, 0); + + return hwcap; + // note that a missing auxval is interpreted as no capabilities + // available, which is safe. +} + +#else // __GLIBC_PREREQ(2, 16) +#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" + +static unsigned long get_hwcap(void) { + // treat missing support for getauxval() as no capabilities available, + // which is safe. + return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + extern gotoblas_t gotoblas_ZARCH_GENERIC; #ifdef HAVE_Z13_SUPPORT extern gotoblas_t gotoblas_Z13; @@ -44,39 +77,34 @@ char* gotoblas_corename(void) { return corename[0]; } -// __builtin_cpu_is is not supported by zarch +/** + * Detect the fitting set of kernels by retrieving the CPU features supported by + * OS from the auxiliary value AT_HWCAP and choosing the set of kernels + * ("coretype") that exploits most of the features and can be compiled with the + * available gcc version.
+ * Note that we cannot use vector registers on a z13 or newer unless supported + * by the OS kernel (which needs to handle them properly during context switch). + */ static gotoblas_t* get_coretype(void) { - FILE* infile; - char buffer[512], * p; - - p = (char*)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Type", buffer, 4)) { - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); -#ifdef HAVE_Z13_SUPPORT - if (strstr(p, "2964") || strstr(p, "2965")) return &gotoblas_Z13; -#endif + unsigned long hwcap __attribute__((unused)) = get_hwcap(); - // Z14 and Z15 systems - if (strstr(p, "3906") || strstr(p, "3907") || strstr(p, "8561") || - strstr(p, "8562")) + // z14 and z15 systems: exploit Vector Facility (SIMD) and + // Vector-Enhancements Facility 1 (float SIMD instructions), if present. #ifdef HAVE_Z14_SUPPORT + if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) return &gotoblas_Z14; -#else +#endif + + // z13: Vector Facility (SIMD for double) +#ifdef HAVE_Z13_SUPPORT + if (hwcap & HWCAP_S390_VX) return &gotoblas_Z13; #endif - // unknown system or compiler too old? 
use generic code for z architecture + // fallback in case of missing compiler support, systems before z13, or + // when the OS does not advertise support for the Vector Facility (e.g., + // missing support in the OS kernel) return &gotoblas_ZARCH_GENERIC; } From d7c1677c20c326d4bf0f2cefc2c7ce36f7df3149 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 11:09:28 +0200 Subject: [PATCH 181/593] Update CONTRIBUTORS.md, adding myself Signed-off-by: Marius Hillenbrand --- CONTRIBUTORS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6d18047fb..738475a93 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -183,4 +183,6 @@ In chronological order: * Rajalakshmi Srinivasaraghavan * [2020-04-15] Half-precision GEMM for bfloat16 - + +* Marius Hillenbrand + * [2020-05-12] Revise dynamic architecture detection for IBM z From 43c0d4f312ba3cd1a0ff8f389e6eded98113c0dd Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 14:13:54 +0200 Subject: [PATCH 182/593] s390x: Add vectorized sgemm kernel for Z14 and newer Add a new GEMM kernel implementation to exploit the FP32 SIMD operations introduced with z14 and employ it for SGEMM on z14 and newer architectures. The SIMD extensions introduced with z13 support operations on double-sized scalars in vector registers. Thus, the existing SGEMM code would extend floats to doubles before operating on them. z14 extended SIMD support to operations on 32-bit floats. By employing these instructions, we can operate on twice the number of scalars per instruction (four floats in each vector registers) and avoid the conversion operations. The code is written in C with explicit vectorization. In experiments, this kernel improves performance on z14 and z15 by around 2x over the current implementation in assembly. The flexibilty of the C code paves the way for adjustments in subsequent commits. 
Tested via make -C test / ctest / utest and by a couple of additional unit tests that exercise blocking (e.g., partial register blocks with fewer than UNROLL_M rows and/or fewer than UNROLL_N columns). Signed-off-by: Marius Hillenbrand --- Makefile.zarch | 2 +- kernel/zarch/KERNEL.Z14 | 4 +- kernel/zarch/gemm_vec.c | 342 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 345 insertions(+), 3 deletions(-) create mode 100644 kernel/zarch/gemm_vec.c diff --git a/Makefile.zarch b/Makefile.zarch index 47ea1eb71..be1e34f6d 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector endif ifeq ($(CORE), Z14) -CCOMMON_OPT += -march=z14 -mzvector +CCOMMON_OPT += -march=z14 -mzvector -O3 FCOMMON_OPT += -march=z14 -mzvector endif diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index f6e3bec23..bd3a966b1 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -91,7 +91,7 @@ DTRMMKERNEL = trmm8x4V.S CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S -SGEMMKERNEL = strmm8x4V.S +SGEMMKERNEL = gemm_vec.c SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c @@ -102,7 +102,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - + DGEMMKERNEL = gemm8x4V.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c new file mode 100644 index 000000000..e6d613c44 --- /dev/null +++ b/kernel/zarch/gemm_vec.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) IBM Corporation 2020. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "common.h" +#include <vecintrin.h> + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#ifdef COMPLEX +#error "Handling for complex numbers is not supported in this kernel" +#endif + +#ifdef DOUBLE +#define UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif + +static const size_t unroll_m = UNROLL_M; +static const size_t unroll_n = UNROLL_N; + +/* + * Background: + * + * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication + * problem by splitting all matrices into partitions multiple times, so that the + * submatrices fit into the L1 or L2 caches. As a result, each multiplication of + * submatrices can stream data fast from L1 and L2 caches.
In between, it copies + * and rearranges the submatrices to enable contiguous memory accesses to + * improve locality in both caches and TLBs. + * + * At the heart of the algorithm is this kernel, which multiplies a "Block + * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is + * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this + * operation GEBP. This kernel further partitions GEBP twice, such that (1) + * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a + * block of C fits into the registers, while multiplying panels from A and B + * streamed from the L2 and L1 cache, respectively (GEBP_block). + * + * + * Algorithm GEBP(A, B, C, m, n, k, alpha): + * + * The problem is calculating C += alpha * (A * B) + * C is an m x n matrix, A is an m x k matrix, B is a k x n matrix. + * + * - C is in column-major-order, with an offset of ldc to the element in the + * next column (same row). + * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column + * contiguously while walking along rows. + * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row + * contiguously. + * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or + * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2 + * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a + * block-of-1). + * + * Note that packing A and B into that form is taken care of by the caller in + * driver/level3/level3.c (actually done by "copy kernels"). + * + * Steps: + * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j. + * Now, B_j should fit into the L1 cache.
+ * - For each partition, calculate C_j += alpha * (A * B_j) by + * (1) Calculate C_aux := A * B_j (see below) + * (2) unpack C_j = C_j + alpha * C_aux + * + * + * Algorithm for Calculating C_aux: + * + * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows, + * such that the m_r x n_r-submatrix of C_aux can be held in registers. Each + * submatrix of C_aux can be calculated independently, and the registers are + * added back into C_j. + * + * - For each row-block of C_aux: + * (uses a row block of A and full B_j) + * - stream over all columns of A, multiply with elements from B and + * accumulate in registers. (use different inner-kernels to exploit + * vectorization for varying block sizes) + * - add alpha * row block of C_aux back into C_j. + * + * Reference: + * + * The summary above is based on staring at various kernel implementations and: + * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix + * Multiplication, in ACM Transactions on Mathematical Software, Vol. 34, No. + * 3, May 2008. + */ + +#define VLEN_BYTES 16 +#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) + +typedef FLOAT vector_float __attribute__ ((vector_size (16))); + +/** + * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. + * + * @param[in] A Pointer to current block of input matrix A. + * @param[in] bk Number of columns in A. + * @param[in] B Pointer to current block of input matrix B. + * @param[inout] C Pointer to current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor.
+ */ +#define VECTOR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + _Static_assert( \ + ROWS % VLEN_FLOATS == 0, \ + "rows in block must be multiples of vector length"); \ + vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] = vec_splats(ZERO); \ + \ + /* \ + * Stream over the row-block of A, which is packed \ + * column-by-column, multiply by coefficients in B and add up \ + * into temporaries Caux (which the compiler will hold in \ + * registers). Vectorization: Multiply column vectors from A \ + * with scalars from B and add up in column vectors of Caux. \ + * That equates to unrolling the loop over rows (in i) and \ + * executing each unrolled iteration as a vector element. \ + */ \ + for (BLASLONG k = 0; k < bk; k++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float Ak = \ + *(vector_float *)(A + i * VLEN_FLOATS + \ + k * ROWS); \ + \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += Ak * B[j + k * COLS]; \ + } \ + } \ + \ + /* \ + * Unpack row-block of C_aux into outer C_i, multiply by \ + * alpha and add up. \ + */ \ + for (BLASLONG j = 0; j < COLS; j++) { \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float *C_ij = \ + (vector_float *)(C + i * VLEN_FLOATS + \ + j * ldc); \ + *C_ij += alpha * Caux[i][j]; \ + } \ + } \ + } + + +VECTOR_BLOCK(8, 4) +VECTOR_BLOCK(8, 2) +VECTOR_BLOCK(8, 1) +VECTOR_BLOCK(4, 4) +VECTOR_BLOCK(4, 2) +VECTOR_BLOCK(4, 1) + +#ifdef DOUBLE +VECTOR_BLOCK(2, 4) +VECTOR_BLOCK(2, 2) +#endif + +/** + * Handle calculation for row blocks in C_i of any size by dispatching into + * macro-defined (inline) functions or by deferring to a simple generic + * implementation. 
Note that the compiler can remove this awkward-looking + * dispatching code while inlining. + * + * @param[in] m Number of rows in block C_i. + * @param[in] n Number of columns in block C_i. + * @param[in] first_row Index of first row of the block C_i (relative to C). + * @param[in] A Pointer to input matrix A (note: all of it). + * @param[in] k Number of columns in A and rows in B. + * @param[in] B Pointer to current column block (panel) of input matrix B. + * @param[inout] C Pointer to current column block (panel) of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +static inline void GEBP_block(BLASLONG m, BLASLONG n, + BLASLONG first_row, + const FLOAT * restrict A, BLASLONG k, + const FLOAT * restrict B, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha) +{ + A += first_row * k; + C += first_row; + +#define BLOCK(bm, bn) \ + if (m == bm && n == bn) { \ + GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ + return; \ + } + + BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); + BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); + + #ifdef DOUBLE + BLOCK(2, 4); + BLOCK(2, 2); + #endif + +#undef BLOCK + + /* simple implementation for smaller block sizes: */ + FLOAT Caux[m][n] __attribute__ ((aligned (16))); + + /* + * Peel off first iteration (i.e., column of A) for initializing Caux + */ + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + Caux[i][j] = A[i] * B[j]; + + for (BLASLONG kk = 1; kk < k; kk++) + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + Caux[i][j] += A[i + kk * m] * B[j + kk * n]; + + for (BLASLONG i = 0; i < m; i++) + for (BLASLONG j = 0; j < n; j++) + C[i + j * ldc] += alpha * Caux[i][j]; +} + +/** + * Handle a column block (panel) of C and B while calculating C += alpha(A * B). + * + * @param[in] num_cols Number of columns in the block (in C and B). + * @param[in] first_col First column of the current block (in C and B).
+ * @param[in] A Pointer to input matrix A. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] B Pointer to input matrix B (note: all of it). + * @param[in] bm Number of rows in C and A. + * @param[inout] C Pointer to output matrix C (note: all of it). + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, + const FLOAT *restrict A, BLASLONG bk, + const FLOAT *restrict B, BLASLONG bm, + FLOAT *restrict C, BLASLONG ldc, + FLOAT alpha) { + FLOAT *restrict C_i = C + first_col * ldc; + /* + * B is in column-order with n_r packed row elements, which does + * not matter -- we always move in full such blocks of + * column*pack + */ + const FLOAT *restrict B_i = B + first_col * bk; + + /* + * Calculate C_aux := A * B_j + * then unpack C_i += alpha * C_aux. + * + * For that purpose, further partition C_aux and A into blocks + * of m_r (unroll_m) rows, or powers-of-2 if smaller. + */ + BLASLONG row = 0; + for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) + for (; bm - row >= block_size; row += block_size) + GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, + ldc, alpha); +} + +/** + * Inner kernel for matrix-matrix multiplication. C += alpha (A * B) + * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and + * C are pointers to submatrices of the actual matrices. + * + * @param[in] bm Number of rows in C and A. + * @param[in] bn Number of columns in C and B. + * @param[in] bk Number of columns in A and rows in B. + * @param[in] alpha Scalar factor. + * @param[in] ba Pointer to input matrix A. + * @param[in] bb Pointer to input matrix B. + * @param[inout] C Pointer to output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @returns 0 on success. 
+ */ +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, + FLOAT *restrict ba, FLOAT *restrict bb, + FLOAT *restrict C, BLASLONG ldc) +{ + if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) + return 0; + + /* + * interface code allocates buffers for ba and bb at page + * granularity (i.e., using mmap(MAP_ANONYMOUS), so enable the compiler + * to make use of the fact in vector load operations. + */ + ba = __builtin_assume_aligned(ba, 16); + bb = __builtin_assume_aligned(bb, 16); + + /* + * Partition B and C into blocks of n_r (unroll_n) columns, called B_i + * and C_i. For each partition, calculate C_i += alpha * (A * B_j). + * + * For remaining columns that do not fill up a block of n_r, iteratively + * use smaller block sizes of powers of 2. + */ + BLASLONG col = 0; + for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) + for (; bn - col >= block_size; col += block_size) + GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha); + + return 0; +} From 71b6eaf459e55e7b5fe5047052c39c49f16c3680 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 14:40:30 +0200 Subject: [PATCH 183/593] s390x: Use new sgemm kernel also for strmm on Z14 and newer Employ the newly added GEMM kernel also for STRMM on Z14. The implementation in C with vector intrinsics exploits FP32 SIMD operations and thereby gains performance over the existing assembly code. Extend the implementation for handling triangular matrix multiplication, accordingly. As added benefit, the more flexible C code enables us to adjust register blocking in the subsequent commit. Tested via make -C test / ctest / utest and by a couple of additional unit tests that exercise blocking. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/KERNEL.Z14 | 8 +--- kernel/zarch/gemm_vec.c | 104 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 14 deletions(-) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index bd3a966b1..49fa28175 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -86,7 +86,7 @@ DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = cgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c -STRMMKERNEL = strmm8x4V.S +STRMMKERNEL = gemm_vec.c DTRMMKERNEL = trmm8x4V.S CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S @@ -101,8 +101,6 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - - DGEMMKERNEL = gemm8x4V.S DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c @@ -145,7 +143,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index e6d613c44..a9531c7a5 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -51,6 +51,29 @@ static const size_t unroll_m = UNROLL_M; static const size_t unroll_n = UNROLL_N; +/* Handling of triangular matrices */ +#ifdef TRMMKERNEL +static const bool trmm = true; +static const bool left = +#ifdef LEFT + true; +#else + false; +#endif + +static const bool backwards = +#if defined(LEFT) != defined(TRANSA) + true; +#else + false; +#endif + +#else +static const bool trmm = false; +static const bool left = false; +static const bool backwards = false; +#endif /* TRMMKERNEL */ + /* * Background: * @@ -111,6 +134,17 @@ static const size_t unroll_n = UNROLL_N; * vectorization for varying block sizes) * - add alpha * row block of C_aux back into C_j. 
* + * Note that there are additional mechanics for handling triangular matrices, + * calculating B := alpha (A * B) where either of the matrices A or B can be + * triangular. In case of A, the macro "LEFT" is defined. In addition, A can + * optionally be transposed. + * The code effectively skips an "offset" number of columns in A and rows of B + * in each block, to save unnecessary work by exploiting the triangular nature. + * To handle all cases, the code discerns (1) a "left" mode when A is triangular + * and (2) "forward" / "backwards" modes where only the first "offset" + * columns/rows of A/B are used or where the first "offset" columns/rows are + * skipped, respectively. + * * Reference: * * The summary above is based on staring at various kernel implementations and: @@ -176,7 +210,11 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); vector_float *C_ij = \ (vector_float *)(C + i * VLEN_FLOATS + \ j * ldc); \ - *C_ij += alpha * Caux[i][j]; \ + if (trmm) { \ + *C_ij = alpha * Caux[i][j]; \ + } else { \ + *C_ij += alpha * Caux[i][j]; \ + } \ } \ } \ } @@ -209,17 +247,37 @@ VECTOR_BLOCK(2, 2) * @param[inout] C Pointer to current column block (panel) of output matrix C. * @param[in] ldc Offset between elements in adjacent columns in C. * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). + * @param[in] off Running offset for handling triangular matrices. 
*/ static inline void GEBP_block(BLASLONG m, BLASLONG n, BLASLONG first_row, const FLOAT * restrict A, BLASLONG k, const FLOAT * restrict B, FLOAT *restrict C, BLASLONG ldc, - FLOAT alpha) + FLOAT alpha, + BLASLONG offset, BLASLONG off) { + if (trmm && left) + off = offset + first_row; + A += first_row * k; C += first_row; + if (trmm) { + if (backwards) { + A += off * m; + B += off * n; + k -= off; + } else { + if (left) { + k = off + m; + } else { + k = off + n; + } + } + } + #define BLOCK(bm, bn) \ if (m == bm && n == bn) { \ GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ @@ -253,7 +311,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, for (BLASLONG i = 0; i < m; i++) for (BLASLONG j = 0; j < n; j++) - C[i + j * ldc] += alpha * Caux[i][j]; + if (trmm) { + C[i + j * ldc] = alpha * Caux[i][j]; + } else { + C[i + j * ldc] += alpha * Caux[i][j]; + } } /** @@ -268,12 +330,15 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, * @param[inout] C Pointer to output matrix C (note: all of it). * @param[in] ldc Offset between elements in adjacent columns in C. * @param[in] alpha Scalar factor. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). */ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, const FLOAT *restrict A, BLASLONG bk, const FLOAT *restrict B, BLASLONG bm, FLOAT *restrict C, BLASLONG ldc, - FLOAT alpha) { + FLOAT alpha, + BLASLONG const offset) { + FLOAT *restrict C_i = C + first_col * ldc; /* * B is in column-order with n_r packed row elements, which does @@ -282,6 +347,15 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, */ const FLOAT *restrict B_i = B + first_col * bk; + BLASLONG off = 0; + if (trmm) { + if (left) { + off = offset; + } else { + off = -offset + first_col; + } + } + /* * Calculate C_aux := A * B_j * then unpack C_i += alpha * C_aux. 
@@ -293,7 +367,7 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2) for (; bm - row >= block_size; row += block_size) GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i, - ldc, alpha); + ldc, alpha, offset, off); } /** @@ -301,6 +375,9 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and * C are pointers to submatrices of the actual matrices. * + * For triangular matrix multiplication, calculate B := alpha (A * B) where A + * or B can be triangular (in case of A, the macro LEFT will be defined). + * * @param[in] bm Number of rows in C and A. * @param[in] bn Number of columns in C and B. * @param[in] bk Number of columns in A and rows in B. @@ -309,11 +386,16 @@ static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col, * @param[in] bb Pointer to input matrix B. * @param[inout] C Pointer to output matrix C. * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices). * @returns 0 on success. */ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT *restrict ba, FLOAT *restrict bb, - FLOAT *restrict C, BLASLONG ldc) + FLOAT *restrict C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) { if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO)) return 0; @@ -326,6 +408,14 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, ba = __builtin_assume_aligned(ba, 16); bb = __builtin_assume_aligned(bb, 16); + /* + * Use offset and off even when compiled as SGEMMKERNEL to simplify + * function signatures and function calls. + */ +#ifndef TRMMKERNEL + BLASLONG const offset = 0; +#endif + /* * Partition B and C into blocks of n_r (unroll_n) columns, called B_i * and C_i. 
For each partition, calculate C_i += alpha * (A * B_j). @@ -336,7 +426,7 @@ int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, BLASLONG col = 0; for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2) for (; bn - col >= block_size; col += block_size) - GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha); + GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset); return 0; } From 1b0b4349a11f8de40037d9bddf9ddb9b094cdd2c Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 15:06:38 +0200 Subject: [PATCH 184/593] s390x/Z14: Change register blocking for SGEMM to 16x4 Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4 by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy implementations. Actually make KERNEL.Z14 more flexible, so that the change in param.h suffices. As a result, performance for SGEMM improves by around 30% on z15. On z14, FP SIMD instructions can operate on float-sized scalars in vector registers, while z13 could do that for double-sized scalars only. Thus, we can double the amount of elements of C that are held in registers in an SGEMM kernel. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/KERNEL.Z14 | 10 ++++++---- kernel/zarch/gemm_vec.c | 15 +++++++++++++++ param.h | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 49fa28175..96e6745fd 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S SGEMMKERNEL = gemm_vec.c -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index a9531c7a5..4e1b3e3fb 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); } +#if UNROLL_M == 16 +VECTOR_BLOCK(16, 4) +VECTOR_BLOCK(16, 2) +VECTOR_BLOCK(16, 1) +#endif +#if UNROLL_N == 8 +VECTOR_BLOCK(8, 8) +VECTOR_BLOCK(4, 8) +#endif VECTOR_BLOCK(8, 4) VECTOR_BLOCK(8, 2) VECTOR_BLOCK(8, 1) @@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, return; \ } +#if UNROLL_M == 16 + BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1); +#endif +#if UNROLL_N == 8 + BLOCK(8, 8); BLOCK(4, 8); +#endif BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); diff --git a/param.h b/param.h index 7094249e8..6f0a3b727 100644 --- a/param.h +++ b/param.h @@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache 
rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 8 From cb9dc36dd5d7ecf40cd8f3d8e9ffe08bc525c427 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 12 May 2020 16:14:00 +0200 Subject: [PATCH 185/593] Update CONTRIBUTORS.md Signed-off-by: Marius Hillenbrand --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 738475a93..fd4ab4bec 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -186,3 +186,4 @@ In chronological order: * Marius Hillenbrand * [2020-05-12] Revise dynamic architecture detection for IBM z + * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 From 2840432e49ca57f8338c46575a44dfe1416a20d3 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 13 May 2020 17:48:50 +0200 Subject: [PATCH 186/593] s390x: improvise vector alignment hints for older compilers Introduce inline assembly so that we can employ vector loads with alignment hints on older compilers (pre gcc-9), since these are still used in distributions such as RHEL 8 and Ubuntu 18.04 LTS. Informing the hardware about alignment can speed up vector loads. For that purpose, we can encode hints about 8-byte or 16-byte alignment of the memory operand into the opcodes. gcc-9 and newer automatically emit such hints, where applicable. Add a bit of inline assembly that achieves the same for older compilers. Since an older binutils may not know about the additional operand for the hints, we explicitly encode the opcode in hex. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index 4e1b3e3fb..2d4457f06 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -158,6 +158,32 @@ static const bool backwards = false; typedef FLOAT vector_float __attribute__ ((vector_size (16))); +/** + * Load a vector into a register, and hint on 8-byte alignment to improve + * performance. gcc-9 and newer will create these hints by themselves. For older + * compiler versions, use inline assembly to explicitly express the hint. + * Provide explicit hex encoding to cater for binutils versions that do not know + * about vector-load with alignment hints yet. + * + * Note that, for block sizes where we apply vectorization, vectors in A will + * always be 8-byte aligned. + */ +static inline vector_float vec_load_hinted(FLOAT const *restrict a) { + vector_float const *restrict addr = (vector_float const *restrict)a; + vector_float y; + +#if __GNUC__ < 9 + // hex-encode vl %[out],%[addr],3 + asm(".insn vrx,0xe70000003006,%[out],%[addr],3" + : [ out ] "=v"(y) + : [ addr ] "R"(*addr)); +#else + y = *addr; +#endif + + return y; +} + /** * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
* @@ -192,9 +218,8 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16))); */ \ for (BLASLONG k = 0; k < bk; k++) { \ for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ - vector_float Ak = \ - *(vector_float *)(A + i * VLEN_FLOATS + \ - k * ROWS); \ + vector_float Ak = vec_load_hinted( \ + A + i * VLEN_FLOATS + k * ROWS); \ \ for (BLASLONG j = 0; j < COLS; j++) \ Caux[i][j] += Ak * B[j + k * COLS]; \ From 3d5e159e7a8f5a1344fc737c6e7d4446bef686ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 May 2020 15:26:57 +0200 Subject: [PATCH 187/593] Ignore spurious all-numeric library names derived from mishandled jobserver flags --- c_check | 1 + 1 file changed, 1 insertion(+) diff --git a/c_check b/c_check index c7899c84f..8234c2081 100644 --- a/c_check +++ b/c_check @@ -310,6 +310,7 @@ $linker_a = ""; && ($flags !~ /advapi32/) && ($flags !~ /shell32/) && ($flags !~ /omp/) + && ($flags !~ /[0-9]+/) ) { $linker_l .= $flags . " " } From 55602fce56115e04e41017eb0ac9ada1326c8f1c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 May 2020 15:28:14 +0200 Subject: [PATCH 188/593] Ignore spurious all-numeric library names derived from mishandled jobserver flags --- f_check | 1 + 1 file changed, 1 insertion(+) diff --git a/f_check b/f_check index fac8fc707..d702044cc 100644 --- a/f_check +++ b/f_check @@ -335,6 +335,7 @@ if ($link ne "") { && ($flags !~ /advapi32/) && ($flags !~ /shell32/) && ($flags !~ /omp/) + && ($flags !~ /[0-9]+/) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . 
" "; From 6baa9a778d014e3f0733221338b676d8877da43b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 May 2020 17:59:31 +0200 Subject: [PATCH 189/593] Improve declaration of LAPACKE_get_nancheck --- lapack-netlib/LAPACKE/include/lapacke.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6eb0b696b..012c104bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -12575,7 +12575,7 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i /* APIs for set/get nancheck flags */ void LAPACKE_set_nancheck( int flag ); -int LAPACKE_get_nancheck( ); +int LAPACKE_get_nancheck( void ); #ifdef __cplusplus } From bdd795ed03667861b762836aa64e4b2bd33bf485 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 19 May 2020 14:30:44 +0200 Subject: [PATCH 190/593] s390x/GEMM: replace 0-init with peeled first iteration ... since it gains another ~2% of SGEMM and DGEMM performance on z15; also, the code just called for that cleanup. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index 2d4457f06..eb6d7700b 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -203,9 +203,12 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { "rows in block must be multiples of vector length"); \ vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \ \ - for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \ + for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ + vector_float A0 = \ + vec_load_hinted(A + i * VLEN_FLOATS); \ for (BLASLONG j = 0; j < COLS; j++) \ - Caux[i][j] = vec_splats(ZERO); \ + Caux[i][j] = A0 * B[j]; \ + } \ \ /* \ * Stream over the row-block of A, which is packed \ @@ -216,7 +219,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { * That equates to unrolling the loop over rows (in i) and \ * executing each unrolled iteration as a vector element. \ */ \ - for (BLASLONG k = 0; k < bk; k++) { \ + for (BLASLONG k = 1; k < bk; k++) { \ for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \ vector_float Ak = vec_load_hinted( \ A + i * VLEN_FLOATS + k * ROWS); \ From 89fe17f20e7d1d10a7ec3315bf9b1816a3d47ce9 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 19 May 2020 14:56:34 +0200 Subject: [PATCH 191/593] s390x: Use new sgemm kernel also for DGEMM and DTRMM on Z14 Apply our new GEMM kernel implementation, written in C with vector intrinsics, also for DGEMM and DTRMM on Z14 and newer (i.e., architectures with FP32 SIMD instructions). As a result, we gain around 10% in performance on z15, in addition to improving maintainability. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/KERNEL.Z14 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index 96e6745fd..3510938a7 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -87,7 +87,7 @@ CGEMVTKERNEL = cgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = gemm_vec.c -DTRMMKERNEL = trmm8x4V.S +DTRMMKERNEL = gemm_vec.c CTRMMKERNEL = ctrmm4x4V.S ZTRMMKERNEL = ztrmm4x4V.S @@ -103,7 +103,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm8x4V.S +DGEMMKERNEL = gemm_vec.c DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMONCOPY = ../generic/gemm_ncopy_4.c From d475db29c6fc9112ba3612f3b1bbdf73bf7fb96a Mon Sep 17 00:00:00 2001 From: zhangdanfeng Date: Mon, 18 May 2020 16:47:33 +0800 Subject: [PATCH 192/593] optimized for cortex-a53 Signed-off-by: zhangdanfeng --- kernel/arm64/sgemm_kernel_8x8_cortexa53.S | 2335 +++++++++++++++++++++ 1 file changed, 2335 insertions(+) create mode 100644 kernel/arm64/sgemm_kernel_8x8_cortexa53.S diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..0c9629eab --- /dev/null +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -0,0 +1,2335 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, 
C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v0.4s, v1.4s}, [pA], #32 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + + ldr x20, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x24, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x21, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x25, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x18 + ldr d6, [pB], #8 + fmov v4.d[1], x22 + ldr d3, [pA, #8] + fmov v1.d[1], x19 + ldr d7, [pB, #8] + fmov v5.d[1], x23 + fmla v16.4s, v0.4s, v4.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] 
+ fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x20 + ldr d4, [pB], #8 + fmov v6.d[1], x24 + ldr d1, [pA, #8] + fmov v3.d[1], x21 + ldr d5, [pB, #8] + fmov v7.d[1], x25 + fmla v16.4s, v2.4s, v6.s[0] + ldr x18, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x22, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x19, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x23, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x20 + fmov v6.d[1], x24 + fmov v3.d[1], x21 + fmov v7.d[1], x25 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, 
v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s, v5.4s}, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.4s, v7.4s}, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s, v5.4s}, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + ld1 {v6.4s, v7.4s}, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add 
pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M1 + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M2 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_E + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s}, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.4s}, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s}, [pCRow2] + fmla 
v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s}, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s}, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + ld1 {v6.4s}, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.2s}, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.2s}, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.2s}, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.2s}, [pCRow1] + fmla v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.2s}, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.2s}, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.2s}, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + ld1 {v6.2s}, [pCRow1] + fmla v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov 
s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ldr s0, [pCRow0] + fmla s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s6, [pCRow1] + fmla s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s0, [pCRow2] + fmla s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s28, alphaV0 + str s4, [pCRow2] + + ldr s6, [pCRow1] + fmla s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + 
fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, 
alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] + + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] + + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] + + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add 
pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add 
pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 
+ st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE8x1 + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, 
alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lsgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA 
= start of A array + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L8_M4_BEGIN + +.Lsgemm_kernel_L8_M8_20: + + mov pB, origPB + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? + blt .Lsgemm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M8_22a + .align 5 + +.Lsgemm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M8_22 + +.Lsgemm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lsgemm_kernel_L8_M8_44 + +.Lsgemm_kernel_L8_M8_40: + + INIT8x8 + +.Lsgemm_kernel_L8_M8_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_M8_100 + +.Lsgemm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lsgemm_kernel_L8_M8_46 + +.Lsgemm_kernel_L8_M8_100: + + SAVE8x8 + +.Lsgemm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L8_END + + tst counterI, #4 + ble .Lsgemm_kernel_L8_M2_BEGIN + +.Lsgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp 
counterL , #2 // is there at least 4 to do? + blt .Lsgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L8_M4_22a + .align 5 + +.Lsgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M4_22 + +.Lsgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lsgemm_kernel_L8_M4_44 + +.Lsgemm_kernel_L8_M4_40: + + INIT4x8 + +.Lsgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L8_M4_100 + +.Lsgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lsgemm_kernel_L8_M4_100: + + SAVE4x8 + +.Lsgemm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L8_M1_BEGIN + +.Lsgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M2_40 + +.Lsgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_22 + + +.Lsgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M2_100 + +.Lsgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M2_42 + +.Lsgemm_kernel_L8_M2_100: + + SAVE2x8 + +.Lsgemm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L8_END + +.Lsgemm_kernel_L8_M1_20: + + INIT1x8 
+ + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L8_M1_40 + +.Lsgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_22 + + +.Lsgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L8_M1_100 + +.Lsgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_M1_42 + +.Lsgemm_kernel_L8_M1_100: + + SAVE1x8 + +.Lsgemm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lsgemm_kernel_L999 + + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lsgemm_kernel_L4_M4_BEGIN + +.Lsgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lsgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M8_22a + .align 5 + +.Lsgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M8_22 + +.Lsgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lsgemm_kernel_L4_M8_44 + +.Lsgemm_kernel_L4_M8_40: + + INIT8x4 + +.Lsgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M8_100 + +.Lsgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lsgemm_kernel_L4_M8_100: + + SAVE8x4 + +.Lsgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lsgemm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L4_END + + tst counterI, #4 + ble .Lsgemm_kernel_L4_M2_BEGIN + +.Lsgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Lsgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lsgemm_kernel_L4_M4_22a + .align 5 + +.Lsgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M4_22 + +.Lsgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lsgemm_kernel_L4_M4_44 + +.Lsgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lsgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble .Lsgemm_kernel_L4_M4_100 + +.Lsgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lsgemm_kernel_L4_M4_100: + + SAVE4x4 + +.Lsgemm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L4_M1_BEGIN + +.Lsgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M2_40 + +.Lsgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_22 + + +.Lsgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M2_100 + +.Lsgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M2_42 + +.Lsgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lsgemm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L4_END + +.Lsgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, 
#3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L4_M1_40 + +.Lsgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_22 + + +.Lsgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L4_M1_100 + +.Lsgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_M1_42 + +.Lsgemm_kernel_L4_M1_100: + + SAVE1x4 + +.Lsgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lsgemm_kernel_L999 + + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lsgemm_kernel_L2_M4_BEGIN + +.Lsgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M8_40 + .align 5 + +.Lsgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_22 + + +.Lsgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M8_100 + +.Lsgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M8_42 + +.Lsgemm_kernel_L2_M8_100: + + SAVE8x2 + 
+.Lsgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L2_END + + tst counterI, #4 + ble .Lsgemm_kernel_L2_M2_BEGIN + +.Lsgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M4_40 + .align 5 + +.Lsgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_22 + + +.Lsgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M4_100 + +.Lsgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M4_42 + +.Lsgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lsgemm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L2_M1_BEGIN + +.Lsgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lsgemm_kernel_L2_M2_40 + +.Lsgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M2_22 + + +.Lsgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M2_100 + +.Lsgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M2_42 + +.Lsgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lsgemm_kernel_L2_M2_END: + 
+/******************************************************************************/ + +.Lsgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L2_END + +.Lsgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lsgemm_kernel_L2_M1_40 + +.Lsgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_22 + + +.Lsgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L2_M1_100 + +.Lsgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_M1_42 + +.Lsgemm_kernel_L2_M1_100: + + SAVE1x2 + +.Lsgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lsgemm_kernel_L1_M4_BEGIN + +.Lsgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M8_40 + .align 5 + +.Lsgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_22 + + +.Lsgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL 
= counterL % 8 + ble .Lsgemm_kernel_L1_M8_100 + +.Lsgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M8_42 + +.Lsgemm_kernel_L1_M8_100: + + SAVE8x1 + +.Lsgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lsgemm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lsgemm_kernel_L1_END + + tst counterI, #4 + ble .Lsgemm_kernel_L1_M2_BEGIN + +.Lsgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M4_40 + .align 5 + +.Lsgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_22 + + +.Lsgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M4_100 + +.Lsgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M4_42 + +.Lsgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lsgemm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lsgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lsgemm_kernel_L1_M1_BEGIN + +.Lsgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M2_40 + +.Lsgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_22 + + +.Lsgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M2_100 + 
+.Lsgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M2_42 + +.Lsgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lsgemm_kernel_L1_M2_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lsgemm_kernel_L1_END + +.Lsgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lsgemm_kernel_L1_M1_40 + +.Lsgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_22 + + +.Lsgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lsgemm_kernel_L1_M1_100 + +.Lsgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_M1_42 + +.Lsgemm_kernel_L1_M1_100: + + SAVE1x1 + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 0e6eb8c247acc736d1711c8782747add140c2de7 Mon Sep 17 00:00:00 2001 From: zhangdanfeng Date: Mon, 18 May 2020 16:51:33 +0800 Subject: [PATCH 193/593] sgemm kernel use sgemm_kernel_8x8_cortexa53 Signed-off-by: zhangdanfeng --- kernel/arm64/KERNEL.CORTEXA53 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index c1d33fa3e..87ca525b7 100644 
--- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -1,3 +1,5 @@ include $(KERNELDIR)/KERNEL.ARMV8 - +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_8x8_cortexa53.S +endif From edb423d772c3f91841fbad9afbff024aa109b893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=B8=B9=E6=9E=AB?= Date: Wed, 20 May 2020 21:52:49 +0800 Subject: [PATCH 194/593] align general register using to strmm_kernel_8x8 --- kernel/arm64/sgemm_kernel_8x8_cortexa53.S | 242 +++++++++++----------- 1 file changed, 120 insertions(+), 122 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S index 0c9629eab..4fcce38d5 100644 --- a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ - #define ASSEMBLER #include "common.h" @@ -78,14 +77,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 17 // 18 must save // 19 must save -// 20 must save -// 21 must save -// 22 must save -// 23 must save -// 24 must save -// 25 must save -// 26 must save -// 27 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 // 28 must save // 29 frame // 30 link @@ -155,13 +154,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr d3, [pA, #8] ldr d7, [pB, #8] - ldr x20, [pA], #16 + ldr x22, [pA], #16 fmul v16.4s, v0.4s, v4.s[0] - ldr x24, [pB], #16 + ldr x26, [pB], #16 fmul v17.4s, v1.4s, v4.s[0] - ldr x21, [pA], #8 + ldr x23, [pA], #8 fmul v18.4s, v0.4s, v4.s[1] - ldr x25, [pB], #8 + ldr x27, [pB], #8 fmul v19.4s, v1.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] fmul v21.4s, v1.4s, v4.s[2] @@ -179,21 +178,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x8_M1 ldr d2, [pA], #8 - fmov v0.d[1], x18 + fmov v0.d[1], x20 ldr d6, [pB], #8 - fmov v4.d[1], x22 + fmov v4.d[1], x24 ldr d3, [pA, #8] - fmov v1.d[1], x19 + fmov v1.d[1], x21 ldr d7, [pB, #8] - fmov v5.d[1], x23 + fmov v5.d[1], x25 fmla v16.4s, v0.4s, v4.s[0] - ldr x20, [pA], #16 + ldr x22, [pA], #16 fmla v17.4s, v1.4s, v4.s[0] - ldr x24, [pB], #16 + ldr x26, [pB], #16 fmla v18.4s, v0.4s, v4.s[1] - ldr x21, [pA], #8 + ldr x23, [pA], #8 fmla v19.4s, v1.4s, v4.s[1] - ldr x25, [pB], #8 + ldr x27, [pB], #8 fmla v20.4s, v0.4s, v4.s[2] fmla v21.4s, v1.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] @@ -210,21 +209,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x8_M2 ldr d0, [pA], #8 - fmov v2.d[1], x20 + fmov v2.d[1], x22 ldr d4, [pB], #8 - fmov v6.d[1], x24 + fmov v6.d[1], x26 ldr d1, [pA, #8] - fmov v3.d[1], x21 + fmov v3.d[1], x23 ldr d5, [pB, #8] - fmov v7.d[1], x25 + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] - ldr x18, [pA], #16 + ldr x20, [pA], #16 fmla v17.4s, v3.4s, v6.s[0] - ldr x22, [pB], #16 + ldr x24, [pB], #16 fmla v18.4s, v2.4s, v6.s[1] - ldr x19, [pA], #8 + ldr x21, [pA], #8 fmla v19.4s, v3.4s, v6.s[1] - ldr x23, [pB], #8 + ldr x25, [pB], #8 fmla v20.4s, v2.4s, v6.s[2] fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] @@ -240,10 +239,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_E - fmov v2.d[1], x20 - fmov v6.d[1], x24 - fmov v3.d[1], x21 - fmov v7.d[1], x25 + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] fmla v17.4s, v3.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] @@ -363,67 +362,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_I - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 + ld1 {v0.4s}, [pA], #16 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x21, [pA], #8 fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 fmul v20.4s, v0.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] fmul v30.4s, v0.4s, v5.s[3] - - ld1 {v6.4s}, [pB] - add pB, pB, #16 - ld1 {v7.4s}, [pB] - add pB, pB, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 .endm .macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 fmla v16.4s, v0.4s, v4.s[0] + ldr x21, [pA], #8 fmla v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #16 fmla v20.4s, v0.4s, v4.s[2] + ldr x27, [pB], #8 fmla v22.4s, v0.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] fmla v30.4s, v0.4s, v5.s[3] - - ld1 {v6.4s}, [pB] - add pB, pB, #16 - ld1 {v7.4s}, [pB] - add pB, pB, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 .endm .macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x21 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #8 fmla v18.4s, v2.4s, v6.s[1] + ldr x24, [pB], #16 fmla v20.4s, v2.4s, v6.s[2] + ldr x25, [pB], #8 fmla v22.4s, v2.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] fmla v30.4s, v2.4s, v7.s[3] - - ld1 
{v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 .endm .macro KERNEL4x8_E + fmov v2.d[1], x21 + fmov v6.d[1], x26 + fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] @@ -678,93 +679,90 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + ldr d9, [pB], #8 + ldr d2, [pA], #8 + ldr d3, [pA, #8] fmul v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 fmul v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 fmul v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmul v24.4s, v0.4s, v8.s[2] + fmul v25.4s, v1.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] + fmul v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL8x4_M1 + ldr d9, [pB], #8 + fmov v8.d[1], x24 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d3, [pA, #8] + fmov v1.d[1], x21 fmla v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 fmla v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 fmla v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.s[0] - fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, 
v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldr d8, [pB], #8 + fmov v9.d[1], x25 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v9.s[0] + ldr x20, [pA], #16 + fmla v20.4s, v2.4s, v9.s[1] + ldr x21, [pA], #8 + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.s[0] - fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmov v9.d[1], x25 + fmov v2.d[1], x22 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + fmla v17.4s, v3.4s, v9.s[0] + fmla v20.4s, v2.4s, v9.s[1] + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] fmla v20.4s, v0.4s, v8.s[1] fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] .endm .macro SAVE8x4 From a1fc6041cdeaf10cbaab2c67f8001f795ef779ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=B8=B9=E6=9E=AB?= Date: Wed, 20 May 2020 21:55:32 +0800 Subject: [PATCH 
195/593] use general register to speedup --- kernel/arm64/strmm_kernel_8x8_cortexa53.S | 2823 +++++++++++++++++++++ 1 file changed, 2823 insertions(+) create mode 100644 kernel/arm64/strmm_kernel_8x8_cortexa53.S diff --git a/kernel/arm64/strmm_kernel_8x8_cortexa53.S b/kernel/arm64/strmm_kernel_8x8_cortexa53.S new file mode 100644 index 000000000..4b84623f3 --- /dev/null +++ b/kernel/arm64/strmm_kernel_8x8_cortexa53.S @@ -0,0 +1,2823 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save pA0_2, pA0_3 +// 21 must save pA0_6, pA0_7 +// 22 must save pA1_2, pA1_3 +// 23 must save pA1_6, pA1_7 +// 24 must save pB0_2, pB0_3 +// 25 must save pB0_6, pB0_7 +// 26 must save pB1_2, pB1_3 +// 27 must save pB1_6, pB1_7 +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, 
pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v0.4s, v1.4s}, [pA], #32 + ld1 {v4.4s, v5.4s}, [pB], #32 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d3, [pA, #8] + ldr d7, [pB, #8] + + ldr x22, [pA], #16 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v17.4s, v1.4s, v4.s[0] + ldr x23, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul 
v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #16 + fmla v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #16 + fmla v18.4s, v0.4s, v4.s[1] + ldr x23, [pA], #8 + fmla v19.4s, v1.4s, v4.s[1] + ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro KERNEL8x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #16 + fmla v17.4s, v3.4s, v6.s[0] + ldr x24, [pB], #16 + fmla v18.4s, v2.4s, v6.s[1] + ldr x21, [pA], #8 + fmla v19.4s, v3.4s, v6.s[1] + ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_E + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmov v3.d[1], x23 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla 
v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov 
s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v0.4s}, [pA], #16 + ld1 {v4.4s, v5.4s}, [pB], #32 + + ldr d2, [pA], #8 + ldr d6, [pB], #8 + ldr d7, [pB, #8] + ldr x21, [pA], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #16 + fmul v18.4s, v0.4s, v4.s[1] + ldr x27, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M1 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr d7, [pB, #8] + fmov v5.d[1], x25 + fmla v16.4s, v0.4s, v4.s[0] + ldr x21, [pA], #8 + fmla v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #16 + fmla v20.4s, v0.4s, v4.s[2] + ldr x27, [pB], #8 + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro KERNEL4x8_M2 + ldr d0, [pA], #8 + fmov v2.d[1], x21 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr d5, [pB, #8] + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + ldr x20, [pA], #8 + fmla v18.4s, v2.4s, v6.s[1] + ldr x24, [pB], #16 + fmla v20.4s, v2.4s, v6.s[2] + ldr x25, [pB], #8 + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_E + fmov v2.d[1], x21 + fmov v6.d[1], x26 + fmov v7.d[1], x27 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, 
v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + + fmul v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + + fmul v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v28.2s, 
alphaV0 + st1 {v4.2s}, [pCRow2] + + + fmul v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + + fmul s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s28, alphaV0 + str s4, [pCRow2] + + + fmul s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + + ldr d9, [pB], #8 + ldr d2, [pA], #8 + ldr d3, [pA, #8] + fmul v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmul v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmul v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v8.s[2] + fmul v25.4s, v1.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] + fmul v29.4s, v1.4s, v8.s[3] +.endm 
+ +.macro KERNEL8x4_M1 + ldr d9, [pB], #8 + fmov v8.d[1], x24 + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d3, [pA, #8] + fmov v1.d[1], x21 + fmla v16.4s, v0.4s, v8.s[0] + ldr x25, [pB], #8 + fmla v17.4s, v1.4s, v8.s[0] + ldr x22, [pA], #16 + fmla v20.4s, v0.4s, v8.s[1] + ldr x23, [pA], #8 + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro KERNEL8x4_M2 + ldr d8, [pB], #8 + fmov v9.d[1], x25 + ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d1, [pA, #8] + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v9.s[0] + ldr x20, [pA], #16 + fmla v20.4s, v2.4s, v9.s[1] + ldr x21, [pA], #8 + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_E + fmov v9.d[1], x25 + fmov v2.d[1], x22 + fmov v3.d[1], x23 + fmla v16.4s, v2.4s, v9.s[0] + fmla v17.4s, v3.4s, v9.s[0] + fmla v20.4s, v2.4s, v9.s[1] + fmla v21.4s, v3.4s, v9.s[1] + fmla v24.4s, v2.4s, v9.s[2] + fmla v25.4s, v3.4s, v9.s[2] + fmla v28.4s, v2.4s, v9.s[3] + fmla v29.4s, v3.4s, v9.s[3] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.4s}, [pB], #16 + ld1 {v0.4s, v1.4s}, [pA], #32 + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v1.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v1.4s, v8.s[3] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, 
alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] + + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] + + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] + + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] + + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] + + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] + + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add 
pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] + + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] +.endm + +.macro SAVE2x4 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 
{v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] +.endm + +.macro SAVE2x2 + + fmul v8.2s, v16.2s, alphaV0 
+ st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + 
fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +.Lstrmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble .Lstrmm_kernel_L8_M4_BEGIN + +.Lstrmm_kernel_L8_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + add pB, pB, temp +#endif + +#if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 16 to do? + blt .Lstrmm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M8_22a + .align 5 + +.Lstrmm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M8_22 + +.Lstrmm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_M2 + KERNEL8x8_M1 + KERNEL8x8_E + + b .Lstrmm_kernel_L8_M8_44 + +.Lstrmm_kernel_L8_M8_40: + + INIT8x8 + +.Lstrmm_kernel_L8_M8_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_M8_100 + +.Lstrmm_kernel_L8_M8_46: + + KERNEL8x8_SUB + + subs counterL, counterL, 1 + bgt .Lstrmm_kernel_L8_M8_46 + +.Lstrmm_kernel_L8_M8_100: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +.Lstrmm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L8_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M4_BEGIN: + + mov counterI, 
origM + tst counterI , #7 + ble .Lstrmm_kernel_L8_END + + tst counterI, #4 + ble .Lstrmm_kernel_L8_M2_BEGIN + +.Lstrmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L8_M4_22a + .align 5 + +.Lstrmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M4_22 + +.Lstrmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b .Lstrmm_kernel_L8_M4_44 + +.Lstrmm_kernel_L8_M4_40: + + INIT4x8 + +.Lstrmm_kernel_L8_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L8_M4_100 + +.Lstrmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +.Lstrmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lstrmm_kernel_L8_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L8_END + + tst counterI, #2 // counterI = 
counterI / 2 + ble .Lstrmm_kernel_L8_M1_BEGIN + +.Lstrmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L8_M2_40 + +.Lstrmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_22 + + +.Lstrmm_kernel_L8_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M2_100 + +.Lstrmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M2_42 + +.Lstrmm_kernel_L8_M2_100: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L8_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L8_END + +.Lstrmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L8_M1_40 + +.Lstrmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_22 + + +.Lstrmm_kernel_L8_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L8_M1_100 + +.Lstrmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_M1_42 + +.Lstrmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +.Lstrmm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble .Lstrmm_kernel_L999 + + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble 
.Lstrmm_kernel_L4_M4_BEGIN + +.Lstrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M8_22a + .align 5 + +.Lstrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M8_22 + +.Lstrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b .Lstrmm_kernel_L4_M8_44 + +.Lstrmm_kernel_L4_M8_40: + + INIT8x4 + +.Lstrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M8_100 + +.Lstrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +.Lstrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne .Lstrmm_kernel_L4_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L4_END + + tst counterI, #4 + ble .Lstrmm_kernel_L4_M2_BEGIN + 
+.Lstrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble .Lstrmm_kernel_L4_M4_22a + .align 5 + +.Lstrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M4_22 + +.Lstrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b .Lstrmm_kernel_L4_M4_44 + +.Lstrmm_kernel_L4_M4_40: + + INIT4x4 + +.Lstrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble .Lstrmm_kernel_L4_M4_100 + +.Lstrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +.Lstrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L4_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L4_M1_BEGIN + +.Lstrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 
+ mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M2_40 + +.Lstrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_22 + + +.Lstrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M2_100 + +.Lstrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M2_42 + +.Lstrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L4_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L4_END + +.Lstrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // 
counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L4_M1_40 + +.Lstrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_22 + + +.Lstrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L4_M1_100 + +.Lstrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_M1_42 + +.Lstrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lstrmm_kernel_L999 + + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble .Lstrmm_kernel_L2_M4_BEGIN + +.Lstrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl 
temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M8_40 + .align 5 + +.Lstrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_22 + + +.Lstrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M8_100 + +.Lstrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M8_42 + +.Lstrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L2_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L2_END + + tst counterI, #4 + ble .Lstrmm_kernel_L2_M2_BEGIN + +.Lstrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr 
counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M4_40 + .align 5 + +.Lstrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_22 + + +.Lstrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M4_100 + +.Lstrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M4_42 + +.Lstrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L2_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L2_M1_BEGIN + +.Lstrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lstrmm_kernel_L2_M2_40 + +.Lstrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt 
.Lstrmm_kernel_L2_M2_22 + + +.Lstrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M2_100 + +.Lstrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M2_42 + +.Lstrmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +.Lstrmm_kernel_L2_M2_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L2_END + +.Lstrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lstrmm_kernel_L2_M1_40 + +.Lstrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_22 + + +.Lstrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L2_M1_100 + +.Lstrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_M1_42 + +.Lstrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub 
tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +.Lstrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble .Lstrmm_kernel_L1_M4_BEGIN + +.Lstrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M8_40 + .align 5 + +.Lstrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_22 + + +.Lstrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M8_100 + 
+.Lstrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M8_42 + +.Lstrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +.Lstrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt .Lstrmm_kernel_L1_M8_20 + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble .Lstrmm_kernel_L1_END + + tst counterI, #4 + ble .Lstrmm_kernel_L1_M2_BEGIN + +.Lstrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M4_40 + .align 5 + +.Lstrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_22 + + +.Lstrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M4_100 + +.Lstrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M4_42 + +.Lstrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, 
tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +.Lstrmm_kernel_L1_M4_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lstrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lstrmm_kernel_L1_M1_BEGIN + +.Lstrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M2_40 + +.Lstrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_22 + + +.Lstrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M2_100 + +.Lstrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M2_42 + +.Lstrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +.Lstrmm_kernel_L1_M2_END: + 
+/******************************************************************************/ + +.Lstrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lstrmm_kernel_L1_END + +.Lstrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lstrmm_kernel_L1_M1_40 + +.Lstrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_22 + + +.Lstrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lstrmm_kernel_L1_M1_100 + +.Lstrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_M1_42 + +.Lstrmm_kernel_L1_M1_100: + + SAVE1x1 + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 9df79ae9a3ff69853020e78437d4394f3b97a332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=B8=B9=E6=9E=AB?= Date: Wed, 20 May 2020 21:57:12 +0800 Subject: [PATCH 196/593] update sgemm 
and strmm kernel selecting strategy --- kernel/arm64/KERNEL.CORTEXA53 | 193 +++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index 87ca525b7..4219acf98 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -1,5 +1,194 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = 
gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +SDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_8x8_cortexa53.S +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) 
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) endif +ZGEMMONCOPY = 
../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) From ea5bdc3f72dfffbf86d708da95792e8657e90fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=B8=B9=E6=9E=AB?= Date: Wed, 20 May 2020 22:34:47 +0800 Subject: [PATCH 197/593] split cortex-a53 param to match 8x8 kernel --- param.h | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/param.h b/param.h index 6f0a3b727..c780e7ef1 100644 --- a/param.h +++ b/param.h @@ -2623,7 +2623,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 -#if defined(CORTEXA53) || defined(CORTEXA57) || \ +#if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ defined(FALKOR) || defined(TSV110) || defined(EMAG8180) @@ -2669,6 +2669,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 +#elif defined(CORTEXA53) + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + #elif defined(THUNDERX) #define SGEMM_DEFAULT_UNROLL_M 4 From 2a3aa913541d987ce8de0423fbfa1ca2ca07c05d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=B8=B9=E6=9E=AB?= Date: Wed, 20 May 2020 22:35:26 +0800 Subject: [PATCH 198/593] update CONTRIBUTORS.md, 
adding myself --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index fd4ab4bec..aba39e56f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -187,3 +187,6 @@ In chronological order: * Marius Hillenbrand * [2020-05-12] Revise dynamic architecture detection for IBM z * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 + +* Danfeng Zhang + * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file From 06387ac0e6c29d636dee1ae7d7c935dd9180591d Mon Sep 17 00:00:00 2001 From: Guodong Xu Date: Mon, 25 May 2020 10:40:12 +0000 Subject: [PATCH 199/593] make GCC version detection OS-independent Previous design put GCC version detection inside of OSNAME 'WINNT'. However, such detections are required for 'Linux' and possibly other OS'es as well. For example, there is usage of the GCC versions in Makefile.arm64. When compiling on Linux machine, in the previous design, Makefile.arm64 will not know the correct GCC version. The fix is to move GCC version detection into common part, not wrapped by anything. 
Signed-off-by: Guodong Xu --- Makefile.system | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Makefile.system b/Makefile.system index 98d9ae313..08637e9ac 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,6 +277,15 @@ NO_LAPACK = 1 override FEXTRALIB = endif +ifeq ($(C_COMPILER), GCC) +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) +GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +endif + # # OS dependent settings # @@ -323,13 +332,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -DMS_ABI endif -ifeq ($(C_COMPILER), GCC) #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Major version > 4 # It is compatible with MSVC ABI. @@ -343,7 +346,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) CCOMMON_OPT += -DMS_ABI endif endif -endif # Ensure the correct stack alignment on Win32 # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 From 33c8790603bedee92fb7bcd458aa464cdcffbc7a Mon Sep 17 00:00:00 2001 From: pkubaj Date: Mon, 25 May 2020 13:14:09 +0200 Subject: [PATCH 200/593] Add powerpc (32-bit) Only powerpc64 is present. 
--- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index 98d9ae313..0969cb70e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc) +override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 else ifeq ($(ARCH), aarch64) From 200296b0f4c9f070632d29d4ed8ebb738914a2dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 May 2020 13:23:51 +0200 Subject: [PATCH 201/593] remove libomp from link list only for pgfortran at least the AMD (aocc) flavor of flang wants to link to a (real or dummy) libomp by default --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index d702044cc..fd06a6516 100644 --- a/f_check +++ b/f_check @@ -334,7 +334,7 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/) + && ($vendor =~ /PGI/ && $flags !~ /omp/) && ($flags !~ /[0-9]+/) && ($flags !~ /^\-l$/) ) { From 6e270f91ec9bd610678b4d4a9026a653a402d9ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 May 2020 13:29:10 +0200 Subject: [PATCH 202/593] add support for RETURN_BY_STACK semantics, e.g. 
clang --- benchmark/zdot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmark/zdot.c b/benchmark/zdot.c index ed9d4d2e8..136135c9c 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); - +#ifdef RETURN_BY_STACK + DOT (&result , &m, x, &inc_x, y, &inc_y ); +#else result = DOT (&m, x, &inc_x, y, &inc_y ); - +#endif gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; From ced49466f05d4e96b46d377b0a1374f4a72aafe8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 May 2020 13:35:51 +0200 Subject: [PATCH 203/593] Use the fortran compiler to link LAPACK-related benchmarks to fix linking problems with (at least) the AMD version of flang that creates dependencies on more than just the fortran runtime. --- benchmark/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 53f422be4..2f70ceaf3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1825,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1841,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm 
dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1858,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1875,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1891,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1907,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1924,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX) ##################################### Cgetri 
#################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1941,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) From 76d2612e0c02b20bfb6e9f57e9ad02ed3242e601 Mon Sep 17 00:00:00 2001 From: Ilhan Polat Date: Sat, 30 May 2020 14:11:11 +0200 Subject: [PATCH 204/593] BUG: Fix the loop range in ZHEEQUB.f --- lapack-netlib/SRC/zheequb.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zheequb.f b/lapack-netlib/SRC/zheequb.f index d698232e8..7d719f41e 100644 --- a/lapack-netlib/SRC/zheequb.f +++ b/lapack-netlib/SRC/zheequb.f @@ -271,7 +271,7 @@ AVG = AVG / N STD = 0.0D0 - DO I = N+1, N + DO I = N+1, 2*N WORK( I ) = S( I-N ) * WORK( I-N ) - AVG END DO CALL ZLASSQ( N, WORK( N+1 ), 1, SCALE, SUMSQ ) From 909897f13b286189d72b503c35fbec2d774dbaa2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 May 2020 12:37:57 +0200 Subject: [PATCH 205/593] Document option USE_LOCKING --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 70760d64d..5118475cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, 
aarch64 or ppc only)" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() From 4db00121dc08789a49084e440fa360d150651abe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 May 2020 12:39:36 +0200 Subject: [PATCH 206/593] Disable EXPRECISION and add -lm on OSX (same as the BSDs and Linux) --- cmake/os.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index 2d25e7aaa..c644bc3f7 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () -if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") +if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") set(EXTRALIB "${EXTRALIB} -lm") set(NO_EXPRECISION 1) endif () From 86552bf4c74708fb53dd69253de8ef7dd948170a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 May 2020 15:22:12 +0200 Subject: [PATCH 207/593] Update f_check --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index fd06a6516..17d863224 100644 --- a/f_check +++ b/f_check @@ -334,7 +334,7 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($vendor =~ /PGI/ && $flags !~ /omp/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) && ($flags !~ /[0-9]+/) && ($flags !~ /^\-l$/) ) { From b31a68b835500d8880a8b366457ca3c2112db630 Mon Sep 17 00:00:00 2001 From: Leonard Lausen 
Date: Sun, 31 May 2020 01:17:05 +0000 Subject: [PATCH 208/593] Add Github Actions test for DYNAMIC_ARCH builds --- .github/workflows/dynamic_arch.yml | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/dynamic_arch.yml diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml new file mode 100644 index 000000000..a89e53738 --- /dev/null +++ b/.github/workflows/dynamic_arch.yml @@ -0,0 +1,70 @@ +name: continuous build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + build: [cmake, make] + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Compilation cache + uses: actions/cache@v2 + with: + path: ~/.ccache + # We include the commit sha in the cache key, as new cache entries are + # only created if there is no existing entry for the key yet. + key: ${{ runner.os }}-ccache-${{ github.sha }} + # Restore any ccache cache entry, if none for + # ${{ runner.os }}-ccache-${{ github.sha }} exists + restore-keys: | + ${{ runner.os }}-ccache + + - name: Install Dependencies + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + sudo apt-get install -y gfortran cmake ccache + elif [ "$RUNNER_OS" == "macOS" ]; then + brew install coreutils cmake ccache + else + echo "$RUNNER_OS not supported" + exit 1 + fi + ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB + + - name: Build + if: matrix.build == 'make' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 + + - name: CMake build + if: matrix.build == 'cmake' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" 
== "macOS" ]; then + export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + mkdir build + cd build + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 .. + make -j$(nproc) From 5a709b8340d8fac767215c32db694a481530c30c Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 1 Jun 2020 20:51:11 +0000 Subject: [PATCH 209/593] Print CPU info in output --- .github/workflows/dynamic_arch.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index a89e53738..8e90681ce 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -26,6 +26,17 @@ jobs: restore-keys: | ${{ runner.os }}-ccache + - name: Print system information + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + cat /proc/cpuinfo + elif [ "$RUNNER_OS" == "macOS" ]; then + sysctl -a | grep machdep.cpu + else + echo "$RUNNER_OS not supported" + exit 1 + fi + - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then From 54fa90fb251927d5cdc9cd7e8430443d8e19cea7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Jun 2020 17:31:45 +0200 Subject: [PATCH 210/593] Keep apple clang 11.0.3 from trying to optimize this (and running out of registers) --- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index d174bbcc3..797c3a823 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize off +#endif /* comment below left for history, data does not represent the implementation in this file */ From 9f7358d7dc768ed6e126891e61eebde2e34e29e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 08:52:53 +0200 Subject: [PATCH 211/593] Keep Apple Clang from optimizing this --- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index 3b1af33c1..aa355e10e 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -1,3 +1,8 @@ +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize off +#endif + + /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ /* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ From b1ee81228a06e78b20b4611c49f810e507d329b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 09:13:29 +0200 Subject: [PATCH 212/593] Change complex DOT and ROT to generic kernels and switch CGEMM in response to test failures seen in #2628 and BLAS-Tester --- kernel/power/KERNEL.PPCG4 | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index f615754bb..54660b54d 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S -CDOTKERNEL = zdot_ppc440.S -ZDOTKERNEL = zdot_ppc440.S +#CDOTKERNEL = 
zdot_ppc440.S +#ZDOTKERNEL = zdot_ppc440.S +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c ISAMAXKERNEL = iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S @@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S -CROTKERNEL = zrot_ppc440.S -ZROTKERNEL = zrot_ppc440.S +#CROTKERNEL = zrot_ppc440.S +#ZROTKERNEL = zrot_ppc440.S +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S @@ -78,13 +83,18 @@ DGEMMINCOPYOBJ = DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = zgemm_kernel_altivec_g4.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +#CGEMMKERNEL = zgemm_kernel_altivec_g4.S +#CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +#CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMONCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = +#cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = +#cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_g4.S From f16e39554dc6d65074caefe4b61aed2700099618 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 09:15:29 +0200 Subject: [PATCH 213/593] Change PPCG4 CGEMM_M to match kernel change --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index c780e7ef1..04928277c 100644 --- a/param.h +++ b/param.h @@ -1974,7 +1974,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From c2b3f0b3f63cbc8aae1ca3f348ce4609649f8829 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 10:22:15 +0200 Subject: [PATCH 214/593] Revert "keep Apple Clang from optimizing this" --- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index aa355e10e..e0937fa38 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -1,6 +1,6 @@ -#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 -#pragma clang optimize off -#endif +//#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +//#pragma clang optimize off +//#endif /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ From c2001f7756a9635a10ae547f6fdf28ee7e7933a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 12:18:15 +0200 Subject: [PATCH 215/593] Make cmake build verbose to see options in use --- .github/workflows/dynamic_arch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 8e90681ce..06fff7168 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -77,5 +77,5 @@ jobs: mkdir build cd build - cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 .. + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILES=ON .. 
make -j$(nproc) From e153bdeb703ab135eb6f7d83eef1723d642fe10c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 13:46:43 +0200 Subject: [PATCH 216/593] Update dynamic_arch.yml --- .github/workflows/dynamic_arch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 06fff7168..f79c547af 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -77,5 +77,5 @@ jobs: mkdir build cd build - cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILES=ON .. + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON .. make -j$(nproc) From 89323458a9d44a1971ecfd9d7e4ce2ddad0eafd5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 15:07:25 +0200 Subject: [PATCH 217/593] preset optimization level for apple clang --- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index e0937fa38..d81637fa8 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -1,6 +1,6 @@ -//#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 -//#pragma clang optimize off -//#endif +#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 +#pragma clang optimize "O2" +#endif /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ From 456dc04441fa72b11e600a69c8d752a0d93ecc8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 15:15:41 +0200 Subject: [PATCH 218/593] Update sgemm_kernel_16x4_skylakex_3.c --- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 5 ----- 1 file 
changed, 5 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index d81637fa8..3b1af33c1 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -1,8 +1,3 @@ -#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 -#pragma clang optimize "O2" -#endif - - /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ /* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */ From e6e2e531bc850a0c213dc3fb8c2ab07a2802b430 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 15:16:27 +0200 Subject: [PATCH 219/593] revert clang pragma --- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 797c3a823..d174bbcc3 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -24,9 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ -#if defined(__apple_build_version__) && __clang_major__ == 11 && __clang_minor__ == 0 && __clang_patchlevel__ == 3 -#pragma clang optimize off -#endif /* comment below left for history, data does not represent the implementation in this file */ From a8f42ae85ce696ecd2e258c2d6b770f588045501 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 3 Jun 2020 15:28:59 +0200 Subject: [PATCH 220/593] set cmake build type to Release --- .github/workflows/dynamic_arch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index f79c547af..b6a4090bd 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -77,5 +77,5 @@ jobs: mkdir build cd build - cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON .. + cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. make -j$(nproc) From f82fa802d164a064da257bb459c3d13629fd56f8 Mon Sep 17 00:00:00 2001 From: ZhangDanfeng <467688405@qq.com> Date: Thu, 4 Jun 2020 02:08:48 +0800 Subject: [PATCH 221/593] Insert prefetch Signed-off-by: ZhangDanfeng <467688405@qq.com> --- kernel/arm64/sgemm_kernel_8x8_cortexa53.S | 664 +++++++++++----------- 1 file changed, 319 insertions(+), 345 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S index 4fcce38d5..fec0c9ae9 100644 --- a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S +++ b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -57,6 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define alpha3 s15 #define alphaV3 v15.s[0] +#define A_PRE_SIZE 640 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 96 + // 00 origM // 01 origN // 02 origK @@ -147,13 +151,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_I - ld1 {v0.4s, v1.4s}, [pA], #32 - ld1 {v4.4s, v5.4s}, [pB], #32 + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 + ldr d2, [pA], #8 ldr d6, [pB], #8 ldr d3, [pA, #8] ldr d7, [pB, #8] - ldr x22, [pA], #16 fmul v16.4s, v0.4s, v4.s[0] ldr x26, [pB], #16 @@ -163,7 +167,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr x27, [pB], #8 fmul v19.4s, v1.4s, v4.s[1] fmul v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmul v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmul v22.4s, v0.4s, v4.s[3] fmul v23.4s, v1.4s, v4.s[3] fmul v24.4s, v0.4s, v5.s[0] @@ -194,7 +200,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v19.4s, v1.4s, v4.s[1] ldr x27, [pB], #8 fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] @@ -225,7 +233,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v19.4s, v3.4s, v6.s[1] ldr x25, [pB], #8 fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] @@ -248,7 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmla v18.4s, v2.4s, v6.s[1] fmla v19.4s, v3.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v3.4s, v6.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] fmla v24.4s, v2.4s, v7.s[0] @@ -262,21 +274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp q4, q5, [pB], #32 fmla v16.4s, v0.4s, v4.s[0] fmla v17.4s, v1.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v19.4s, v1.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla v21.4s, v1.4s, v4.s[2] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v22.4s, v0.4s, v4.s[3] fmla v23.4s, v1.4s, v4.s[3] fmla v24.4s, v0.4s, v5.s[0] @@ -290,66 +298,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x8 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v2.4s, v3.4s}, [pCRow1] + ldp q2, q3, [pCRow1] fmla v2.4s, v18.4s, alphaV2 fmla v3.4s, v19.4s, alphaV3 - st1 {v2.4s, v3.4s}, [pCRow1] + stp q2, q3, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v4.4s, v5.4s}, [pCRow2] + ldp q4, q5, [pCRow2] fmla v4.4s, v20.4s, alphaV0 fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow2] + stp q4, q5, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v6.4s, v7.4s}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.4s, v22.4s, alphaV2 fmla v7.4s, v23.4s, alphaV3 - st1 {v6.4s, v7.4s}, [pCRow1] + stp q6, q7, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v0.4s, v1.4s}, [pCRow2] + ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + stp q0, q1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v2.4s, v3.4s}, [pCRow1] + ldp q2, q3, [pCRow1] fmla v2.4s, v26.4s, alphaV2 fmla v3.4s, v27.4s, alphaV3 - st1 {v2.4s, v3.4s}, [pCRow1] + stp q2, q3, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v4.4s, v5.4s}, [pCRow2] + ldp q4, q5, [pCRow2] fmla v4.4s, v28.4s, alphaV0 fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow2] + stp q4, q5, [pCRow2] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld1 {v6.4s, v7.4s}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.4s, v30.4s, alphaV2 fmla v7.4s, v31.4s, alphaV3 - st1 {v6.4s, v7.4s}, [pCRow1] + stp q6, q7, [pCRow1] add pCRow0, pCRow0, #32 .endm /******************************************************************************/ - .macro INIT4x8 fmov s16, wzr fmov s18, wzr @@ -362,19 +378,20 
@@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_I - ld1 {v0.4s}, [pA], #16 - ld1 {v4.4s, v5.4s}, [pB], #32 + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 ldr d2, [pA], #8 ldr d6, [pB], #8 ldr d7, [pB, #8] - ldr x21, [pA], #8 + ldr x22, [pA], #8 fmul v16.4s, v0.4s, v4.s[0] ldr x26, [pB], #16 fmul v18.4s, v0.4s, v4.s[1] ldr x27, [pB], #8 fmul v20.4s, v0.4s, v4.s[2] fmul v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmul v24.4s, v0.4s, v5.s[0] fmul v26.4s, v0.4s, v5.s[1] fmul v28.4s, v0.4s, v5.s[2] @@ -388,13 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov v4.d[1], x24 ldr d7, [pB, #8] fmov v5.d[1], x25 + ldr x22, [pA], #8 fmla v16.4s, v0.4s, v4.s[0] - ldr x21, [pA], #8 - fmla v18.4s, v0.4s, v4.s[1] ldr x26, [pB], #16 - fmla v20.4s, v0.4s, v4.s[2] + fmla v18.4s, v0.4s, v4.s[1] ldr x27, [pB], #8 + fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] @@ -403,18 +421,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_M2 ldr d0, [pA], #8 - fmov v2.d[1], x21 + fmov v2.d[1], x22 ldr d4, [pB], #8 fmov v6.d[1], x26 ldr d5, [pB, #8] fmov v7.d[1], x27 - fmla v16.4s, v2.4s, v6.s[0] ldr x20, [pA], #8 - fmla v18.4s, v2.4s, v6.s[1] + fmla v16.4s, v2.4s, v6.s[0] ldr x24, [pB], #16 - fmla v20.4s, v2.4s, v6.s[2] + fmla v18.4s, v2.4s, v6.s[1] ldr x25, [pB], #8 + fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] @@ -422,13 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmov v2.d[1], x21 + fmov v2.d[1], x22 fmov v6.d[1], x26 fmov v7.d[1], x27 fmla v16.4s, v2.4s, v6.s[0] fmla v18.4s, v2.4s, v6.s[1] fmla v20.4s, v2.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v2.4s, v7.s[0] fmla v26.4s, v2.4s, v7.s[1] fmla v28.4s, v2.4s, v7.s[2] @@ -436,17 +456,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 + ldr q0, [pA], #16 + ldp q4, q5, [pB], #32 fmla v16.4s, v0.4s, v4.s[0] fmla v18.4s, v0.4s, v4.s[1] fmla v20.4s, v0.4s, v4.s[2] fmla v22.4s, v0.4s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.4s, v0.4s, v5.s[0] fmla v26.4s, v0.4s, v5.s[1] fmla v28.4s, v0.4s, v5.s[2] @@ -456,49 +473,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x8 add pCRow1, pCRow0, LDC - ld1 {v0.4s}, [pCRow0] + ldr q0, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - st1 {v0.4s}, [pCRow0] + str q0, [pCRow0] add pCRow2, pCRow1, LDC - ld1 {v2.4s}, [pCRow1] + ldr q2, [pCRow1] fmla v2.4s, v18.4s, alphaV2 - st1 {v2.4s}, [pCRow1] + str q2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.4s}, [pCRow2] + ldr q4, [pCRow2] fmla v4.4s, v20.4s, alphaV0 - st1 {v4.4s}, [pCRow2] + str q4, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v6.4s}, [pCRow1] + ldr q6, [pCRow1] fmla v6.4s, v22.4s, alphaV2 - st1 {v6.4s}, [pCRow1] + str q6, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v0.4s}, [pCRow2] + ldr q0, [pCRow2] fmla v0.4s, v24.4s, alphaV0 - st1 {v0.4s}, [pCRow2] + str q0, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v2.4s}, [pCRow1] + ldr q2, [pCRow1] fmla v2.4s, v26.4s, alphaV2 - st1 {v2.4s}, [pCRow1] + str q2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.4s}, [pCRow2] + ldr q4, [pCRow2] fmla v4.4s, v28.4s, alphaV0 - st1 {v4.4s}, [pCRow2] + str q4, [pCRow2] - ld1 {v6.4s}, [pCRow1] + ldr q6, [pCRow1] fmla v6.4s, v30.4s, alphaV2 - st1 
{v6.4s}, [pCRow1] + str q6, [pCRow1] add pCRow0, pCRow0, #16 .endm @@ -517,17 +534,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldr d0, [pA], #8 + ldp q4, q5, [pB], #32 fmla v16.2s, v0.2s, v4.s[0] fmla v18.2s, v0.2s, v4.s[1] fmla v20.2s, v0.2s, v4.s[2] fmla v22.2s, v0.2s, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla v24.2s, v0.2s, v5.s[0] fmla v26.2s, v0.2s, v5.s[1] fmla v28.2s, v0.2s, v5.s[2] @@ -537,49 +551,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x8 add pCRow1, pCRow0, LDC - ld1 {v0.2s}, [pCRow0] + ldr d0, [pCRow0] fmla v0.2s, v16.2s, alphaV0 - st1 {v0.2s}, [pCRow0] + str d0, [pCRow0] add pCRow2, pCRow1, LDC - ld1 {v2.2s}, [pCRow1] + ldr d2, [pCRow1] fmla v2.2s, v18.2s, alphaV2 - st1 {v2.2s}, [pCRow1] + str d2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.2s}, [pCRow2] + ldr d4, [pCRow2] fmla v4.2s, v20.2s, alphaV0 - st1 {v4.2s}, [pCRow2] + str d4, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v6.2s}, [pCRow1] + ldr d6, [pCRow1] fmla v6.2s, v22.2s, alphaV2 - st1 {v6.2s}, [pCRow1] + str d6, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v0.2s}, [pCRow2] + ldr d0, [pCRow2] fmla v0.2s, v24.2s, alphaV0 - st1 {v0.2s}, [pCRow2] + str d0, [pCRow2] add pCRow2, pCRow1, LDC - ld1 {v2.2s}, [pCRow1] + ldr d2, [pCRow1] fmla v2.2s, v26.2s, alphaV2 - st1 {v2.2s}, [pCRow1] + str d2, [pCRow1] add pCRow1, pCRow2, LDC - ld1 {v4.2s}, [pCRow2] + ldr d4, [pCRow2] fmla v4.2s, v28.2s, alphaV0 - st1 {v4.2s}, [pCRow2] + str d4, [pCRow2] - ld1 {v6.2s}, [pCRow1] + ldr d6, [pCRow1] fmla v6.2s, v30.2s, alphaV2 - st1 {v6.2s}, [pCRow1] + str d6, [pCRow1] add pCRow0, pCRow0, #8 .endm @@ -598,17 +612,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL1x8_SUB - ld1 {v4.4s}, [pB] - add pB, pB, #16 - ld1 {v5.4s}, [pB] - add pB, pB, #16 - ldr s0, [pA] - add pA, pA, #4 + ldp q4, q5, [pB], #32 + ldr s0, [pA], #4 fmla s16, s0, v4.s[0] fmla s18, s0, v4.s[1] fmla s20, s0, v4.s[2] fmla s22, s0, v4.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla s24, s0, v5.s[0] fmla s26, s0, v5.s[1] fmla s28, s0, v5.s[2] @@ -620,47 +631,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pCRow0] fmla s0, s16, alphaV0 - str s0, [pCRow0] + str s0, [pCRow0] add pCRow2, pCRow1, LDC ldr s2, [pCRow1] fmla s2, s18, alphaV2 - str s2, [pCRow1] + str s2, [pCRow1] add pCRow1, pCRow2, LDC ldr s4, [pCRow2] fmla s4, s20, alphaV0 - str s4, [pCRow2] + str s4, [pCRow2] add pCRow2, pCRow1, LDC ldr s6, [pCRow1] fmla s6, s22, alphaV2 - str s6, [pCRow1] + str s6, [pCRow1] add pCRow1, pCRow2, LDC ldr s0, [pCRow2] fmla s0, s24, alphaV0 - str s0, [pCRow2] + str s0, [pCRow2] add pCRow2, pCRow1, LDC ldr s2, [pCRow1] fmla s2, s26, alphaV2 - str s2, [pCRow1] + str s2, [pCRow1] add pCRow1, pCRow2, LDC ldr s4, [pCRow2] fmla s4, s28, alphaV0 - str s4, [pCRow2] + str s4, [pCRow2] ldr s6, [pCRow1] fmla s6, s30, alphaV2 - str s6, [pCRow1] + str s6, [pCRow1] add pCRow0, pCRow0, #4 .endm @@ -679,118 +690,137 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v8.4s}, [pB], #16 - ld1 {v0.4s, v1.4s}, [pA], #32 + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 - ldr d9, [pB], #8 ldr d2, [pA], #8 + ldr d6, [pB], #8 ldr d3, [pA, #8] - fmul v16.4s, v0.4s, v8.s[0] - ldr x25, [pB], #8 - fmul v17.4s, v1.4s, v8.s[0] + fmul v16.4s, v0.4s, v4.s[0] ldr x22, [pA], #16 - fmul v20.4s, v0.4s, v8.s[1] + fmul v17.4s, v1.4s, v4.s[0] + ldr x26, [pB], #8 + fmul v18.4s, v0.4s, v4.s[1] ldr x23, [pA], #8 - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - fmul v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.s[3] - fmul v29.4s, v1.4s, v8.s[3] + fmul v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] .endm .macro KERNEL8x4_M1 - ldr d9, [pB], #8 - fmov v8.d[1], x24 ldr d2, [pA], #8 fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 ldr d3, [pA, #8] fmov v1.d[1], x21 - fmla v16.4s, v0.4s, v8.s[0] - ldr x25, [pB], #8 - fmla v17.4s, v1.4s, v8.s[0] ldr x22, [pA], #16 - fmla v20.4s, v0.4s, v8.s[1] + fmla v16.4s, v0.4s, v4.s[0] + ldr x26, [pB], #8 + fmla v17.4s, v1.4s, v4.s[0] ldr x23, [pA], #8 - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v8.s[2] - fmla v25.4s, v1.4s, v8.s[2] - fmla v28.4s, v0.4s, v8.s[3] - fmla v29.4s, v1.4s, v8.s[3] + fmla v18.4s, v0.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] .endm .macro KERNEL8x4_M2 - ldr d8, [pB], #8 - fmov v9.d[1], x25 ldr d0, [pA], #8 fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 ldr d1, [pA, #8] fmov v3.d[1], x23 - fmla v16.4s, v2.4s, v9.s[0] - ldr x24, [pB], #8 - fmla v17.4s, v3.4s, v9.s[0] ldr x20, [pA], #16 - fmla v20.4s, v2.4s, v9.s[1] + fmla v16.4s, v2.4s, v6.s[0] + ldr x24, [pB], #8 + fmla v17.4s, v3.4s, v6.s[0] ldr x21, [pA], #8 - fmla v21.4s, v3.4s, v9.s[1] - fmla 
v24.4s, v2.4s, v9.s[2] - fmla v25.4s, v3.4s, v9.s[2] - fmla v28.4s, v2.4s, v9.s[3] - fmla v29.4s, v3.4s, v9.s[3] + fmla v18.4s, v2.4s, v6.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] .endm .macro KERNEL8x4_E - fmov v9.d[1], x25 fmov v2.d[1], x22 + fmov v6.d[1], x26 fmov v3.d[1], x23 - fmla v16.4s, v2.4s, v9.s[0] - fmla v17.4s, v3.4s, v9.s[0] - fmla v20.4s, v2.4s, v9.s[1] - fmla v21.4s, v3.4s, v9.s[1] - fmla v24.4s, v2.4s, v9.s[2] - fmla v25.4s, v3.4s, v9.s[2] - fmla v28.4s, v2.4s, v9.s[3] - fmla v29.4s, v3.4s, v9.s[3] + fmla v16.4s, v2.4s, v6.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x4_SUB - ld1 {v8.4s}, [pB], #16 - ld1 {v0.4s, v1.4s}, [pA], #32 - fmla v16.4s, v0.4s, v8.s[0] - fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v8.s[2] - fmla v25.4s, v1.4s, v8.s[2] - fmla v28.4s, v0.4s, v8.s[3] - fmla v29.4s, v1.4s, v8.s[3] + ldp q0, q1, [pA], #32 + ldr q4, [pB], #16 + + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] .endm .macro SAVE8x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 
fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - ld1 {v0.4s, v1.4s}, [pCRow2] - fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] + ldp q0, q1, [pCRow2] + fmla v0.4s, v20.4s, alphaV0 + fmla v1.4s, v21.4s, alphaV1 + stp q0, q1, [pCRow2] - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] + fmla v4.4s, v22.4s, alphaV0 + fmla v5.4s, v23.4s, alphaV1 + stp q4, q5, [pCRow1] add pCRow0, pCRow0, #32 .endm @@ -800,139 +830,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s18, wzr + fmov s20, wzr + fmov s22, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 - - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] - - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] - - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] - - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] + ldr q0, [pA], #16 + ldr q4, [pB], #16 - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr d2, [pA], #8 + ldr d6, [pB], #8 + fmul v16.4s, v0.4s, v4.s[0] + ldr x22, [pA], #8 + fmul v18.4s, v0.4s, v4.s[1] + ldr x26, [pB], #8 + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] - - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - prfm PLDL1KEEP, [pB, #512] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr d2, [pA], #8 + fmov v0.d[1], x20 + ldr d6, [pB], #8 + fmov v4.d[1], x24 + ldr x22, [pA], #8 + ldr x26, [pB], #8 + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + 
ldr d0, [pA], #8 + fmov v2.d[1], x22 + ldr d4, [pB], #8 + fmov v6.d[1], x26 + ldr x20, [pA], #8 + ldr x24, [pB], #8 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmov v2.d[1], x22 + fmov v6.d[1], x26 + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] .endm .macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 - - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 + ldr q4, [pB], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] .endm .macro SAVE4x4 - ld1 {v8.2s, v9.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] add pCRow1, pCRow0, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q1, [pCRow1] + fmla v1.4s, v18.4s, alphaV2 + str q1, [pCRow1] add pCRow2, pCRow1, LDC - ld1 {v8.2s, v9.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV0 - fmla v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] + ldr q2, [pCRow2] + fmla v2.4s, v20.4s, alphaV0 + str q2, [pCRow2] add pCRow1, pCRow2, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV2 - fmla v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + 
ldr q3, [pCRow1] + fmla v3.4s, v22.4s, alphaV2 + str q3, [pCRow1] add pCRow0, pCRow0, #16 .endm @@ -941,42 +921,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 fmov s16, wzr - fmov s20, s16 - fmov s24, s20 - fmov s28, s16 + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 .endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldr d0, [pA], #8 + ldr q4, [pB], #16 - fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] .endm .macro SAVE2x4 - ld1 {v8.2s}, [pCRow0] + ldr d8, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + str d8, [pCRow0] add pCRow1, pCRow0, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] + ldr d12, [pCRow1] + fmla v12.2s, v18.2s, alphaV1 + str d12, [pCRow1] add pCRow2, pCRow1, LDC - ld1 {v8.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] + ldr d8, [pCRow2] + fmla v8.2s, v20.2s, alphaV2 + str d8, [pCRow2] add pCRow1, pCRow2, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + ldr d12, [pCRow1] + fmla v12.2s, v22.2s, alphaV3 + str d12, [pCRow1] add pCRow0, pCRow0, #8 .endm @@ -1023,39 +1001,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT8x2 fmov s16, wzr fmov s17, s16 - fmov s20, s17 - fmov s21, s16 + fmov s18, s17 + fmov s19, s16 .endm .macro KERNEL8x2_SUB - ld1 {v8.2s}, [pB] - add pB, pB, #8 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - - fmla v16.4s, v0.4s, v8.s[0] - fmla v17.4s, v1.4s, v8.s[0] + ldp q0, q1, [pA], #32 + ldr d4, [pB], #8 - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] .endm .macro SAVE8x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + ldp q4, q5, [pCRow1] + fmla v4.4s, v18.4s, alphaV0 + fmla v5.4s, v19.4s, alphaV1 + stp q4, q5, [pCRow1] add pCRow0, pCRow0, #32 .endm @@ -1162,23 +1138,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x1_SUB - ldr s8, [pB] - add pB , pB, #4 + ldr s4, [pB], #4 + ldp q0, q1, [pA], #32 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - - fmla v16.4s, v0.4s, v8.s[0] - fmla v17.4s, v1.4s, v8.s[0] + fmla v16.4s, v0.4s, v4.s[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v17.4s, v1.4s, v4.s[0] .endm .macro SAVE8x1 - ld1 {v0.4s, v1.4s}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 .endm @@ -1247,13 +1221,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr s0, [pA] add pA , pA, #4 - fmadd s16, s0, s8, s16 + fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 - ldr s8, [pCRow0] + ldr s8, [pCRow0] fmla s8, s16, alphaV0 - str s8, [pCRow0] + str s8, [pCRow0] add pCRow0, pCRow0, #4 .endm @@ -1290,8 +1264,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB mov counterJ, origN - asr counterJ, counterJ, #3 // J = J / 8 - cmp counterJ, #0 + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 ble .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ @@ -1308,15 +1282,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L8_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 // counterI = counterI / 8 - cmp counterI, #0 + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 ble .Lsgemm_kernel_L8_M4_BEGIN .Lsgemm_kernel_L8_M8_20: mov pB, origPB - asr counterL , origK, #3 // L = K / 8 + asr counterL , origK, #3 // L = K / 8 cmp counterL , #2 // is there at least 16 to do? blt .Lsgemm_kernel_L8_M8_32 @@ -1415,7 +1389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #1 // L = K / 2 + asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt .Lsgemm_kernel_L8_M4_32 @@ -1487,7 +1461,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L8_M2_40 @@ -1538,7 +1512,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L8_M1_40 @@ -1603,15 +1577,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 // counterI = counterI / 8 - cmp counterI, #0 + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 ble .Lsgemm_kernel_L4_M4_BEGIN .Lsgemm_kernel_L4_M8_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 + asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt .Lsgemm_kernel_L4_M8_32 @@ -1683,7 +1657,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #1 // L = K / 2 + asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? blt .Lsgemm_kernel_L4_M4_32 @@ -1755,7 +1729,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L4_M2_40 @@ -1806,7 +1780,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L4_M1_40 @@ -1867,7 +1841,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 // counterI = counterI / 8 + asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 ble .Lsgemm_kernel_L2_M4_BEGIN @@ -2041,7 +2015,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble .Lsgemm_kernel_L2_M1_40 @@ -2100,7 +2074,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM - asr counterI, counterI, #3 + asr counterI, counterI, #3 cmp counterI, #0 ble .Lsgemm_kernel_L1_M4_BEGIN @@ -2223,7 +2197,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L1_M2_40 @@ -2274,7 +2248,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lsgemm_kernel_L1_M1_40 From 9b7877ccf1bd77a24adacd79c3b91addc86d2408 Mon Sep 17 00:00:00 2001 From: ZhangDanfeng <467688405@qq.com> Date: Thu, 4 Jun 2020 02:09:38 +0800 Subject: [PATCH 222/593] sgemm copy source init Signed-off-by: ZhangDanfeng <467688405@qq.com> --- kernel/arm64/KERNEL.CORTEXA53 | 9 +- kernel/arm64/sgemm_ncopy_8.S | 562 +++++++++++++++++++++++++++ kernel/arm64/sgemm_tcopy_8.S | 707 ++++++++++++++++++++++++++++++++++ 3 files changed, 1270 insertions(+), 8 deletions(-) create mode 100644 kernel/arm64/sgemm_ncopy_8.S create mode 100644 kernel/arm64/sgemm_tcopy_8.S diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index 4219acf98..eba38a92e 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -126,16 +126,9 @@ endif SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) endif -ifeq ($(SGEMM_UNROLL_N), 16) + SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -else -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -endif -ifeq ($(SGEMM_UNROLL_N), 4) 
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -else -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S new file mode 100644 index 000000000..f99b1d992 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -0,0 +1,562 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 +#define A05 x9 +#define A06 x10 +#define A07 x11 +#define A08 x12 + +#define I x13 +#define J x14 +#define K x15 + +#define TEMP1 x16 +#define TEMP2 x17 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x8 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v12.s[0], v0.s[2] + ins v14.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + ins v12.s[1], v1.s[2] + ins v14.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v12.s[2], v2.s[2] + ins v14.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + ins v12.s[3], v3.s[2] + ins v14.s[3], v3.s[3] + + ldr q4, [A05], #16 + ldr q5, [A06], #16 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins 
v13.s[0], v4.s[2] + ins v15.s[0], v4.s[3] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + ins v13.s[1], v5.s[2] + ins v15.s[1], v5.s[3] + + ldr q6, [A07], #16 + ldr q7, [A08], #16 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v13.s[2], v6.s[2] + ins v15.s[2], v6.s[3] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + ins v13.s[3], v7.s[2] + ins v15.s[3], v7.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 +.endm + +.macro COPY2x8 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v10.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v10.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v10.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v10.s[3], v3.s[1] + + ldr d4, [A05], #8 + ldr d5, [A06], #8 + ins v9.s[0], v4.s[0] + ins v11.s[0], v4.s[1] + ins v9.s[1], v5.s[0] + ins v11.s[1], v5.s[1] + + ldr d6, [A07], #8 + ldr d7, [A08], #8 + ins v9.s[2], v6.s[0] + ins v11.s[2], v6.s[1] + ins v9.s[3], v7.s[0] + ins v11.s[3], v7.s[1] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY1x8 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + ldr s4, [A05], #4 + ldr s5, [A06], #4 + ins v9.s[0], v4.s[0] + ins v9.s[1], v5.s[0] + + ldr s6, [A07], #4 + ldr s7, [A08], #4 + ins v9.s[2], v6.s[0] + ins v9.s[3], v7.s[0] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY4x4 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ldr q3, [A04], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 
{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 +.endm + +.macro COPY2x4 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + ldr d2, [A03], #8 + ldr d3, [A04], #8 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + + st1 {v8.4s, v9.4s}, [B00], #32 +.endm + +.macro COPY1x4 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + ldr s2, [A03], #4 + ldr s3, [A04], #4 + ins v8.s[2], v2.s[0] + ins v8.s[3], v3.s[0] + + st1 {v8.4s}, [B00], #16 +.endm + +.macro COPY4x2 + ldr q0, [A01], #16 + ldr q1, [A02], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 +.endm + +.macro COPY2x2 + ldr d0, [A01], #8 + ldr d1, [A02], #8 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + + st1 {v8.2s, v9.2s}, [B00], #16 +.endm + +.macro COPY1x2 + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ins v8.s[0], v0.s[0] + ins v8.s[1], v1.s[0] + + st1 {v8.2s}, [B00], #8 +.endm + +.macro COPY1x1 + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Lsgemm_ncopy_L8_BEGIN: + + asr J, N, #3 // J = N / 8 + cmp J, #0 + ble .Lsgemm_ncopy_L4_BEGIN + + .align 5 +.Lsgemm_ncopy_L8_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A00, A08, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_40 + + asr K, M, #4 // 
K = M / 16(cacheline) + mov TEMP1, A01 + + .align 5 +.Lsgemm_tcopy_L8_warnup_1: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_1 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A02 + + .align 5 +.Lsgemm_tcopy_L8_warnup_2: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_2 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A03 + + .align 5 +.Lsgemm_tcopy_L8_warnup_3: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_3 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A04 + + .align 5 +.Lsgemm_tcopy_L8_warnup_4: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_4 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A05 + + .align 5 +.Lsgemm_tcopy_L8_warnup_5: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_5 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A06 + + .align 5 +.Lsgemm_tcopy_L8_warnup_6: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_6 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A07 + + .align 5 +.Lsgemm_tcopy_L8_warnup_7: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_7 + + asr K, M, #4 // K = M / 16(cacheline) + mov TEMP1, A08 + + .align 5 +.Lsgemm_tcopy_L8_warnup_8: + + ldr s0, [TEMP1], #64 + + subs K, K, #1 + bgt .Lsgemm_tcopy_L8_warnup_8 + + .align 5 +.Lsgemm_ncopy_L8_M4_20: + + COPY4x8 + + subs I, I, #1 + bne .Lsgemm_ncopy_L8_M4_20 + +.Lsgemm_ncopy_L8_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_60 + + COPY2x8 + +.Lsgemm_ncopy_L8_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L8_M4_END + + COPY1x8 + +.Lsgemm_ncopy_L8_M4_END: + + subs J , J, #1 // j-- + bne .Lsgemm_ncopy_L8_M4_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L4_BEGIN: + + tst N, #7 + ble .Lsgemm_ncopy_L999 + + tst N, #4 + ble .Lsgemm_ncopy_L2_BEGIN + +.Lsgemm_ncopy_L4_M4_BEGIN: + 
mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_40 + + .align 5 +.Lsgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I, I, #1 + bne .Lsgemm_ncopy_L4_M4_20 + +.Lsgemm_ncopy_L4_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_60 + + COPY2x4 + +.Lsgemm_ncopy_L4_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L4_M4_END + + COPY1x4 + +.Lsgemm_ncopy_L4_M4_END: + + +/*********************************************************************************************/ + +.Lsgemm_ncopy_L2_BEGIN: + + tst N, #2 + ble .Lsgemm_ncopy_L1_BEGIN + +.Lsgemm_ncopy_L2_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_40 + + .align 5 +.Lsgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Lsgemm_ncopy_L2_M4_20 + + +.Lsgemm_ncopy_L2_M4_40: + + and I, M, #2 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_60 + + COPY2x2 + +.Lsgemm_ncopy_L2_M4_60: + + and I, M, #1 + cmp I, #0 + ble .Lsgemm_ncopy_L2_M4_END + + COPY1x2 + +.Lsgemm_ncopy_L2_M4_END: + +.Lsgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Lsgemm_ncopy_L999 + +.Lsgemm_ncopy_L1_M1_BEGIN: + + mov A01, A00 + + mov I, M + cmp I, #0 + ble .Lsgemm_ncopy_L1_M1_END + + .align 5 +.Lsgemm_ncopy_L1_M1_20: + + COPY1x1 + + subs I, I, #1 + bne .Lsgemm_ncopy_L1_M1_20 + +.Lsgemm_ncopy_L1_M1_END: + +.Lsgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_tcopy_8.S b/kernel/arm64/sgemm_tcopy_8.S new file mode 100644 index 000000000..7d81ba266 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_8.S @@ -0,0 +1,707 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x18 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) // reserve an 11 * 16 = 176 byte spill area + stp d8, d9, [sp, #(0 * 16)] // slots 0-4: d8-d17 + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] // NOTE(review): x18 (aliased as I) is the AAPCS64 platform register - confirm every target OS permits using it + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] // slot 10 holds x28 alone +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] // reload from the same slots SAVE_REGS filled + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) // release the spill area +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch each of the 8 source rows A_PREFETCH bytes ahead + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add
A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] // 8 floats from A01 then 8 from A02, stored at B00 + add TEMP1, B00, #64 // TEMP1 walks the following 64-byte groups; B00 itself is stepped once at the end + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 // TEMP1 is not read again inside this macro + + add B00, B00, M8 // step B00 by M8 bytes for the next COPY8x8 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] // 4 floats from each of A01-A04 + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] // 4-column groups are written through B01 + add B01, B01, #64 + + ldr q4, [A05] // same again for A05-A08 + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] // 2 floats from each of A01-A04 + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] // 2-column groups are written through B02 + add B02, B02, #16 + stp d2, d3, [B02] + add B02, B02, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 +
add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B02] + add B02, B02, #16 + stp d6, d7, [B02] + add B02, B02, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] // one float from each of A01-A04 + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] // single-column groups are written through B03 + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + + ldr s4, [A05] // one float from each of A05-A08 + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + add A05, A05, #4 // advance by one float like A01-A04; the former 8-byte post-indexed reload read past the element + add A06, A06, #4 + add A07, A07, #4 + add A08, A08, #4 + + stp s4, s5, [B03] + add B03, B03, #8 + stp s6, s7, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + + add B01, B01, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01,
#A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] // 2 floats from each of A01-A04 + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B02] // 2-column groups are written through B02 + add B02, B02, #16 + stp d2, d3, [B02] + + add B02, B02, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] // one float from each of A01-A04 + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B03] // single-column groups are written through B03 + add B03, B03, #8 + stp s2, s3, [B03] + add B03, B03, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] // 8 floats from A01, then 8 from A02 + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 // step B00 by M8 bytes for the next COPY8x2 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B01] // 4-column groups are written through B01 + add B01, B01, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B02] + add B02, B02, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B03] + + add B03, B03, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH]
+ + ldp q0, q1, [A01] // 8 floats from the single row A01 + add A01, A01, #32 + stp q0, q1, [B00] + + add B00, B00, M8 // step B00 by M8 bytes for the next COPY8x1 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B01] + + add B01, B01, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B02] + + add B02, B02, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B03] + + add B03, B03, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-8 // N rounded down to a multiple of 8 + and B02 , N , #-4 // N rounded down to a multiple of 4 + and B03 , N , #-2 // N rounded down to a multiple of 2 + + mul B01, B01, TEMP1 // scale each rounded column count by M * SIZE bytes + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + + add B01 , B01, B // B01/B02/B03 = B plus that offset: write cursors for the 4-, 2- and 1-column remainders + add B02 , B02, B + add B03 , B03, B + + lsl M8, M, #5 // M8 = M * 8 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M8_BEGIN: + + mov A01, A // A01-A08 = eight consecutive rows, LDA bytes apart + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA // A now points at the row after this 8-row group + + mov B00, B + add B, B00, #256 // B = B + 8 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M8_40 + + .align 5 +.Lsgemm_tcopy_L8_M8_20: + + COPY8x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M8_20 + +.Lsgemm_tcopy_L8_M8_40: + + tst N , #4 // 4 leftover columns? + ble .Lsgemm_tcopy_L8_M8_60 + + COPY4x8 + +.Lsgemm_tcopy_L8_M8_60: + + tst N , #2 // 2 leftover columns? + ble .Lsgemm_tcopy_L8_M8_80 + + COPY2x8 + +.Lsgemm_tcopy_L8_M8_80: + + tst N, #1 // 1 leftover column? + ble .Lsgemm_tcopy_L8_M8_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M8_END: + + subs J, J, #1 // j-- + bne .Lsgemm_tcopy_L8_M8_BEGIN +
+/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + + tst M, #7 // any rows left after the 8-row groups? + ble .Lsgemm_tcopy_L999 // none left (tst leaves N = V = 0 for this mask, so ble behaves as beq) + + tst M, #4 // a 4-row group remains? + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M8_BEGIN: + + mov A01, A // A01-A04 = four consecutive rows, LDA bytes apart + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #128 // B = B + 4 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M8_40 + + .align 5 +.Lsgemm_tcopy_L4_M8_20: + + COPY8x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M8_20 + +.Lsgemm_tcopy_L4_M8_40: + + tst N , #4 // 4 leftover columns? + ble .Lsgemm_tcopy_L4_M8_60 + + COPY4x4 + +.Lsgemm_tcopy_L4_M8_60: + + tst N , #2 // 2 leftover columns? + ble .Lsgemm_tcopy_L4_M8_80 + + COPY2x4 + +.Lsgemm_tcopy_L4_M8_80: + + tst N , #1 // 1 leftover column? + ble .Lsgemm_tcopy_L4_M8_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 // any rows left after the 4-row group? + ble .Lsgemm_tcopy_L999 + + tst M, #2 // a 2-row group remains? + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #64 // B = B + 2 * 8 * SIZE + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M8_40 + + .align 5 +.Lsgemm_tcopy_L2_M8_20: + + COPY8x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M8_20 + +.Lsgemm_tcopy_L2_M8_40: + + tst N , #4 + ble .Lsgemm_tcopy_L2_M8_60 + + COPY4x2 + +.Lsgemm_tcopy_L2_M8_60: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M8_80 + + COPY2x2 + +.Lsgemm_tcopy_L2_M8_80: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M8_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M8_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 // one last row? + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #3 // I = N / 8 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M8_40 + + .align 5 +.Lsgemm_tcopy_L1_M8_20: + + COPY8x1 + +
subs I , I , #1 + bne .Lsgemm_tcopy_L1_M8_20 + +.Lsgemm_tcopy_L1_M8_40: + + tst N , #4 // 4 leftover columns? + ble .Lsgemm_tcopy_L1_M8_60 + + COPY4x1 + +.Lsgemm_tcopy_L1_M8_60: + + tst N , #2 // 2 leftover columns? + ble .Lsgemm_tcopy_L1_M8_80 + + COPY2x1 + +.Lsgemm_tcopy_L1_M8_80: + + tst N , #1 // 1 leftover column? + ble .Lsgemm_tcopy_L1_M8_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M8_END: + +.Lsgemm_tcopy_L999: + + mov x0, #0 // set return value + RESTORE_REGS // reload the registers spilled by SAVE_REGS and release the stack area + ret + + EPILOGUE From 6e97df7b478fa912e4a4488050c7837e7943be3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Jun 2020 14:45:31 +0200 Subject: [PATCH 223/593] Add CMAKE support for MAX_STACK_ALLOC setting --- cmake/system.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 65e5aa508..61e73fb71 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -297,6 +297,14 @@ if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () +if (DEFINED MAX_STACK_ALLOC) +if (NOT ${MAX_STACK_ALLOC} EQUAL 0) +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") +endif () +else () +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") +endif () + if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () From f1953b8b814621784e5ac3dc0761dcf4e7bb3891 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Jun 2020 17:58:13 +0200 Subject: [PATCH 224/593] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..864ffbfe0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -60,7 +60,7 @@ jobs: cmakeArgs: '-G "Visual Studio 16 2019" ..' - task: CMake@1 inputs: - cmakeArgs: '--build . --config Release' + cmakeArgs: '--build .
--config Release -- /verbosity:detailed' workingDirectory: 'build' - script: | cd build From 32c1c1e12512371e6435eebb0d1ad149e18bef9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Jun 2020 19:03:46 +0200 Subject: [PATCH 225/593] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 864ffbfe0..639cb3558 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -60,7 +60,7 @@ jobs: cmakeArgs: '-G "Visual Studio 16 2019" ..' - task: CMake@1 inputs: - cmakeArgs: '--build . --config Release -- /verbosity:detailed' + cmakeArgs: '--build . --config Release' workingDirectory: 'build' - script: | cd build From bb12c2c8541bc97f20677be995ea7d2f5df30355 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Jun 2020 19:07:27 +0200 Subject: [PATCH 226/593] Limit MAX_STACK_ALLOC availability to non-Windows --- cmake/system.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 61e73fb71..7e7f726c5 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -297,6 +297,7 @@ if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () +if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows") # only pass MAX_STACK_ALLOC on non-Windows; quoted so an empty value cannot break the comparison if (DEFINED MAX_STACK_ALLOC) if (NOT ${MAX_STACK_ALLOC} EQUAL 0) set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") @@ -304,6 +305,7 @@ endif () else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () +endif () if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") From 0464e662ad1257c9624170cc332ab8edc5906acb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 5 Jun 2020 10:03:36 +0200 Subject: [PATCH 227/593] make blas_quickdivide unsigned and guard against miscompilation --- common_x86_64.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index
0247674cd..15d0c30aa 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ #endif do { - while (*address) {YIELDING;}; + while (*address) {YIELDING;} #ifndef C_MSVC __asm__ __volatile__( @@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ #else extern unsigned int blas_quick_divide_table[]; -static __inline int blas_quickdivide(unsigned int x, unsigned int y){ +static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ - unsigned int result; + volatile unsigned int result; if (y <= 1) return x; @@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); - return result; } #endif From 7f60fb6b91e1b9d4af39ae7b05717aea374bcee5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 5 Jun 2020 10:04:16 +0200 Subject: [PATCH 228/593] Delete spurious copy of common_param.h --- kernel/common_param.h | 1403 ----------------------------------------- 1 file changed, 1403 deletions(-) delete mode 100644 kernel/common_param.h diff --git a/kernel/common_param.h b/kernel/common_param.h deleted file mode 100644 index 29bb65e5c..000000000 --- a/kernel/common_param.h +++ /dev/null @@ -1,1403 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. 
Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#ifndef COMMON_PARAM_H -#define COMMON_PARAM_H - -#ifndef ASSEMBLER - -#ifdef DYNAMIC_ARCH - -typedef struct { - int dtb_entries; - int offsetA, offsetB, align; - -#if 1 - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; - - float (*shamax_k) (BLASLONG, float *, BLASLONG); - float (*shamin_k) (BLASLONG, float *, BLASLONG); - float (*shmax_k) (BLASLONG, float *, BLASLONG); - float (*shmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); - - float (*shnrm2_k) (BLASLONG, float *, BLASLONG); - float (*shasum_k) (BLASLONG, float *, BLASLONG); - float (*shsum_k) (BLASLONG, float *, BLASLONG); - int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, 
BLASLONG, float *, BLASLONG, float *); - int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - - int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int 
(*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int 
(*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - -#endif - int sgemm_p, sgemm_q, sgemm_r; - int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; - - int exclusive_cache; - - float (*samax_k) (BLASLONG, float *, BLASLONG); - float (*samin_k) (BLASLONG, float *, BLASLONG); - float (*smax_k) (BLASLONG, float *, BLASLONG); - float (*smin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - - float (*snrm2_k) (BLASLONG, float *, BLASLONG); - float (*sasum_k) (BLASLONG, float *, BLASLONG); - float (*ssum_k) (BLASLONG, float *, BLASLONG); - int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*saxpy_k) 
(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); - int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - - int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iunncopy)(BLASLONG, BLASLONG, float 
*, BLASLONG, BLASLONG, float *); - int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float *); - int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int dgemm_p, dgemm_q, dgemm_r; - int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; - - double (*damax_k) (BLASLONG, double *, BLASLONG); - double (*damin_k) (BLASLONG, double *, BLASLONG); - double (*dmax_k) (BLASLONG, double *, BLASLONG); - double (*dmin_k) (BLASLONG, double *, BLASLONG); -BLASLONG 
(*idamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); - - double (*dnrm2_k) (BLASLONG, double *, BLASLONG); - double (*dasum_k) (BLASLONG, double *, BLASLONG); - double (*dsum_k) (BLASLONG, double *, BLASLONG); - int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); - int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double 
*); - int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, 
BLASLONG, BLASLONG); - int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int 
(*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int qgemm_p, qgemm_q, qgemm_r; - int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; - - xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); - - xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, 
BLASLONG, xdouble *); - int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, 
BLASLONG, xdouble *); - int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltncopy)(BLASLONG, 
BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - int cgemm_p, cgemm_q, cgemm_r; - int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - - float (*camax_k) (BLASLONG, float *, BLASLONG); - float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); - - float (*cnrm2_k) (BLASLONG, float *, BLASLONG); - float (*casum_k) (BLASLONG, float *, BLASLONG); - float (*csum_k) (BLASLONG, float *, BLASLONG); - int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, 
BLASLONG); - openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, 
float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, 
BLASLONG); - int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltncopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float *); - int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int cgemm3m_p, cgemm3m_q, cgemm3m_r; - int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; - - int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - - int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int 
(*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - - int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, 
BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int zgemm_p, zgemm_q, zgemm_r; - int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; - - double (*zamax_k) (BLASLONG, double *, BLASLONG); - double (*zamin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); - - double (*znrm2_k) (BLASLONG, double *, BLASLONG); - double (*zasum_k) (BLASLONG, double *, BLASLONG); - double (*zsum_k) (BLASLONG, double *, BLASLONG); - int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, 
double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zsymv_L) (BLASLONG, BLASLONG, double, 
double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, 
BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, 
double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outncopy)(BLASLONG, BLASLONG, 
double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int zgemm3m_p, zgemm3m_q, zgemm3m_r; - int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; - - int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - - int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, 
double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - - int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyr)(BLASLONG, 
BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int xgemm_p, xgemm_q, xgemm_r; - int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; - - xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); - - xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, 
xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, 
BLASLONG, xdouble *); - int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int 
(*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, 
BLASLONG, xdouble *); - int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int 
(*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int xgemm3m_p, xgemm3m_q, xgemm3m_r; - int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; - - int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - - int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, 
BLASLONG, xdouble *); - int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int 
(*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - - void (*init)(void); - - int snum_opt, dnum_opt, qnum_opt; - - int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); - int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); - int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, 
float*, BLASLONG); - int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); - - int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - - int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - - int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, 
BLASLONG, double*, BLASLONG); - - int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); - - int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); - - int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int 
(*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); - int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); - int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); - int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - -} gotoblas_t; - -extern gotoblas_t *gotoblas; - -#define DTB_ENTRIES gotoblas -> dtb_entries -#define GEMM_OFFSET_A gotoblas -> offsetA -#define GEMM_OFFSET_B gotoblas -> offsetB -#define GEMM_ALIGN gotoblas -> align - -#define HAVE_EX_L2 gotoblas -> exclusive_cache - -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn - -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn - -#define DGEMM_P gotoblas -> dgemm_p -#define DGEMM_Q gotoblas -> dgemm_q -#define DGEMM_R gotoblas -> dgemm_r -#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m -#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n -#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn - -#define QGEMM_P gotoblas -> qgemm_p -#define QGEMM_Q gotoblas -> qgemm_q -#define QGEMM_R gotoblas -> qgemm_r -#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m -#define 
QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n -#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn - -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn - -#define ZGEMM_P gotoblas -> zgemm_p -#define ZGEMM_Q gotoblas -> zgemm_q -#define ZGEMM_R gotoblas -> zgemm_r -#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m -#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n -#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn - -#define XGEMM_P gotoblas -> xgemm_p -#define XGEMM_Q gotoblas -> xgemm_q -#define XGEMM_R gotoblas -> xgemm_r -#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m -#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n -#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn - -#define CGEMM3M_P gotoblas -> cgemm3m_p -#define CGEMM3M_Q gotoblas -> cgemm3m_q -#define CGEMM3M_R gotoblas -> cgemm3m_r -#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m -#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n -#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn - -#define ZGEMM3M_P gotoblas -> zgemm3m_p -#define ZGEMM3M_Q gotoblas -> zgemm3m_q -#define ZGEMM3M_R gotoblas -> zgemm3m_r -#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m -#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n -#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn - -#define XGEMM3M_P gotoblas -> xgemm3m_p -#define XGEMM3M_Q gotoblas -> xgemm3m_q -#define XGEMM3M_R gotoblas -> xgemm3m_r -#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m -#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n -#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn - -#else - -#define DTB_ENTRIES DTB_DEFAULT_ENTRIES - -#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A -#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B -#define GEMM_ALIGN GEMM_DEFAULT_ALIGN - -#ifdef HAVE_EXCLUSIVE_CACHE -#define 
HAVE_EX_L2 1 -#else -#define HAVE_EX_L2 0 -#endif - -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN -#else -#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) -#endif - -#define SGEMM_P SGEMM_DEFAULT_P -#define SGEMM_Q SGEMM_DEFAULT_Q -#define SGEMM_R SGEMM_DEFAULT_R -#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#ifdef SGEMM_DEFAULT_UNROLL_MN -#define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN -#else -#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) -#endif - -#define DGEMM_P DGEMM_DEFAULT_P -#define DGEMM_Q DGEMM_DEFAULT_Q -#define DGEMM_R DGEMM_DEFAULT_R -#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#ifdef DGEMM_DEFAULT_UNROLL_MN -#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN -#else -#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) -#endif - -#define QGEMM_P QGEMM_DEFAULT_P -#define QGEMM_Q QGEMM_DEFAULT_Q -#define QGEMM_R QGEMM_DEFAULT_R -#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - -#define CGEMM_P CGEMM_DEFAULT_P -#define CGEMM_Q CGEMM_DEFAULT_Q -#define CGEMM_R CGEMM_DEFAULT_R -#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#ifdef CGEMM_DEFAULT_UNROLL_MN -#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN -#else -#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) -#endif - -#define ZGEMM_P ZGEMM_DEFAULT_P -#define ZGEMM_Q ZGEMM_DEFAULT_Q -#define ZGEMM_R ZGEMM_DEFAULT_R -#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#ifdef ZGEMM_DEFAULT_UNROLL_MN -#define 
ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN -#else -#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) -#endif - -#define XGEMM_P XGEMM_DEFAULT_P -#define XGEMM_Q XGEMM_DEFAULT_Q -#define XGEMM_R XGEMM_DEFAULT_R -#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) - -#ifdef CGEMM3M_DEFAULT_UNROLL_N - -#define CGEMM3M_P CGEMM3M_DEFAULT_P -#define CGEMM3M_Q CGEMM3M_DEFAULT_Q -#define CGEMM3M_R CGEMM3M_DEFAULT_R -#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) - -#else - -#define CGEMM3M_P SGEMM_DEFAULT_P -#define CGEMM3M_Q SGEMM_DEFAULT_Q -#define CGEMM3M_R SGEMM_DEFAULT_R -#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) - -#endif - - -#ifdef ZGEMM3M_DEFAULT_UNROLL_N - -#define ZGEMM3M_P ZGEMM3M_DEFAULT_P -#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q -#define ZGEMM3M_R ZGEMM3M_DEFAULT_R -#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#else - -#define ZGEMM3M_P DGEMM_DEFAULT_P -#define ZGEMM3M_Q DGEMM_DEFAULT_Q -#define ZGEMM3M_R DGEMM_DEFAULT_R -#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#endif - -#define XGEMM3M_P QGEMM_DEFAULT_P -#define XGEMM3M_Q QGEMM_DEFAULT_Q -#define XGEMM3M_R QGEMM_DEFAULT_R -#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - - -#endif -#endif - -#ifndef COMPLEX -#if defined(XDOUBLE) -#define GEMM_P QGEMM_P -#define GEMM_Q QGEMM_Q -#define GEMM_R 
QGEMM_R -#define GEMM_UNROLL_M QGEMM_UNROLL_M -#define GEMM_UNROLL_N QGEMM_UNROLL_N -#define GEMM_UNROLL_MN QGEMM_UNROLL_MN -#define GEMM_DEFAULT_P QGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R QGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P DGEMM_P -#define GEMM_Q DGEMM_Q -#define GEMM_R DGEMM_R -#define GEMM_UNROLL_M DGEMM_UNROLL_M -#define GEMM_UNROLL_N DGEMM_UNROLL_N -#define GEMM_UNROLL_MN DGEMM_UNROLL_MN -#define GEMM_DEFAULT_P DGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R DGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P SGEMM_P -#define GEMM_Q SGEMM_Q -#define GEMM_R SGEMM_R -#define GEMM_UNROLL_M SGEMM_UNROLL_M -#define GEMM_UNROLL_N SGEMM_UNROLL_N -#define GEMM_UNROLL_MN SGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#endif -#else -#if defined(XDOUBLE) -#define GEMM_P XGEMM_P -#define GEMM_Q XGEMM_Q -#define GEMM_R XGEMM_R -#define GEMM_UNROLL_M XGEMM_UNROLL_M -#define GEMM_UNROLL_N XGEMM_UNROLL_N -#define GEMM_UNROLL_MN XGEMM_UNROLL_MN -#define GEMM_DEFAULT_P XGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R 
XGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P ZGEMM_P -#define GEMM_Q ZGEMM_Q -#define GEMM_R ZGEMM_R -#define GEMM_UNROLL_M ZGEMM_UNROLL_M -#define GEMM_UNROLL_N ZGEMM_UNROLL_N -#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN -#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P CGEMM_P -#define GEMM_Q CGEMM_Q -#define GEMM_R CGEMM_R -#define GEMM_UNROLL_M CGEMM_UNROLL_M -#define GEMM_UNROLL_N CGEMM_UNROLL_N -#define GEMM_UNROLL_MN CGEMM_UNROLL_MN -#define GEMM_DEFAULT_P CGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R CGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#endif -#endif - -#ifdef XDOUBLE -#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N -#elif defined(DOUBLE) -#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N -#else -#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N -#endif - - -#ifndef QGEMM_DEFAULT_UNROLL_M -#define QGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef QGEMM_DEFAULT_UNROLL_N -#define QGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_M -#define XGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_N -#define XGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef GEMM_THREAD -#define GEMM_THREAD gemm_thread_n -#endif - -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P 
* SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) -#endif - -#ifndef SNUMOPT -#define SNUMOPT 2 -#endif - -#ifndef DNUMOPT -#define DNUMOPT 2 -#endif - -#ifndef QNUMOPT -#define QNUMOPT 1 -#endif - -#ifndef GEMM3M_P -#ifdef XDOUBLE -#define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) -#define GEMM3M_P ZGEMM3M_P -#else -#define GEMM3M_P CGEMM3M_P -#endif -#endif - -#ifndef GEMM3M_Q -#ifdef XDOUBLE -#define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) -#define GEMM3M_Q ZGEMM3M_Q -#else -#define GEMM3M_Q CGEMM3M_Q -#endif -#endif - -#ifndef GEMM3M_R -#ifdef XDOUBLE -#define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) -#define GEMM3M_R ZGEMM3M_R -#else -#define GEMM3M_R CGEMM3M_R -#endif -#endif - - -#endif From 28915eed726404bd14ed2828d45fe5293c55603e Mon Sep 17 00:00:00 2001 From: 
Martin Kroeker Date: Fri, 5 Jun 2020 10:05:34 +0200 Subject: [PATCH 229/593] Cosmetic fixes for non-C99 compilers --- test/compare_sgemm_shgemm.c | 65 +++++++++---------------------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index 7e254f844..d37ae6851 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -46,83 +46,50 @@ typedef union } bits; } bfloat16_bits; -typedef union -{ - float v; - struct - { - uint32_t m:23; - uint32_t e:8; - uint32_t s:1; - } bits; -} float32_bits; - -float -float16to32 (bfloat16_bits f16) -{ - float32_bits f32; - f32.bits.s = f16.bits.s; - f32.bits.e = f16.bits.e; - f32.bits.m = (uint32_t) f16.bits.m << 16; - return f32.v; -} - int main (int argc, char *argv[]) { int m, n, k; int i, j, l; + int x; int ret = 0; int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; + char transa = 'N'; + char transb = 'N'; - for (int x = 0; x <= loop; x++) + for (x = 0; x <= loop; x++) { m = k = n = x; float A[m * k]; float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float DD[m * n], CC[m * n]; + float CC[m * n]; - for (int j = 0; j < m; j++) + for (j = 0; j < m; j++) { - for (int i = 0; i < m; i++) + for (i = 0; i < m; i++) { - A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; - DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + &m, BB, &k, &beta, CC, &m); + for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 
0; l < k; l++) - if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) - ret++; - if (transA == 'N' && transB == 'N') - { - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - { - DD[i * m + j] += - float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); - } - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (CC[i * m + j] != DD[i * m + j]) - ret++; - } + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) + ret++; } if (ret != 0) fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); From 0e3ac4a06bc3cce26d593f5b8acad20a6121d1ed Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 6 Jun 2020 14:56:57 +0800 Subject: [PATCH 230/593] Add files via upload --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 150 ++++++++++++--------- 1 file changed, 89 insertions(+), 61 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 416ace59b..9f2bf24e2 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -54,40 +54,40 @@ #define kernel_kstart_n10(mdim,updk) "" #define kernel_kstart_n12(mdim,updk) "" #define kernel_kend_n4(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) #define kernel_kend_n6(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\ + 
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) #define kernel_kend_n8(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) #define kernel_kend_n10(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) 
acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) #define kernel_kend_n12(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) 
acc_kend_nc6_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144) #endif #else #define HEAD_SET_OFF(ndim) {} @@ -129,18 +129,28 @@ #define init_update_k(mdim) "" #define save_update_k(mdim) "" #endif - + #define KERNEL_h_k1m16n1 \ "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" #define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" -#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ +#ifdef BROADCAST_KERNEL + #define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" -#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" -#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ - "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" -#define 
unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) + "vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) +#else + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ + "vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\ + "vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) + #define KERNEL_h_k1m16n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ + unit_acc_m16n2(8,9,10,11,%1) +#endif +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" #define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" @@ -151,24 +161,42 @@ #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #ifdef BROADCAST_KERNEL #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" - #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) - #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) - #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) - #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) - #define 
acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) + #else + #define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;" + #endif + #define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15) + #define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2) #endif #define save_init_m16 "movq %2,%3; addq $128,%2;" #ifdef TRMMKERNEL #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #else #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define 
unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #endif #define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) #define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) @@ -206,11 +234,11 @@ #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" - #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) - #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define 
acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) #endif #define save_init_m8 "movq %2,%3; addq $64,%2;" #ifdef TRMMKERNEL @@ -258,11 +286,11 @@ #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" - #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m4 "movq %2,%3; addq $32,%2;" #ifdef TRMMKERNEL @@ -311,11 +339,11 @@ #define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" - #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define 
acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m2 "movq %2,%3; addq $16,%2;" #ifdef TRMMKERNEL @@ -362,11 +390,11 @@ #define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" - #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" - #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" - #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" - #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" - #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" + #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" #endif #define save_init_m1 "movq %2,%3; addq $8,%2;" #ifdef TRMMKERNEL From 13c28889a2c9a29ac781e51be747782a71cdaf50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jun 2020 15:22:27 +0200 Subject: [PATCH 231/593] Update "cosmetic fixes for non-C99 compilers" --- test/compare_sgemm_shgemm.c | 58 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index d37ae6851..57aee7b8f 100644 --- a/test/compare_sgemm_shgemm.c +++ 
b/test/compare_sgemm_shgemm.c @@ -46,6 +46,27 @@ typedef union } bits; } bfloat16_bits; +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -56,8 +77,6 @@ main (int argc, char *argv[]) int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - char transa = 'N'; - char transb = 'N'; for (x = 0; x <= loop; x++) { @@ -66,30 +85,45 @@ main (int argc, char *argv[]) float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float CC[m * n]; + float DD[m * n], CC[m * n]; for (j = 0; j < m; j++) { for (i = 0; i < m; i++) { - A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; + DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); - + &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) - ret++; + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; + if (transA == 'N' && transB == 'N') + { + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) 
+ if (CC[i * m + j] != DD[i * m + j]) + ret++; + } } if (ret != 0) fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); From 4e28dc6353f9705d71bb1aa1bc0ae84dd1d2610b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jun 2020 00:05:02 +0200 Subject: [PATCH 232/593] Use only -O1 with AMD AOCC version of flang to prevent miscompilation of LAPACK codes and tests on Ryzen --- Makefile.system | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 547babaaa..252c816a9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -783,6 +783,7 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG +FCOMMON_OPT += -frecursive ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -796,6 +797,11 @@ endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif +ifeq ($(OSNAME), Linux) +ifeq ($(ARCH), x86_64) +FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +endif +endif endif ifeq ($(F_COMPILER), G77) @@ -1270,8 +1276,11 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) - +ifeq ($(FLANG_VENDOR),AOCC) +override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) +else override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) +endif override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = From 522aaf53bfdc759cddfec6e25a701907702e9f7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jun 2020 14:30:20 +0200 Subject: [PATCH 233/593] Break out of potentially infinite rescaling loop in LAPACK xLARGV/xLARTG/xLARTGP Reference-LAPACK issue 411 --- lapack-netlib/SRC/clargv.f | 2 +- lapack-netlib/SRC/clartg.f | 2 +- lapack-netlib/SRC/dlartg.f | 2 +- lapack-netlib/SRC/dlartgp.f | 2 +- lapack-netlib/SRC/slartg.f | 2 +- lapack-netlib/SRC/slartgp.f | 2 +- lapack-netlib/SRC/zlargv.f | 2 +- lapack-netlib/SRC/zlartg.f | 2 +- 8 files changed, 8 
insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/clargv.f b/lapack-netlib/SRC/clargv.f index ba53cae6f..36c5108df 100644 --- a/lapack-netlib/SRC/clargv.f +++ b/lapack-netlib/SRC/clargv.f @@ -200,7 +200,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/clartg.f b/lapack-netlib/SRC/clartg.f index da9a1cdef..baa68b657 100644 --- a/lapack-netlib/SRC/clartg.f +++ b/lapack-netlib/SRC/clartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/SRC/dlartg.f b/lapack-netlib/SRC/dlartg.f index 1c7c46f63..dc49986a0 100644 --- a/lapack-netlib/SRC/dlartg.f +++ b/lapack-netlib/SRC/dlartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/dlartgp.f b/lapack-netlib/SRC/dlartgp.f index 0cb0d2d13..334e416e8 100644 --- a/lapack-netlib/SRC/dlartgp.f +++ b/lapack-netlib/SRC/dlartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartg.f b/lapack-netlib/SRC/slartg.f index 784d4bc36..307c9c83a 100644 --- a/lapack-netlib/SRC/slartg.f +++ b/lapack-netlib/SRC/slartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 
20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartgp.f b/lapack-netlib/SRC/slartgp.f index ad76c94b4..f8be5f52b 100644 --- a/lapack-netlib/SRC/slartgp.f +++ b/lapack-netlib/SRC/slartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/zlargv.f b/lapack-netlib/SRC/zlargv.f index 1e17983d5..f83ca1851 100644 --- a/lapack-netlib/SRC/zlargv.f +++ b/lapack-netlib/SRC/zlargv.f @@ -201,7 +201,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/zlartg.f b/lapack-netlib/SRC/zlartg.f index 8989bb896..894b4ded0 100644 --- a/lapack-netlib/SRC/zlartg.f +++ b/lapack-netlib/SRC/zlartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 
20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN From 4cb1db0e3bd5a48433a9193b19994d539250ebdc Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 9 Jun 2020 06:25:45 +0000 Subject: [PATCH 234/593] Test flang build --- .github/workflows/dynamic_arch.yml | 32 +++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index b6a4090bd..ca53e8857 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -9,6 +9,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest] + fortran: [gfortran, flang] build: [cmake, make] steps: - name: Checkout repository @@ -24,7 +25,7 @@ jobs: # Restore any ccache cache entry, if none for # ${{ runner.os }}-ccache-${{ github.sha }} exists restore-keys: | - ${{ runner.os }}-ccache + ${{ runner.os }}-ccache- - name: Print system information run: | @@ -49,8 +50,8 @@ jobs: fi ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB - - name: Build - if: matrix.build == 'make' + - name: gfortran build + if: matrix.build == 'make' && matrix.fortran == 'gfortran' run: | if [ "$RUNNER_OS" == "Linux" ]; then export PATH="/usr/lib/ccache:${PATH}" @@ -63,8 +64,29 @@ jobs: make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 - - name: CMake build - if: matrix.build == 'cmake' + - name: flang build + if: matrix.build == 'make' && matrix.fortran == 'flang' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + exit 0 + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + cd /usr/ + sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz + sudo tar xf flang-20190329-x86-70.tgz + sudo rm flang-20190329-x86-70.tgz + cd - + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang + + + - name: CMake gfortran build + 
if: matrix.build == 'cmake' && matrix.fortran == 'gfortran' run: | if [ "$RUNNER_OS" == "Linux" ]; then export PATH="/usr/lib/ccache:${PATH}" From b98923f33a58c6d78d49e0a22bb6203df5c3f713 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 9 Jun 2020 06:54:42 +0000 Subject: [PATCH 235/593] Test enforce -O1 for flang --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 252c816a9..a343a9829 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1276,7 +1276,7 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -ifeq ($(FLANG_VENDOR),AOCC) +ifeq ($(F_COMPILER), FLANG) override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) else override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) From f07a80354b60121a91f40574ebd6d7c306dcd100 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 9 Jun 2020 16:07:03 +0200 Subject: [PATCH 236/593] Apply previously AOCC-specific workaround to all versions of flang --- Makefile.system | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Makefile.system b/Makefile.system index 252c816a9..56e94f2a6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -797,11 +797,11 @@ endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif -ifeq ($(OSNAME), Linux) -ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) -endif -endif +#ifeq ($(OSNAME), Linux) +#ifeq ($(ARCH), x86_64) +#FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +#endif +#endif endif ifeq ($(F_COMPILER), G77) @@ -1276,7 +1276,8 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -ifeq ($(FLANG_VENDOR),AOCC) +#ifeq ($(FLANG_VENDOR),AOCC) +ifeq ($(F_COMPILER),FLANG) override FFLAGS += $(filter-out -O2 
-O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) else override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) From ba2c5b404d1c0ac5ed6037c44a6adefb385bc73a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 9 Jun 2020 16:09:34 +0200 Subject: [PATCH 237/593] When building with flang, use it also for the final link step to get dependencies right --- exports/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index c92d6e996..01a313b35 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -155,8 +155,12 @@ ifeq ($(F_COMPILER), INTEL) -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
else - ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ From 3ce469a34f435f64d9b3f6ca8ccba69dcfce31b3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 9 Jun 2020 16:11:13 +0200 Subject: [PATCH 238/593] Limit optimization level to O1 for flang and add -frecursive --- cmake/fc.cmake | 1 + cmake/system.cmake | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index cc330ae2c..9dcedffb2 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () + set(FCOMMON_OPT "${FCOMMON_OPT} -frecursive") endif () if (${F_COMPILER} STREQUAL "G77") diff --git a/cmake/system.cmake b/cmake/system.cmake index 7e7f726c5..c2ae471d2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -417,6 +417,15 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") +if ("${F_COMPILER}" STREQUAL "FLANG") + set(FILTER_FLAGS "-O2;-O3") + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "-O1" CMAKE_Fortran_FLAGS_RELEASE ${CMAKE_Fortran_FLAGS_RELEASE}) + endforeach () +endif () +endif () + if (NOT DEFINED SUFFIX) set(SUFFIX o) endif () From bc6fd20a40f0750311311b31872cf3e97f845617 Mon Sep 17 00:00:00 2001 From: ZhangDanfeng <467688405@qq.com> Date: Wed, 10 Jun 2020 01:01:16 +0800 Subject: [PATCH 239/593] fix INIT8x4 Signed-off-by: ZhangDanfeng <467688405@qq.com> --- kernel/arm64/sgemm_kernel_8x8_cortexa53.S | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S index fec0c9ae9..628a928ca 100644 --- a/kernel/arm64/sgemm_kernel_8x8_cortexa53.S +++ 
b/kernel/arm64/sgemm_kernel_8x8_cortexa53.S @@ -681,12 +681,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT8x4 fmov s16, wzr fmov s17, wzr + fmov s18, wzr + fmov s19, s16 fmov s20, wzr fmov s21, s16 - fmov s24, wzr - fmov s25, s16 - fmov s28, wzr - fmov s29, s16 + fmov s22, wzr + fmov s23, s16 .endm .macro KERNEL8x4_I @@ -765,14 +765,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla v21.4s, v3.4s, v6.s[2] fmla v22.4s, v2.4s, v6.s[3] fmla v23.4s, v3.4s, v6.s[3] - fmla v24.4s, v2.4s, v7.s[0] - fmla v25.4s, v3.4s, v7.s[0] - fmla v26.4s, v2.4s, v7.s[1] - fmla v27.4s, v3.4s, v7.s[1] - fmla v28.4s, v2.4s, v7.s[2] - fmla v29.4s, v3.4s, v7.s[2] - fmla v30.4s, v2.4s, v7.s[3] - fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x4_SUB From 9fe930f205c3ad56fe92d9c4c65e48836db33a27 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 11 Jun 2020 15:47:20 -0500 Subject: [PATCH 240/593] powerpc: Add support for future processor This is the initial patch to support build infrastructure for POWER10 architecture. 
--- Makefile.power | 10 ++ Makefile.system | 7 ++ TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 2 +- common.h | 7 +- common_power.h | 8 +- cpuid_power.c | 15 ++- driver/others/dynamic_power.c | 16 ++- getarch.c | 13 +++ kernel/CMakeLists.txt | 2 +- kernel/Makefile.L3 | 4 + kernel/power/KERNEL.POWER10 | 214 ++++++++++++++++++++++++++++++++++ kernel/power/casum.c | 2 +- kernel/power/ccopy.c | 2 +- kernel/power/crot.c | 2 +- kernel/power/cswap.c | 2 +- kernel/power/dasum.c | 2 +- kernel/power/daxpy.c | 2 +- kernel/power/dcopy.c | 2 +- kernel/power/ddot.c | 2 +- kernel/power/dgemv_n.c | 2 +- kernel/power/drot.c | 2 +- kernel/power/dscal.c | 2 +- kernel/power/dswap.c | 2 +- kernel/power/sasum.c | 2 +- kernel/power/scopy.c | 2 +- kernel/power/sdot.c | 2 +- kernel/power/srot.c | 2 +- kernel/power/sscal.c | 2 +- kernel/power/sswap.c | 2 +- kernel/power/zasum.c | 2 +- kernel/power/zaxpy.c | 2 +- kernel/power/zcopy.c | 2 +- kernel/power/zdot.c | 2 +- kernel/power/zscal.c | 2 +- kernel/power/zswap.c | 2 +- param.h | 2 +- 38 files changed, 309 insertions(+), 42 deletions(-) create mode 100644 kernel/power/KERNEL.POWER10 diff --git a/Makefile.power b/Makefile.power index 24d8aa8a7..5c431860f 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,6 +9,16 @@ else USE_OPENMP = 1 endif +ifeq ($(CORE), POWER10) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math +endif +endif + ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp diff --git a/Makefile.system b/Makefile.system index 56e94f2a6..3decc1457 
100644 --- a/Makefile.system +++ b/Makefile.system @@ -595,6 +595,7 @@ DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 +DYNAMIC_CORE += POWER10 endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) @@ -603,6 +604,12 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +ifeq ($(GCCVERSIONGTEQ11), 1) +DYNAMIC_CORE += POWER10 +else +$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) +endif endif endif diff --git a/TargetList.txt b/TargetList.txt index e2d2f4026..4e54e3077 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -49,6 +49,7 @@ POWER6 POWER7 POWER8 POWER9 +POWER10 PPCG4 PPC970 PPC970MP diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9d51f777c..d56ba99cb 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,7 +49,7 @@ if (DYNAMIC_ARCH) endif () if (POWER) - set(DYNAMIC_CORE POWER6 POWER8 POWER9) + set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) endif () if (X86) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 067b97b4b..30256870c 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -420,7 +420,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) - elseif ("${TCORE}" STREQUAL "POWER9") + elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" "#define L1_DATA_LINESIZE 128\n" diff --git a/common.h b/common.h index e2c8cdee5..00b34a3f7 100644 --- a/common.h +++ b/common.h @@ -360,13 +360,8 @@ typedef int blasint; #endif #endif -#ifdef POWER8 -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif -#ifdef POWER9 +#if defined(POWER8) || 
defined(POWER9) || defined(POWER10) #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif diff --git a/common_power.h b/common_power.h index e29d0f382..aa19794b5 100644 --- a/common_power.h +++ b/common_power.h @@ -68,7 +68,7 @@ #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #define RMB __asm__ __volatile__ ("eieio":::"memory") @@ -272,7 +272,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -294,7 +294,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -843,7 +843,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) +#elif defined(POWER8) || defined(POWER9) || defined(POWER10) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/cpuid_power.c b/cpuid_power.c index d5ba6fb2c..b36aa4945 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -57,6 +57,7 @@ #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 #define CPUTYPE_POWER9 9 +#define CPUTYPE_POWER10 10 char *cpuname[] = { "UNKNOWN", @@ -68,7 +69,8 @@ char *cpuname[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; char *lowercpuname[] = { @@ -81,7 +83,8 @@ char 
*lowercpuname[] = { "cell", "ppcg4", "power8", - "power9" + "power9", + "power10" }; char *corename[] = { @@ -94,7 +97,8 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; int detect(void){ @@ -125,6 +129,7 @@ int detect(void){ if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; + if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -157,6 +162,7 @@ int detect(void){ if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; + if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -179,6 +185,9 @@ int detect(void){ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { + case 0x80: // POWER10 + return CPUTYPE_POWER10; + break; case 0x4e: // POWER9 return CPUTYPE_POWER9; break; diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 8c831b998..811a5fae3 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,6 +6,9 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) +extern gotoblas_t gotoblas_POWER10; +#endif extern void openblas_warning(int verbose, const char *msg); @@ -13,7 +16,8 @@ static char *corename[] = { "unknown", "POWER6", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; #define NUM_CORETYPES 4 @@ -23,6 +27,9 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == 
&gotoblas_POWER9) return corename[3]; +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + if (gotoblas == &gotoblas_POWER10) return corename[4]; #endif return corename[0]; } @@ -36,6 +43,10 @@ static gotoblas_t *get_coretype(void) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + if (__builtin_cpu_is("isa_3_1") && __builtin_cpu_supports ("mma")) + return &gotoblas_POWER10; #endif return NULL; } @@ -61,6 +72,9 @@ static gotoblas_t *force_coretype(char * coretype) { case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + case 4: return (&gotoblas_POWER10); #endif default: return NULL; } diff --git a/getarch.c b/getarch.c index c173d58b8..164947f3e 100644 --- a/getarch.c +++ b/getarch.c @@ -650,6 +650,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER9" #endif +#if defined(FORCE_POWER10) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER10" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER10 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power10" +#define CORENAME "POWER10" +#endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index b114c6a33..d1349c5f8 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -130,7 +130,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index da6c5fd57..0cb02ef85 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -51,6 +51,10 @@ ifeq ($(CORE), POWER9) USE_TRMM = 1 endif +ifeq ($(CORE), POWER10) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 new file mode 100644 index 000000000..ab8fbfcd9 --- /dev/null +++ b/kernel/power/KERNEL.POWER10 @@ -0,0 +1,214 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = sgemm_kernel_power9.S +DTRMMKERNEL = dgemm_kernel_power9.S +CTRMMKERNEL = cgemm_kernel_power9.S +ZTRMMKERNEL = 
zgemm_kernel_power9.S + +SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. +#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMAXKERNEL = isamax_power9.S +else +ISAMAXKERNEL = isamax.c +endif +IDAMAXKERNEL = idamax.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMAXKERNEL = icamax_power9.S +else +ICAMAXKERNEL = icamax.c +endif +IZAMAXKERNEL = izamax.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMINKERNEL = isamin_power9.S +else +ISAMINKERNEL = isamin.c +endif +IDAMINKERNEL = idamin.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMINKERNEL = icamin_power9.S +else +ICAMINKERNEL = icamin.c +endif +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power9.S +else +CAXPYKERNEL = caxpy.c +endif +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = sdot.c +ifneq 
($(GCCVERSIONGTEQ9),1) +CDOTKERNEL = cdot_power9.S +else +CDOTKERNEL = cdot.c +endif +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif diff --git a/kernel/power/casum.c b/kernel/power/casum.c index a9ece0768..3478a39ef 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "casum_microk_power8.c" #endif diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index 50df84cc5..cbe5b48d2 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "ccopy_microk_power8.c" #endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 2a5835546..5c1d44620 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 31e02fe5a..88cb1d638 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "cswap_microk_power8.c" #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index d0e060977..09e06d909 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dasum_microk_power8.c" #endif diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index f09611ff0..018beafd1 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "daxpy_microk_power8.c" #endif diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 27b39144b..cf203e71e 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dcopy_microk_power8.c" #endif diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index f985df1c5..bd9e1fb97 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "ddot_microk_power8.c" #endif diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index 1a3d7669c..b4dfda550 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dgemv_n_microk_power8.c" #endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index baeb54205..b808ab566 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 779a08e9c..7e0fe48c0 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 52b7f50da..795bb10b4 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 5908347d3..b259d7d76 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sasum_microk_power8.c" #endif diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 5e3fe45a5..5207d386e 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "scopy_microk_power8.c" #endif diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index ae527dde9..8de434e41 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sdot_microk_power8.c" #endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 6af813c16..9638a59eb 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 4f3ba5698..ddd5b2c5b 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 23d13280f..a56434444 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index f61c62e75..8383e39ab 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index f0f8c6910..4a7c26c69 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zaxpy_microk_power8.c" #endif diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index b21d6ef15..bb80decd2 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index fd36c7f44..9086ef35b 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index a1b441d2c..16b584bca 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 1d8826f41..c6508f032 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zswap_microk_power8.c" #endif diff --git a/param.h b/param.h index 04928277c..fd0ea7599 100644 --- a/param.h +++ b/param.h @@ -2260,7 +2260,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER9) +#if defined(POWER9) || defined(POWER10) #define SNUMOPT 16 #define DNUMOPT 8 From 007d9f97d7d53bfb569708ba51f8d9fdd3fba211 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jun 2020 19:25:28 +0200 Subject: [PATCH 241/593] Make gotoblas_corename report the name of the selected TARGET rather than its aliases --- driver/others/dynamic.c | 50 +++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 2e87e186a..1bf0e4a6d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -764,18 +764,53 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_ATOM) +#ifdef DYNAMIC_OLDER + return corename[ 6]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_PENRYN) +#ifdef DYNAMIC_OLDER + return corename[ 8]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_DUNNINGTON) +#ifdef DYNAMIC_OLDER + return corename[ 9]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_NEHALEM) return corename[10]; if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_OPTERON_SSE3) +#ifdef DYNAMIC_OLDER + return corename[12]; +#else + return corename[7]; +#endif + if (gotoblas == &gotoblas_OPTERON) +#ifdef DYNAMIC_OLDER + return corename[13]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == 
&gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_NANO) +#ifdef DYNAMIC_OLDER + return corename[15]; +#else + return corename[10]; +#endif if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BOBCAT) +#ifdef DYNAMIC_OLDER + return corename[17]; +#else + return corename[7]; +#endif if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; @@ -787,6 +822,7 @@ char *gotoblas_corename(void) { } + static gotoblas_t *force_coretype(char *coretype){ int i ; From 41fc6f3cd2c46ff8ed136b2eb03782c434646c00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20M=C3=A4rtens?= Date: Sat, 13 Jun 2020 22:37:39 +0200 Subject: [PATCH 242/593] Added missing exported symbols. --- exports/gensymbol | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 0a68a3572..73b4be248 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -40,15 +40,10 @@ ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, xerbla, saxpby,daxpby,caxpby,zaxpby, + somatcopy, domatcopy, comatcopy, zomatcopy, + simatcopy, dimatcopy, cimatcopy, zimatcopy, sgeadd,dgeadd,cgeadd,zgeadd, - somatcopy, - simatcopy, - domatcopy, - dimatcopy, - comatcopy, - cimatcopy, - zomatcopy, - zimatcopy, + ssum, dsum, scsum, dzsum ); @halfblasobjs = (shgemm); @@ -81,7 +76,12 @@ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, - cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd + cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, + cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, + cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, + cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, + cblas_ssum, 
cblas_dsum, cblas_scsum, cblas_dzsum, + cblas_xerbla ); @halfcblasobjs = (cblas_shgemm); @@ -3501,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs); if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; - if ($ARGV[4] == 0) { @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); + if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; + if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; + if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; + if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; }else{ #NO_CBLAS=1 @no_underscore_objs = (@misc_no_underscore_objs); From abf670757b2838dd68eede52aba29275b4c6d2cb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Jun 2020 23:21:13 +0200 Subject: [PATCH 243/593] Respect predefined defaults for AR, AS, LD and RANLIB --- Makefile.system | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 56e94f2a6..d5e747824 100644 --- a/Makefile.system +++ b/Makefile.system @@ -263,10 +263,10 @@ endif ARFLAGS = CPP = $(COMPILER) -E -AR = $(CROSS_SUFFIX)ar -AS = $(CROSS_SUFFIX)as -LD = $(CROSS_SUFFIX)ld -RANLIB = $(CROSS_SUFFIX)ranlib +AR ?= $(CROSS_SUFFIX)ar +AS ?= $(CROSS_SUFFIX)as +LD ?= $(CROSS_SUFFIX)ld +RANLIB ?= $(CROSS_SUFFIX)ranlib NM = $(CROSS_SUFFIX)nm DLLWRAP = $(CROSS_SUFFIX)dllwrap OBJCOPY = $(CROSS_SUFFIX)objcopy From 0ed2adf0b25f25ddedf8c858010d233b87d615cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 00:01:20 +0200 Subject: [PATCH 244/593] Fix spelling of flang option -Mrecursive and add -Kieee --- Makefile.system | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/Makefile.system b/Makefile.system 
index 56e94f2a6..31cdd12b2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -783,7 +783,7 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG -FCOMMON_OPT += -frecursive +FCOMMON_OPT += -Mrecursive -Kieee ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -797,11 +797,6 @@ endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif -#ifeq ($(OSNAME), Linux) -#ifeq ($(ARCH), x86_64) -#FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) -#endif -#endif endif ifeq ($(F_COMPILER), G77) @@ -1276,7 +1271,6 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -#ifeq ($(FLANG_VENDOR),AOCC) ifeq ($(F_COMPILER),FLANG) override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) else From 1dd712131e0e4efcae52f7171cb47d427cd60fff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 00:09:31 +0200 Subject: [PATCH 245/593] Fix spelling of flang option -Mrecursive and add -Kieee --- cmake/fc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 9dcedffb2..fc1f9bb22 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -16,7 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") if (USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () - set(FCOMMON_OPT "${FCOMMON_OPT} -frecursive") + set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") endif () if (${F_COMPILER} STREQUAL "G77") From 18a11137f1be433b88ef34e0fb115e7280a67d12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 10:26:25 +0200 Subject: [PATCH 246/593] Update BLAS tests to correspond to Reference-LAPACK 3.9.0 replaces calculation of machine precision with call to epsilon intrinsic and removes the requirement for previous output files to be removed before rerunning tests --- test/cblat1.f | 83 ++++++++++++++++------ test/cblat2.f | 188 
+++++++++++++++++++++++++++++-------------------- test/cblat3.f | 187 ++++++++++++++++++++++++++++++------------------ test/dblat2.f | 186 +++++++++++++++++++++++++++++------------------- test/dblat3.f | 168 ++++++++++++++++++++++++++++---------------- test/sblat2.f | 186 +++++++++++++++++++++++++++++------------------- test/sblat3.f | 168 ++++++++++++++++++++++++++++---------------- test/zblat1.f | 83 ++++++++++++++++------ test/zblat2.f | 188 +++++++++++++++++++++++++++++-------------------- test/zblat3.f | 191 ++++++++++++++++++++++++++++++++------------------ 10 files changed, 1036 insertions(+), 592 deletions(-) diff --git a/test/cblat1.f b/test/cblat1.f index d6b53d105..ecf2a44cb 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -1,7 +1,49 @@ +*> \brief \b CBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 1 BLAS. +*> Based upon the original BLAS test routine together with: +*> +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT1 -* Test program for the COMPLEX Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. 
INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), - + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.5E0,0.0E0), + + (0.0E0,0.5E0), (0.0E0,0.2E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -129,10 +171,10 @@ + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), - + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), - + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ - DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ - DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + + (0.5E0,0.0E0), (6.0E0,9.0E0), (0.0E0,0.5E0), + + (8.0E0,3.0E0), (0.0E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.8E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.6E0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -145,8 +187,8 @@ + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.19E0,-0.17E0), (0.32E0,0.09E0), - + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (0.19E0,-0.17E0), (0.20E0,-0.35E0), + + (0.35E0,0.20E0), (0.14E0,0.08E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + (2.0E0,3.0E0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), @@ -162,9 +204,9 @@ + (-0.17E0,0.46E0), (4.0E0,7.0E0), + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), - + (0.32E0,0.09E0), (6.0E0,9.0E0), - + (0.23E0,-0.24E0), (8.0E0,3.0E0), - + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + + (0.20E0,-0.35E0), (6.0E0,9.0E0), + + (0.35E0,0.20E0), (8.0E0,3.0E0), + + 
(0.14E0,0.08E0), (9.0E0,4.0E0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), @@ -177,8 +219,8 @@ + (0.03E0,0.03E0), (-0.18E0,0.03E0), + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), - + (0.09E0,0.03E0), (0.03E0,0.12E0), - + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (0.09E0,0.03E0), (0.15E0,0.00E0), + + (0.00E0,0.15E0), (0.00E0,0.06E0), (2.0E0,3.0E0), + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), @@ -193,8 +235,8 @@ + (-0.18E0,0.03E0), (4.0E0,7.0E0), + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), - + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), - + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + + (0.15E0,0.00E0), (6.0E0,9.0E0), (0.00E0,0.15E0), + + (8.0E0,3.0E0), (0.00E0,0.06E0), (9.0E0,4.0E0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + REAL ZERO + PARAMETER (NOUT=6, ZERO=0.0E0) * .. Scalar Arguments .. REAL SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/test/cblat2.f b/test/cblat2.f index 20f188100..8c7bac48e 100644 --- a/test/cblat2.f +++ b/test/cblat2.f @@ -1,68 +1,114 @@ +*> \brief \b CBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 2 Blas. 
+*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'cblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> CGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. 
+*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT2 * -* Test program for the COMPLEX Level 2 Blas. -* -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. 
-* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* CGERC T PUT F FOR NO TEST. SAME COLUMNS. -* CGERU T PUT F FOR NO TEST. SAME COLUMNS. -* CHER T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -* -- Written on 10-August-1987. 
-* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -71,8 +117,8 @@ PARAMETER ( NSUBS = 17 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -126,7 +172,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -135,7 +181,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -240,14 +286,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMVCH using exact data. @@ -3079,7 +3118,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/cblat3.f b/test/cblat3.f index 5df1ddd64..a65e1364c 100644 --- a/test/cblat3.f +++ b/test/cblat3.f @@ -1,50 +1,96 @@ +*> \brief \b CBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX Level 3 Blas. +*> +*> The program must be driven by a short data file. 
The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'cblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> CHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. 
+*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex_blas_testing +* +* ===================================================================== PROGRAM CBLAT3 * -* Test program for the COMPLEX Level 3 Blas. -* -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* CHERK T PUT F FOR NO TEST. SAME COLUMNS. -* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. 
-* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -53,8 +99,8 @@ PARAMETER ( NSUBS = 9 ) COMPLEX ZERO, ONE PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) - REAL RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -103,7 +149,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -112,7 +158,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -189,14 +235,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of CMMCH using exact data. @@ -1301,8 +1340,6 @@ NC = 0 RESET = .TRUE. 
ERRMAX = RZERO - RALS = RONE - RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) @@ -1948,7 +1985,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1958,12 +1995,19 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to CSYMM and CHEMM +* with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. COMPLEX ALPHA, BETA REAL RALPHA, RBETA @@ -1981,6 +2025,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. 
+* + ALPHA = CMPLX( ONE, -ONE ) + BETA = CMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2207,16 +2259,16 @@ CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2274,16 +2326,16 @@ CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -3270,7 
+3322,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LCERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/dblat2.f b/test/dblat2.f index 4002d4368..9bbbe9792 100644 --- a/test/dblat2.f +++ b/test/dblat2.f @@ -1,75 +1,121 @@ +*> \brief \b DBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'dblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DTBSV T PUT F FOR NO TEST. SAME COLUMNS.
+*> DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> DGER T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT2 * -* Test program for the DOUBLE PRECISION Level 2 Blas. -* -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). 
An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* DGER T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
-* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/test/dblat3.f b/test/dblat3.f index 082e03e5e..1ebec4ffa 100644 --- a/test/dblat3.f +++ b/test/dblat3.f @@ -1,55 +1,101 @@ +*> \brief \b DBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the DOUBLE PRECISION Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'dblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. 
+*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup double_blas_testing +* +* ===================================================================== PROGRAM DBLAT3 * -* Test program for the DOUBLE PRECISION Level 3 Blas. -* -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. 
-* A Set of Level 3 Basic Linear Algebra Subprograms. -* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of DMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. 
* Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to DSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + DOUBLE PRECISION ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2,
B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LDERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/sblat2.f b/test/sblat2.f index a1074be52..56ead8640 100644 --- a/test/sblat2.f +++ b/test/sblat2.f @@ -1,75 +1,121 @@ +*> \brief \b SBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 2 Blas. +*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 16 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 34 lines: +*> 'sblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 0.9 VALUES OF BETA +*> SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSV T PUT F FOR NO TEST. SAME COLUMNS. 
+*> STBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> STPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> SGER T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. +*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT2 * -* Test program for the REAL Level 2 Blas. -* -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 16 records -* are read using the format ( A6, L2 ). 
An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 34 lines: -* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 0.9 VALUES OF BETA -* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. -* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRMV T PUT F FOR NO TEST. SAME COLUMNS. -* STBMV T PUT F FOR NO TEST. SAME COLUMNS. -* STPMV T PUT F FOR NO TEST. SAME COLUMNS. -* STRSV T PUT F FOR NO TEST. SAME COLUMNS. -* STBSV T PUT F FOR NO TEST. SAME COLUMNS. -* STPSV T PUT F FOR NO TEST. SAME COLUMNS. -* SGER T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. -* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
-* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -* -- Written on 10-August-1987. -* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 16 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -121,7 +167,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -130,7 +176,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -235,14 +281,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 90 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 100 - EPS = HALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMVCH using exact data. @@ -2982,7 +3021,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/test/sblat3.f b/test/sblat3.f index 325a9eb92..66edac14e 100644 --- a/test/sblat3.f +++ b/test/sblat3.f @@ -1,55 +1,101 @@ +*> \brief \b SBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the REAL Level 3 Blas. +*> +*> The program must be driven by a short data file. The first 14 records +*> of the file are read using list-directed input, the last 6 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 20 lines: +*> 'sblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> 0.0 1.0 0.7 VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> 0.0 1.0 1.3 VALUES OF BETA +*> SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> STRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. 
+*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup single_blas_testing +* +* ===================================================================== PROGRAM SBLAT3 * -* Test program for the REAL Level 3 Blas. -* -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 6 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 20 lines: -* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* 0.0 1.0 0.7 VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* 0.0 1.0 1.3 VALUES OF BETA -* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRMM T PUT F FOR NO TEST. SAME COLUMNS. -* STRSM T PUT F FOR NO TEST. SAME COLUMNS. -* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. 
-* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 5 ) INTEGER NSUBS PARAMETER ( NSUBS = 6 ) - REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -96,7 +142,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -105,7 +151,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -182,14 +228,7 @@ * * Compute EPS (the machine precision). * - EPS = ONE - 70 CONTINUE - IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) - $ GO TO 80 - EPS = HALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(ZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of SMMCH using exact data. @@ -1802,7 +1841,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, BETA, A, B and C should not need to be defined. 
+* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1812,12 +1851,18 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA and BETA (eca) +* 3-19-92: Fix argument 12 in calls to SSYMM with INFOT = 9 (eca) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0E0, TWO = 2.0E0 ) * .. Local Scalars .. REAL ALPHA, BETA * .. Local Arrays .. @@ -1834,6 +1879,12 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA and BETA. +* + ALPHA = ONE + BETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM 10 INFOT = 1 CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) @@ -1963,16 +2014,16 @@ CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2660,7 +2711,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LSERES = .TRUE. 
GO TO 80 70 CONTINUE diff --git a/test/zblat1.f b/test/zblat1.f index 8b4b8d21e..2d7b88490 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -1,7 +1,49 @@ +*> \brief \b ZBLAT1 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT1 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 1 BLAS. +*> +*> Based upon the original BLAS test routine together with: +*> F06GAF Example Program Text +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT1 -* Test program for the COMPLEX*16 Level 1 BLAS. -* Based upon the original BLAS test routine together with: -* F06GAF Example Program Text +* +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 +* +* ===================================================================== +* * .. Parameters .. 
INTEGER NOUT PARAMETER (NOUT=6) @@ -114,8 +156,8 @@ + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), - + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.5D0,0.0D0), + + (0.0D0,0.5D0), (0.0D0,0.2D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -129,10 +171,10 @@ + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), - + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), - + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ - DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ - DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + + (0.5D0,0.0D0), (6.0D0,9.0D0), (0.0D0,0.5D0), + + (8.0D0,3.0D0), (0.0D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.8D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.6D0/ DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -145,8 +187,8 @@ + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.19D0,-0.17D0), (0.32D0,0.09D0), - + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (0.19D0,-0.17D0), (0.20D0,-0.35D0), + + (0.35D0,0.20D0), (0.14D0,0.08D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + (2.0D0,3.0D0)/ DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), @@ -162,9 +204,9 @@ + (-0.17D0,0.46D0), (4.0D0,7.0D0), + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), - + (0.32D0,0.09D0), (6.0D0,9.0D0), - + (0.23D0,-0.24D0), (8.0D0,3.0D0), - + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + + (0.20D0,-0.35D0), (6.0D0,9.0D0), + + (0.35D0,0.20D0), (8.0D0,3.0D0), + + 
(0.14D0,0.08D0), (9.0D0,4.0D0)/ DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), @@ -177,8 +219,8 @@ + (0.03D0,0.03D0), (-0.18D0,0.03D0), + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), - + (0.09D0,0.03D0), (0.03D0,0.12D0), - + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (0.09D0,0.03D0), (0.15D0,0.00D0), + + (0.00D0,0.15D0), (0.00D0,0.06D0), (2.0D0,3.0D0), + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), @@ -193,8 +235,8 @@ + (-0.18D0,0.03D0), (4.0D0,7.0D0), + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), - + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), - + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + + (0.15D0,0.00D0), (6.0D0,9.0D0), (0.00D0,0.15D0), + + (8.0D0,3.0D0), (0.00D0,0.06D0), (9.0D0,4.0D0)/ DATA ITRUE3/0, 1, 2, 2, 2/ * .. Executable Statements .. DO 60 INCX = 1, 2 @@ -529,7 +571,8 @@ * * .. Parameters .. INTEGER NOUT - PARAMETER (NOUT=6) + DOUBLE PRECISION ZERO + PARAMETER (NOUT=6, ZERO=0.0D0) * .. Scalar Arguments .. DOUBLE PRECISION SFAC INTEGER LEN @@ -552,7 +595,7 @@ * DO 40 I = 1, LEN SD = SCOMP(I) - STRUE(I) - IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + IF (ABS(SFAC*SD) .LE. ABS(SSIZE(I))*EPSILON(ZERO)) + GO TO 40 * * HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). diff --git a/test/zblat2.f b/test/zblat2.f index e65cdcc70..4a20ac567 100644 --- a/test/zblat2.f +++ b/test/zblat2.f @@ -1,68 +1,114 @@ +*> \brief \b ZBLAT2 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT2 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 2 Blas. 
+*> +*> The program must be driven by a short data file. The first 18 records +*> of the file are read using list-directed input, the last 17 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 35 lines: +*> 'zblat2.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 4 NUMBER OF VALUES OF K +*> 0 1 2 4 VALUES OF K +*> 4 NUMBER OF VALUES OF INCX AND INCY +*> 1 2 -1 -2 VALUES OF INCX AND INCY +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +*> ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +*> An extended set of Fortran Basic Linear Algebra Subprograms. 
+*> +*> Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +*> and Computer Science Division, Argonne National Laboratory, +*> 9700 South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> Or +*> +*> NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +*> Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +*> OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +*> Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +*> +*> +*> -- Written on 10-August-1987. +*> Richard Hanson, Sandia National Labs. +*> Jeremy Du Croz, NAG Central Office. +*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT2 * -* Test program for the COMPLEX*16 Level 2 Blas. -* -* The program must be driven by a short data file. The first 18 records -* of the file are read using list-directed input, the last 17 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 35 lines: -* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. 
-* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 4 NUMBER OF VALUES OF K -* 0 1 2 4 VALUES OF K -* 4 NUMBER OF VALUES OF INCX AND INCY -* 1 2 -1 -2 VALUES OF INCX AND INCY -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. -* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. -* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. -* An extended set of Fortran Basic Linear Algebra Subprograms. -* -* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics -* and Computer Science Division, Argonne National Laboratory, -* 9700 South Cass Avenue, Argonne, Illinois 60439, US. -* -* Or -* -* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms -* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford -* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st -* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -* -- Written on 10-August-1987. 
-* Richard Hanson, Sandia National Labs. -* Jeremy Du Croz, NAG Central Office. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -72,8 +118,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX @@ -127,7 +173,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -136,7 +182,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -241,14 +287,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 90 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 100 - EPS = RHALF*EPS - GO TO 90 - 100 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMVCH using exact data. @@ -3087,7 +3126,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE diff --git a/test/zblat3.f b/test/zblat3.f index f03b1a617..0e38334e9 100644 --- a/test/zblat3.f +++ b/test/zblat3.f @@ -1,50 +1,97 @@ +*> \brief \b ZBLAT3 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZBLAT3 +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> Test program for the COMPLEX*16 Level 3 Blas. +*> +*> The program must be driven by a short data file. 
The first 14 records +*> of the file are read using list-directed input, the last 9 records +*> are read using the format ( A6, L2 ). An annotated example of a data +*> file can be obtained by deleting the first 3 characters from the +*> following 23 lines: +*> 'zblat3.out' NAME OF SUMMARY OUTPUT FILE +*> 6 UNIT NUMBER OF SUMMARY FILE +*> 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +*> -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +*> F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +*> F LOGICAL FLAG, T TO STOP ON FAILURES. +*> T LOGICAL FLAG, T TO TEST ERROR EXITS. +*> 16.0 THRESHOLD VALUE OF TEST RATIO +*> 6 NUMBER OF VALUES OF N +*> 0 1 2 3 5 9 VALUES OF N +*> 3 NUMBER OF VALUES OF ALPHA +*> (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +*> 3 NUMBER OF VALUES OF BETA +*> (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +*> ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +*> ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +*> ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +*> +*> +*> Further Details +*> =============== +*> +*> See: +*> +*> Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +*> A Set of Level 3 Basic Linear Algebra Subprograms. +*> +*> Technical Memorandum No.88 (Revision 1), Mathematics and +*> Computer Science Division, Argonne National Laboratory, 9700 +*> South Cass Avenue, Argonne, Illinois 60439, US. +*> +*> -- Written on 8-February-1989. +*> Jack Dongarra, Argonne National Laboratory. +*> Iain Duff, AERE Harwell. +*> Jeremy Du Croz, Numerical Algorithms Group Ltd. +*> Sven Hammarling, Numerical Algorithms Group Ltd. 
+*> +*> 10-9-00: Change STATUS='NEW' to 'UNKNOWN' so that the testers +*> can be run multiple times without deleting generated +*> output files (susan) +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \date April 2012 +* +*> \ingroup complex16_blas_testing +* +* ===================================================================== PROGRAM ZBLAT3 * -* Test program for the COMPLEX*16 Level 3 Blas. -* -* The program must be driven by a short data file. The first 14 records -* of the file are read using list-directed input, the last 9 records -* are read using the format ( A6, L2 ). An annotated example of a data -* file can be obtained by deleting the first 3 characters from the -* following 23 lines: -* 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE -* 6 UNIT NUMBER OF SUMMARY FILE -* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) -* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. -* T LOGICAL FLAG, T TO TEST ERROR EXITS. -* 16.0 THRESHOLD VALUE OF TEST RATIO -* 6 NUMBER OF VALUES OF N -* 0 1 2 3 5 9 VALUES OF N -* 3 NUMBER OF VALUES OF ALPHA -* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA -* 3 NUMBER OF VALUES OF BETA -* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA -* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. -* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. -* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. -* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. -* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. -* -* See: -* -* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. -* A Set of Level 3 Basic Linear Algebra Subprograms. 
-* -* Technical Memorandum No.88 (Revision 1), Mathematics and -* Computer Science Division, Argonne National Laboratory, 9700 -* South Cass Avenue, Argonne, Illinois 60439, US. +* -- Reference BLAS test routine (version 3.7.0) -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* April 2012 * -* -- Written on 8-February-1989. -* Jack Dongarra, Argonne National Laboratory. -* Iain Duff, AERE Harwell. -* Jeremy Du Croz, Numerical Algorithms Group Ltd. -* Sven Hammarling, Numerical Algorithms Group Ltd. +* ===================================================================== * * .. Parameters .. INTEGER NIN @@ -54,8 +101,8 @@ COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) - DOUBLE PRECISION RZERO, RHALF, RONE - PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX @@ -104,7 +151,7 @@ * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -113,7 +160,7 @@ READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -190,14 +237,7 @@ * * Compute EPS (the machine precision). * - EPS = RONE - 70 CONTINUE - IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) - $ GO TO 80 - EPS = RHALF*EPS - GO TO 70 - 80 CONTINUE - EPS = EPS + EPS + EPS = EPSILON(RZERO) WRITE( NOUT, FMT = 9998 )EPS * * Check the reliability of ZMMCH using exact data. @@ -1303,8 +1343,6 @@ NC = 0 RESET = .TRUE. 
ERRMAX = RZERO - RALS = RONE - RBETS = RONE * DO 100 IN = 1, NIDIM N = IDIM( IN ) @@ -1951,7 +1989,7 @@ * * Tests the error exits from the Level 3 Blas. * Requires a special version of the error-handling routine XERBLA. -* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* A, B and C should not need to be defined. * * Auxiliary routine for test program for Level 3 Blas. * @@ -1961,12 +1999,20 @@ * Jeremy Du Croz, Numerical Algorithms Group Ltd. * Sven Hammarling, Numerical Algorithms Group Ltd. * +* 3-19-92: Initialize ALPHA, BETA, RALPHA, and RBETA (eca) +* 3-19-92: Fix argument 12 in calls to ZSYMM and ZHEMM +* with INFOT = 9 (eca) +* 10-9-00: Declared INTRINSIC DCMPLX (susan) +* * .. Scalar Arguments .. INTEGER ISNUM, NOUT CHARACTER*6 SRNAMT * .. Scalars in Common .. INTEGER INFOT, NOUTC LOGICAL LERR, OK +* .. Parameters .. + REAL ONE, TWO + PARAMETER ( ONE = 1.0D0, TWO = 2.0D0 ) * .. Local Scalars .. COMPLEX*16 ALPHA, BETA DOUBLE PRECISION RALPHA, RBETA @@ -1975,6 +2021,8 @@ * .. External Subroutines .. EXTERNAL ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM +* .. Intrinsic Functions .. + INTRINSIC DCMPLX * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK, LERR * .. Executable Statements .. @@ -1984,6 +2032,14 @@ * LERR is set to .TRUE. by the special version of XERBLA each time * it is called, and is then tested and re-set by CHKXER. LERR = .FALSE. +* +* Initialize ALPHA, BETA, RALPHA, and RBETA. 
+* + ALPHA = DCMPLX( ONE, -ONE ) + BETA = DCMPLX( TWO, -TWO ) + RALPHA = ONE + RBETA = TWO +* GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, $ 90 )ISNUM 10 INFOT = 1 @@ -2210,16 +2266,16 @@ CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ -2277,16 +2333,16 @@ CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 9 - CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) INFOT = 12 CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) @@ 
-3276,7 +3332,6 @@ 50 CONTINUE END IF * - 60 CONTINUE LZERES = .TRUE. GO TO 80 70 CONTINUE From 79cdcde7173cbb8adf231867fb6643afaada5712 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 17:18:16 +0200 Subject: [PATCH 247/593] Re-enable higher optimization levels for flang while disabling loop unrolling for AOCC flang --- Makefile.system | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 31cdd12b2..d68353b12 100644 --- a/Makefile.system +++ b/Makefile.system @@ -784,6 +784,14 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee +ifeq ($(OSNAME), Linux) +ifeq ($(ARCH), x86_64) +FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +ifeq ($(FLANG_VENDOR),AOCC) +FCOMMON_OPT += -fno-unroll-loops +endif +endif +endif ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -1271,11 +1279,7 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -ifeq ($(F_COMPILER),FLANG) -override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) -else override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) -endif override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = From 6876221cf340d3efb71f64e73aef5006b3bc96d6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 17:40:24 +0200 Subject: [PATCH 248/593] Remove optimization level limit for flang again and add -fno-unroll-loops for AOCC flang 2.x instead --- cmake/system.cmake | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index c2ae471d2..d8dcc3cf3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -419,10 +419,9 @@ endif () if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") if ("${F_COMPILER}" STREQUAL "FLANG") - set(FILTER_FLAGS "-O2;-O3") - foreach (FILTER_FLAG 
${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "-O1" CMAKE_Fortran_FLAGS_RELEASE ${CMAKE_Fortran_FLAGS_RELEASE}) - endforeach () +if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") +endif () endif () endif () From 72888497e2ffb6233ffd18ccf0b4d4bb01701b17 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 21:55:31 +0200 Subject: [PATCH 249/593] Update with 0.3.10 changes --- Changelog.txt | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 5f924629b..cbf0b50f5 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,77 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.10 + 14-Jun-2020 + +common: + * Improved thread locking behaviour in blas_server and parallel getrf + * Imported bugfix 394 from LAPACK (spurious reference to "XERBL" + due to overlong lines) + * Imported bugfix 403 from LAPACK (compile option "recursive" required + for correctness with Intel and PGI) + * Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB) + * Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP) + * Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that + could lead to crashes at large matrix sizes + * Restored internal soname in dynamic libraries on FreeBSD and Dragonfly + * Added API (openblas_setaffinity) to set the thread affinity on Linux + * Added initial infrastructure for half-precision floating point + (bfloat16) support with a generic implementation of SHGEMM + * Added CMAKE build system support for building the cblas_Xgemm3m + functions + * Fixed CMAKE support for building in a path with embedded spaces + * Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC + * Fixed GCC version detection in the Makefiles + * Allowed overriding the names of AR, AS and LD in Makefile builds 
+ +POWER: + * Fixed big-endian POWER8 ELFv2 builds on FreeBSD + * Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9 + * Fixed CMAKE build support for POWER9 + * fixed a potential race condition in the thread buffer allocation + * Worked around LAPACK test failures on PPC G4 + +MIPS: + * Fixed a potential race condition in the thread buffer allocation + * Added support for MIPS 24K/24KE family based on P5600 kernels + +MIPS64: + * fixed a potential race condition in the thread buffer allocation + * Added TARGET=GENERIC + +ARMV7: + * Fixed a race condition in the thread buffer allocation + +ARMV8: + * Fixed a race condition in the thread buffer allocation + * Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA + * Improved performance of the ThunderX2 DAXPY kernel + * Added an optimized SGEMM kernel for Cortex A53 + * Fixed Makefile support for INTERFACE64 (8-byte integer) + +x86_64: + * Fixed a syntax error in the CMAKE setup for SkylakeX + * Improved performance of STRSM on Haswell, SkylakeX and Ryzen + * Improved SGEMM performance on SGEMM for workloads with ldc a + multiple of 1024 + * Improved DGEMM performance on Skylake X + * Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH + builds created on SkylakeX + * Removed data alignment requirement in the SSE2 copy kernels + that could cause spurious crashes + * Added a workaround for an optimizer bug in AppleClang 11.0.3 + * Fixed LAPACK test failures due to wrong options for Intel Fortran + * Fixed compilation and LAPACK test results with recent Flang + and AMD AOCC + * Fixed DYNAMIC_ARCH builds with CMAKE on OS X + * Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max, + cblas_?sum, cblas_?gemm3m in the shared library on OS + * Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes + show the name of an older generation chip supported by the same kernels) + +IBM Z: + * Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14 + 
==================================================================== Version 0.3.9 1-Mar-2020 From 1eb197905056afa1b3e6d138d6084fb4d2b46322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 21:57:15 +0200 Subject: [PATCH 250/593] Increment version to 0.3.10.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5118475cc..9d4aa0ca6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 3b673a24b704ab37e89eaf3832971726927e45e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 21:57:52 +0200 Subject: [PATCH 251/593] Increment version to 0.3.10.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 8549e6394..2c12177ee 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.9.dev +VERSION = 0.3.10.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 1c53e1366d5441ee7fa22b77be7bea8c5eabef32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 22:04:37 +0200 Subject: [PATCH 252/593] Increment version to 0.3.10.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12621d6b8..bb5322a1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 10) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 1bd3cd66c270134d138f7b61cd158407a07086cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 22:05:19 +0200 Subject: [PATCH 253/593] Increment version to 0.3.10.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 551c094ca..2c12177ee 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.10 +VERSION = 0.3.10.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From a2d13ea61183099c05aa31e23ef59e1411d77177 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 16 Jun 2020 14:40:50 +0200 Subject: [PATCH 254/593] Fix gcc version detection for zarch Employ common variables for gcc version detection and fix the broken check for gcc >= 5.2. 
Fixes #2668 Signed-off-by: Marius Hillenbrand --- Makefile.system | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 8d78b420f..5738b14ec 100644 --- a/Makefile.system +++ b/Makefile.system @@ -282,9 +282,11 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) endif @@ -570,20 +572,27 @@ ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2")) +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3")) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif endif -ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1) +ifeq ($(ZARCH_SUPPORT_Z13), 1) DYNAMIC_CORE += Z13 else $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) endif -GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -ifeq ($(GCC_MAJOR_GE_7), 1) +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 else $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) From 
23892917667d87072eef2f18b6120f5d3c029f90 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 16 Jun 2020 14:45:09 +0200 Subject: [PATCH 255/593] Makefile.system: remove duplicate variable GCCVERSIONGT5 ... to bring unified gcc version detection with common variables to the one remaining spot in Makefile.system. Signed-off-by: Marius Hillenbrand --- Makefile.system | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 5738b14ec..63cdbccd8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -606,7 +606,6 @@ ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 endif ifeq ($(C_COMPILER), GCC) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else From cde4690721ad54043239db000a46537a9169ca02 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 16 Jun 2020 15:45:59 +0200 Subject: [PATCH 256/593] RFC: Use gcc -dumpfullversion to get minor version with gcc-7.x In gcc-7.1, the behavior of -dumpversion changed to be configured at compile-time. On some distributions it only dumps the major version (e.g., Ubuntu), so the current checks for the gcc minor version report false negatives. As a replacement, gcc-7.1 introduced -dumpfullversion which always prints the full version. Update the gcc version detection in Makefile.system to employ -dumpfullversion with gcc-7 and newer. Posting this patch for discussion, since it emerged from discussions around issue #2668 and PR #2669. It is not solving a problem right now, but may be useful in the future. 
Signed-off-by: Marius Hillenbrand --- Makefile.system | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 63cdbccd8..7e0b2757e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -286,8 +286,15 @@ GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +# Note that the behavior of -dumpversion is compile-time-configurable for +# gcc-7.x and newer. Use -dumpfullversion there +ifeq ($(GCCVERSIONGTEQ7),1) + GCCDUMPVERSION_PARAM := -dumpfullversion +else + GCCDUMPVERSION_PARAM := -dumpversion +endif +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif # From 478898b37a91836a39d046f8c70e26c6c9fc06c7 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 17 Jun 2020 16:08:48 +0200 Subject: [PATCH 257/593] cpp_thread_test/dgemv: cap concurrency to number of hw threads on small systems ... instead of (number of hw threads - 4) to avoid invalid numbers on smaller systems. Currently, systems with 4 or fewer CPUs (e.g., small CI VMs) would fail the test. 
Fixes one of the issues discussed in #2668 Signed-off-by: Marius Hillenbrand --- cpp_thread_test/dgemv_thread_safety.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp index 5411fec29..277594ff0 100644 --- a/cpp_thread_test/dgemv_thread_safety.cpp +++ b/cpp_thread_test/dgemv_thread_safety.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]){ uint32_t maxHwThreads = omp_get_max_threads(); if (maxHwThreads < 52) - numConcurrentThreads = maxHwThreads -4; + numConcurrentThreads = maxHwThreads; if (argc > 4){ std::cout<<"ERROR: too many arguments for thread safety tester"< Date: Wed, 17 Jun 2020 16:15:44 +0200 Subject: [PATCH 258/593] cpp_thread_test/dgemv: fail early if concurrency is zero The two test cases dgemv_tester and dgemm_tester accept the degree of concurrency as command line argument (amongst others). Fail early if value 0 has been specified, instead of later with less-clear symptoms. 
Signed-off-by: Marius Hillenbrand --- cpp_thread_test/cpp_thread_safety_common.h | 8 ++++++++ cpp_thread_test/dgemm_thread_safety.cpp | 2 ++ cpp_thread_test/dgemv_thread_safety.cpp | 2 ++ 3 files changed, 12 insertions(+) diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h index 60ab5bb2f..8005369a8 100644 --- a/cpp_thread_test/cpp_thread_safety_common.h +++ b/cpp_thread_test/cpp_thread_safety_common.h @@ -5,6 +5,14 @@ inline void pauser(){ std::getline(std::cin, dummy); } +void FailIfThreadsAreZero(uint32_t numConcurrentThreads) { + if(numConcurrentThreads == 0) { + std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ for(uint32_t i=0; i(randomMatSize*randomMatSize); j++){ diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index 1c5287524..104c64f2a 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -46,6 +46,8 @@ int main(int argc, char* argv[]){ std::cout<<"Number of concurrent calls into OpenBLAS : "<(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast(1024*1024)<<" MiB of RAM\n"<(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast(randomMatSize)*numConcurrentThreads*8*2))/static_cast(1024*1024)<<" MiB of RAM\n"< Date: Sat, 20 Jun 2020 00:07:43 +0800 Subject: [PATCH 259/593] AVX512 dgemm tcopy_16 function --- kernel/x86_64/KERNEL.SKYLAKEX | 5 +- kernel/x86_64/dgemm_tcopy_16_skylakex.c | 129 ++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_tcopy_16_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 65f031d03..9b8b84c30 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ 
b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,7 +14,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c new file mode 100644 index 000000000..a1da60f8f --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c @@ -0,0 +1,129 @@ +#include +#include "common.h" +#include + +int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){ + double *src1, *src2, *src3, *src4, *dst1; + __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4; + BLASLONG dim1_count, dim2_count, src_inc; + src_inc = 4 * lead_dim - dim_first; + src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim; + for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + z5 = _mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16; + z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); + _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6); + _mm512_storeu_pd(dst1+48,z7); 
_mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + z3 = _mm512_loadu_pd(src3); src3 += 8; + z4 = _mm512_loadu_pd(src4); src4 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + y3 = _mm256_loadu_pd(src3); src3 += 4; + y4 = _mm256_loadu_pd(src4); src4 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); + _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + x3 = _mm_loadu_pd(src3); src3 += 2; + x4 = _mm_loadu_pd(src4); src4 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); + _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++; + dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4; + } + src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc; + } + src_inc -= 2 * lead_dim; + for(; dim2_count>1; dim2_count-=2){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - 
dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; + dst1[0] = s1; dst1[1] = s2; + } + src1 += src_inc; src2 += src_inc; + } + src_inc -= lead_dim; + for(; dim2_count>0; dim2_count--){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; + dst1[0] = s1; + } + src1 += src_inc; + } +} From e6b92750349e273d6bb7b28673f10c39cff90c26 Mon Sep 17 00:00:00 2001 From: User User-User Date: Wed, 24 Jun 2020 09:12:23 
+0300 Subject: [PATCH 260/593] address vs2019 C4293 --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1bf0e4a6d..38eb76643 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,7 +332,7 @@ int support_avx512(){ if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } - if((ebx & (1<<31)) != 0){ + if((ebx & (1u<<31)) != 0){ xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL @@ -632,7 +632,7 @@ static gotoblas_t *get_coretype(void){ cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( (eax & 0xffff) >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) return NULL; } else From df4ade070f745d5c542067b5fd5bab3e29d39dcf Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Wed, 24 Jun 2020 04:25:47 -0500 Subject: [PATCH 261/593] Fix for #2671 --- kernel/Makefile.L3 | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0cb02ef85..86772cb22 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -483,7 +483,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ rm shgemmotcopy.s shgemmotcopy_nomacros.s @@ -498,7 +498,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o 
- > shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ rm shgemmitcopy.s shgemmitcopy_nomacros.s @@ -514,7 +514,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -530,7 +530,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s @@ -542,7 +542,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s @@ -560,7 +560,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s @@ -603,7 +603,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -UDOUBLE 
-UCOMPLEX -E $< -o cgemm_itcopy.s + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s @@ -626,7 +626,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s @@ -658,7 +658,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s @@ -670,7 +670,7 @@ ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s @@ -681,7 +681,7 @@ endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > 
dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s @@ -694,7 +694,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s @@ -704,7 +704,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s @@ -714,7 +714,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -724,7 +724,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s 
cgemm_kernel_b_nomacros.s @@ -734,7 +734,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s @@ -744,7 +744,7 @@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s @@ -754,7 +754,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s @@ -764,7 +764,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s @@ -788,7 +788,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - 
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s @@ -798,7 +798,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s @@ -808,7 +808,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s @@ -818,7 +818,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -828,7 +828,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - 
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s @@ -838,7 +838,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s @@ -848,7 +848,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s @@ -858,7 +858,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s @@ -880,7 +880,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) 
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s @@ -890,7 +890,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s @@ -900,7 +900,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s @@ -910,7 +910,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s @@ -920,7 +920,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s @@ -930,7 +930,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s @@ -940,7 +940,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s @@ -950,7 +950,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + $(CC) 
$(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s @@ -960,7 +960,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s @@ -970,7 +970,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s @@ -980,7 +980,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s @@ -990,7 +990,7 @@ endif 
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s @@ -1000,7 +1000,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s @@ -1010,7 +1010,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s @@ -1020,7 +1020,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) 
-c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s @@ -1030,7 +1030,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s @@ -1050,7 +1050,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -1184,7 +1184,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s @@ -2460,7 +2460,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM 
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -2506,7 +2506,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s From 571eadb88063c91ea9b5b1bcb2ae33cd8fbc5762 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 24 Jun 2020 14:48:15 -0500 Subject: [PATCH 262/593] powerpc: Optimized SGEMM/DGEMM/CGEMM for POWER10 This patch introduces new optimized version of SGEMM, CGEMM and DGEMM using power10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. This patch makes use of new POWER10 compute instructions for matrix multiplication operation. Tested on simulator and there are no new test failures. Cycles count reduced by 30-50% compared to POWER9 version depending on M/N/K sizes. 
MMA GCC patch for reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8ee2640bfdc62f835ec9740278f948034bc7d9f1 --- kernel/power/KERNEL.POWER10 | 12 +- kernel/power/cgemm_kernel_power10.S | 286 +++ kernel/power/cgemm_logic_power10.S | 2814 +++++++++++++++++++++++++++ kernel/power/cgemm_macros_power10.S | 2131 ++++++++++++++++++++ kernel/power/dgemm_kernel_power10.c | 864 ++++++++ kernel/power/sgemm_kernel_power10.c | 1334 +++++++++++++ 6 files changed, 7435 insertions(+), 6 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power10.S create mode 100644 kernel/power/cgemm_logic_power10.S create mode 100644 kernel/power/cgemm_macros_power10.S create mode 100644 kernel/power/dgemm_kernel_power10.c create mode 100644 kernel/power/sgemm_kernel_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ab8fbfcd9..00d31f8b6 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,12 +7,12 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = sgemm_kernel_power9.S -DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = cgemm_kernel_power9.S +STRMMKERNEL = sgemm_kernel_power10.c +DTRMMKERNEL = dgemm_kernel_power10.c +CTRMMKERNEL = cgemm_kernel_power10.S ZTRMMKERNEL = zgemm_kernel_power9.S -SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c @@ -22,7 +22,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S @@ -32,7 +32,7 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMKERNEL = cgemm_kernel_power10.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S new file mode 100644 index 000000000..e04f948dd --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs51 +#define alpha_i vs55 +#define save_permute_1 vs59 +#define permute_mask vs63 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power10.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 
144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi 
permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power10.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S new file mode 100644 index 000000000..3700ac87b --- /dev/null +++ b/kernel/power/cgemm_logic_power10.S @@ -0,0 +1,2814 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + 
KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 
128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 
64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + 
+CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */ + vspltisb v24, -1 + vspltisb v25, 0 + xxsldwi vs57, vs56, vs57, 1 + xxpermdi vs57, vs57, vs57, 3 + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. 
T8, T1, 5 /* T8 = floor((T6-2) / 32): a quotient, not a remainder */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((K-2) / 32): a quotient, not a remainder */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/* reached when the ble after srawi. is taken (T8 <= 0, assuming ZERO4x1 preserves CR0 - confirm); special-cases a count of exactly 33 */ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/* not the 33 case: test for exactly 32; anything else goes to the bitwise remainder chain */ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/* remainder chain: each step below runs only when the tested bit of L is set */ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/* end of one pass over M: T1 = K * 32 is the amount added to B below; J is decremented for the bgt back to CGEMM_L4_BEGIN */ + slwi T1, K, 5 + addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + 
KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + 
KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 
32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + 
KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/* mini subroutine, entered with bl from CGEMM_L2x1_SUB2_8 and left with blr */ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/* N & 2 section: skipped entirely when bit value 2 of N is clear */ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/* CO = current C, T1 = 2 * LDC, T2 = C + LDC (used only by the dcbt below), C advanced by T1 */ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 /* I = floor(M / 8); no 8-wide pass when I <= 0 */ + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 = floor((T6-2) / 128): a quotient, not a remainder */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 = floor((K-2) / 128): a quotient, not a remainder */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/* reached when the ble after srawi. is taken (T8 <= 0, assuming ZERO2x8 preserves CR0 - confirm) */ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. 
T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/* entered only when (M & 7) != 0 and bit value 4 of M is set; otherwise jumps to CGEMM_L2x1_END / CGEMM_L2x4_END */ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((T6-2) / 32): a quotient, not a remainder */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((K-2) / 32): a quotient, not a remainder */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/* reached when the ble after srawi. is taken (T8 <= 0, assuming ZERO2x4 preserves CR0 - confirm); special-cases a count of exactly 33 */ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/* not the 33 case: test for exactly 32; anything else goes to the bitwise remainder chain */ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/* remainder chain: each step below runs only when the tested bit of L is set */ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/* remainder step: executed only when bit value 1 of L is set */ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/* entered only when bit value 2 of M is set; otherwise jumps to CGEMM_L2x2_END */ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((T6-2) / 32): a quotient, not a remainder */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((K-2) / 32): a quotient, not a remainder */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/* reached when the ble after srawi. is taken (T8 <= 0, assuming ZERO2x2 preserves CR0 - confirm); special-cases a count of exactly 33 */ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/* not the 33 case: test for exactly 32; anything else goes to the bitwise remainder chain */ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/* remainder chain: each step below runs only when the tested bit of L is set */ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/* remainder step: executed only when bit value 2 of L is set */ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/* remainder step: executed only when bit value 1 of L is set */ + andi. T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/* entered only when bit value 1 of M is set; otherwise jumps to CGEMM_L2x1_END */ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((T6-2) / 32): a quotient, not a remainder */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = floor((K-2) / 32): a quotient, not a remainder */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/* reached when the ble after srawi. is taken (T8 <= 0, assuming ZERO2x1 preserves CR0 - confirm); special-cases a count of exactly 33 */ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/* not the 33 case: test for exactly 32; anything else goes to the bitwise remainder chain */ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/* remainder chain: each step below runs only when the tested bit of L is set */ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + 
KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 
128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN 
+CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: 
+/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/* secondary entry point: CGEMM_L1x1_SUB0-style callers bl here to skip the first KERNEL1x1_L2 of the pass */ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/* CTR exhausted: finish with END1x1_2 and return to the bl caller */ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/* mini subroutine, entered with bl and left with blr */ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/* mini subroutine, entered with bl and left with blr */ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/* N & 1 section: skipped entirely when bit value 1 of N is clear */ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/* CO = current C, T2 = C + LDC (used only by the dcbt below), C advanced by one column */ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, LDC /* was "add C, C, T1": T1 is never assigned on this path, so it held a stale value from an earlier block */ +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 /* I = floor(M / 8); no 8-wide pass when I <= 0 */ + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /* T8 = floor((T6-2) / 128): a quotient, not a remainder */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 = floor((K-2) / 128): a quotient, not a remainder */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/* reached when the ble after srawi. is taken (T8 <= 0, assuming ZERO1x8 preserves CR0 - confirm); special-cases a count of exactly 129 */ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/* not the 129 case: test for exactly 128; anything else goes to the bitwise remainder chain */ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/* remainder chain: each step below runs only when the tested bit of L is set */ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S new file mode 100644 index 000000000..b66e93405 --- /dev/null +++ b/kernel/power/cgemm_macros_power10.S @@ -0,0 +1,2131 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+/* Displacement helpers: DISPn(ind, disp) expands to ind*unit_size*n+disp. */
+#define unit_size 8 /* scale applied by every DISPn helper; presumably the byte size of one single-precision complex element - TODO confirm */
+#define DISP32(ind, disp) (ind*unit_size*32+disp) /* ind*256+disp. NOTE(review): ind and disp are pasted unparenthesized, so pass simple terms only */
+#define DISP16(ind, disp) (ind*unit_size*16+disp) /* ind*128+disp */
+#define DISP8(ind, disp) (ind*unit_size*8+disp) /* ind*64+disp */
+#define DISP4(ind, disp) (ind*unit_size*4+disp) /* ind*32+disp */
+#define DISP2(ind, disp) (ind*unit_size*2+disp) /* ind*16+disp */
+#define DISP1(ind, disp) (ind*unit_size+disp) /* ind*8+disp */
+#define DISPX(disp) (disp) /* plain displacement, no index scaling */
+/* AGGREGATE_REALS_IMAGES: folds VSINR into VSINR_OUT1 and VSINI into VSINI_OUT2 in place; the NN/CN/NC/... build macro picks add vs. subtract and the subtract operand order. */
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR /* OUT1 = OUT1 - VSINR */
+    xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI /* OUT2 = OUT2 + VSINI */
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+    xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR /* OUT1 = OUT1 + VSINR */
+    xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI /* OUT2 = OUT2 - VSINI */
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+    xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR /* OUT1 = OUT1 + VSINR */
+    xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 /* OUT2 = VSINI - OUT2 (operands reversed) */
+#else // CC || CR || RC || RR
+    /* this branch assumes alpha is supplied as {-alpha_r,-alpha_i} */
+    /* the result here is i1i2-r1r2, so alpha real is negated instead to fix the sign */
+    xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 /* OUT1 = VSINR - OUT1 (operands reversed) */
+    /* alpha imaginary is negated instead to fix the sign */
+    xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI /* OUT2 = OUT2 + VSINI */
+#endif
+.endm
+/* AGGREGATE_REALS_IMAGES_A_PERMUTE: same as AGGREGATE_REALS_IMAGES except that the imaginary subtract order of the CN/CT/RN/RT and NC/TC/NR/TR branches is exchanged. */
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR /* OUT1 = OUT1 - VSINR */
+    xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI /* OUT2 = OUT2 + VSINI */
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+    xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR /* OUT1 = OUT1 + VSINR */
+    xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 /* OUT2 = VSINI - OUT2 (operands reversed) */
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+    xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR /* OUT1 = OUT1 + VSINR */
+    xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI /* OUT2 = OUT2 - VSINI */
+#else // CC || CR || RC || RR
+    /* this branch assumes alpha is supplied as {-alpha_r,-alpha_i} */
+    /* the result here is i1i2-r1r2, so alpha real is negated instead to fix the sign */
+    xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 /* OUT1 = VSINR - OUT1 (operands reversed) */
+    /* alpha imaginary is negated instead to fix the sign */
+    xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI /* OUT2 = OUT2 + VSINI */
+#endif
+.endm
+
+/* PART1 result: VSOUT1 = {i0,i1} * {alpha_i,alpha_i} (later subtracted) ; VSOUT2 = {r0,r1} * {alpha_i,alpha_i} (later added to) */
+/* MULT_APLHA_PART1 overwrites both outputs; alpha_i is a name defined outside this chunk. */
+.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2
+    xvmulsp \VSOUT1, \VSINII, alpha_i /* VSOUT1 = VSINII * alpha_i */
+    xvmulsp \VSOUT2, \VSINRR, alpha_i /* VSOUT2 = VSINRR * alpha_i */
+.endm
+
+/* PART2 result: VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+/* Run after PART1 on the same operands: VSOUT1 ends as VSINRR*alpha_r - VSINII*alpha_i and VSOUT2 as VSINII*alpha_r + VSINRR*alpha_i. */
+.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2
+    xvmsubasp \VSOUT1, \VSINRR, alpha_r /* VSOUT1 = VSINRR * alpha_r - VSOUT1 */
+    xvmaddasp \VSOUT2, \VSINII, alpha_r /* VSOUT2 = VSINII * alpha_r + VSOUT2 */
+.endm
+/* PERMUTE1..4 build OUT from four source registers: two xxsel under mask vs57 (set up outside this chunk), then one xxpermdi doubleword merge; vs62 is clobbered as scratch. */
+.macro PERMUTE1 OUT, R1, R2, R3, R4
+    xxsel vs62, \R1, \R2, vs57 /* scratch = select(R1, R2) */
+    xxsel \OUT, \R3, \R4, vs57 /* OUT = select(R3, R4) */
+    xxpermdi \OUT, \OUT, vs62, 1 /* merge doublewords of OUT and scratch, selector 1 */
+.endm
+.macro PERMUTE2 OUT, R1, R2, R3, R4 /* like PERMUTE1 with swapped xxsel/xxpermdi operands, plus a final xxperm through permute_mask */
+    xxsel vs62, \R2, \R1, vs57
+    xxsel \OUT, \R4, \R3, vs57
+    xxpermdi \OUT, vs62, \OUT, 1
+    xxperm \OUT, \OUT, permute_mask
+.endm
+.macro PERMUTE3 OUT, R1, R2, R3, R4 /* same selects as PERMUTE1; xxpermdi takes scratch first and uses selector 2 */
+    xxsel vs62, \R1, \R2, vs57
+    xxsel \OUT, \R3, \R4, vs57
+    xxpermdi \OUT, vs62, \OUT, 2
+.endm
+.macro PERMUTE4 OUT, R1, R2, R3, R4 /* same selects as PERMUTE2; xxpermdi selector 2, then xxperm through permute_mask */
+    xxsel vs62, \R2, \R1, vs57
+    xxsel \OUT, \R4, \R3, vs57
+    xxpermdi \OUT, \OUT, vs62, 2
+    xxperm \OUT, \OUT, permute_mask
+.endm
+.macro GROUP1 /* xxperm of vs32,vs40,vs33,vs41,vs36,vs44,vs37,vs45 under permute_mask into vs0,vs4,vs1,vs5,vs8,vs12,vs9,vs13 */
+    xxperm vs0, vs32, permute_mask
+    xxperm vs4, vs40, permute_mask
+    xxperm vs1, vs33, permute_mask
+    xxperm vs5, vs41, permute_mask
+    xxperm vs8, vs36, permute_mask
+    xxperm vs12, vs44, permute_mask
+    xxperm vs9, vs37, permute_mask
+    xxperm vs13, vs45, permute_mask
+.endm
+.macro AGG_GROUP1 /* folds each GROUP1 permuted copy back into the register it was made from */
+    AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
+    AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
+    AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12
+    AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13
+.endm
+.macro GROUP2 /* same as GROUP1 for sources vs34,vs42,vs35,vs43,vs38,vs46,vs39,vs47; reuses the GROUP1 destinations, so it overwrites the GROUP1 results */
+    xxperm vs0, vs34, permute_mask
+    xxperm vs4, vs42, permute_mask
+    xxperm vs1, vs35, permute_mask
+    xxperm vs5, vs43, permute_mask
+    xxperm vs8, vs38, permute_mask
+    xxperm vs12, vs46, permute_mask
+    xxperm vs9, vs39, permute_mask
+    xxperm vs13, vs47, permute_mask
+.endm
+.macro AGG_GROUP2 /* folds each GROUP2 permuted copy back into the register it was made from */
+    AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4
+    AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5
+    AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12
+    AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13
+.endm
+.macro MULTIPLY_GROUP1 /* alpha scaling of pairs (vs32,vs40) (vs33,vs41) (vs36,vs44) (vs37,vs45) into (vs0,vs1) (vs2,vs3) (vs8,vs9) (vs10,vs11) */
+    MULT_APLHA_PART1 vs32, vs40, vs0, vs1
+    MULT_APLHA_PART1 vs33, vs41, vs2, vs3
+    MULT_APLHA_PART1 vs36, vs44, vs8, vs9
+    MULT_APLHA_PART1 vs37, vs45, vs10, vs11
+    MULT_APLHA_PART2 vs32, vs40, vs0, vs1
+    MULT_APLHA_PART2 vs33, vs41, vs2, vs3
+    MULT_APLHA_PART2 vs36, vs44, vs8, vs9
+    MULT_APLHA_PART2 vs37, vs45, vs10, vs11
+.endm
+.macro MULTIPLY_GROUP2 /* alpha scaling of pairs (vs34,vs42) (vs35,vs43) (vs38,vs46) (vs39,vs47) into (vs4,vs5) (vs6,vs7) (vs12,vs13) (vs14,vs15) */
+    MULT_APLHA_PART1 vs34, vs42, vs4, vs5
+    MULT_APLHA_PART1 vs35, vs43, vs6, vs7
+    MULT_APLHA_PART1 vs38, vs46, vs12, vs13
+    MULT_APLHA_PART1 vs39, vs47, vs14, vs15
+    MULT_APLHA_PART2 vs34, vs42, vs4, vs5
+    MULT_APLHA_PART2 vs35, vs43, vs6, vs7
+    MULT_APLHA_PART2 vs38, vs46, vs12, vs13
+    MULT_APLHA_PART2 vs39, vs47, vs14, vs15
+.endm
+/* reconstruct r, i pairs: xxperm merges each (even, odd) output pair of MULTIPLY_GROUPn into the even register under save_permute_1 */
+.macro RECONSTRUCT_PAIR1
+    xxperm vs0, vs1, save_permute_1
+    xxperm vs2, vs3, save_permute_1
+    xxperm vs8, vs9, save_permute_1
+    xxperm vs10, vs11, save_permute_1
+.endm
+.macro RECONSTRUCT_PAIR2 /* same merge for the MULTIPLY_GROUP2 outputs */
+    xxperm vs4, vs5, save_permute_1
+    xxperm vs6, vs7, save_permute_1
+    xxperm vs12, vs13, save_permute_1
+    xxperm vs14, vs15, save_permute_1
+.endm
+.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 /* moves accumulator ACC out with xxmfacc, then builds O1..O4 from R0..R3; the SAVE macros pass R0..R3 = vs(4*ACC)..vs(4*ACC+3) */
+    xxmfacc \ACC
+    PERMUTE1 \O1, \R3, \R2, \R1, \R0
+    PERMUTE2 \O2, \R1, \R0, \R3, \R2
+    PERMUTE3 \O3, \R1, \R0, \R3, \R2
+    PERMUTE4 \O4, \R3, \R2, \R1, \R0
+.endm
+/* macros for N=4 and M=8
+**********************************************************************************************/
+.macro ZERO4x8 /* zeroes accumulators 0..7 */
+    xxsetaccz 0
+    xxsetaccz 1
+    xxsetaccz 2
+    xxsetaccz 3
+    xxsetaccz 4
+    xxsetaccz 5
+    xxsetaccz 6
+    xxsetaccz 7
+.endm
+/* LOAD4x8: LOAD4x8O with both offsets zero. */
+.macro LOAD4x8
+    LOAD4x8O 0, 0
+.endm
+/* LOAD4x8O: loads one register pair from BO+OffsetB and two pairs from AO+OffsetA and AO+OffsetA+32; AO and BO are not advanced. */
+.macro LOAD4x8O OffsetA, OffsetB
+    lxvp vs34, (\OffsetB+0)(BO)
+    lxvp vs32, (\OffsetA+0)(AO)
+    lxvp vs36, (\OffsetA+32)(AO)
+.endm
+/* END4x8_NORMAL: END4x8 that advances AO by 64 and BO by 32 before accumulating. */
+.macro END4x8_NORMAL
+    END4x8 AO, BO, 64, 32
+.endm
+/* END4x8_WITHOUT_ADD: END4x8 with zero offsets, so AO and BO stay where they are. */
+.macro END4x8_WITHOUT_ADD
+    END4x8 AO, BO, 0, 0
+.endm
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB /* advances BREG/AREG by the non-zero offsets, then runs one 8-accumulator update from vs32/vs33/vs36/vs37 and vs34/vs35 */
+.if \OffsetB != 0 /* skip the pointer bump entirely when the offset is 0 */
+    addi \BREG, \BREG, \OffsetB
+.endif
+.if \OffsetA != 0
+    addi \AREG, \AREG, \OffsetA
+.endif
+    xvf32gerpp 3, 36, 35 /* accumulators 0-3 pair the four A registers with vs35 */
+    xvf32gerpp 2, 37, 35
+    xvf32gerpp 1, 32, 35
+    xvf32gerpp 0, 33, 35
+    xvf32gerpp 7, 36, 34 /* accumulators 4-7 pair the same A registers with vs34 */
+    xvf32gerpp 6, 37, 34
+    xvf32gerpp 5, 32, 34
+    xvf32gerpp 4, 33, 34
+.endm
+/* LOAD4x8_2: LOAD4x8_2O with both offsets zero. */
+.macro LOAD4x8_2
+    LOAD4x8_2O 0, 0
+.endm
+/* LOAD4x8_2O: loads the operands of two consecutive steps: B pairs vs34 and vs38 from BO, A pairs vs32, vs36, vs40, vs42 from AO; no pointer update. */
+.macro LOAD4x8_2O OffsetA, OffsetB
+    lxvp vs34, (\OffsetB)(BO)
+    lxvp vs38, (32+\OffsetB)(BO)
+    lxvp vs32, (0+\OffsetA)(AO)
+    lxvp vs36, (32+\OffsetA)(AO)
+    lxvp vs40, (64+\OffsetA)(AO)
+    lxvp vs42, (64+32+\OffsetA)(AO)
+.endm
+/* END4x8_2: final double step; no reload (Complete=1), and AO/BO advance by 128/64 (IsLast=1). */
+.macro END4x8_2
+    /* for load2 the offsets are 128 (A) and 64 (B) */
+    KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1
+.endm
+/* KERNEL4x8_E2: double step on AO/BO with Complete=1, i.e. without loading operands for a following step. */
+.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast
+    KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
+.endm
+/* KERNEL4x8_L2: double step on AO/BO with Complete=0, i.e. reloading operands for the following step. */
+.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast
+    KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
+.endm
+/* KERNEL4x8_2: two accumulate steps on the registers filled by LOAD4x8_2O. Complete==0 reloads each operand set right after it is consumed; IsLast==1 advances AREG/BREG at the end. */
+.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
+    xvf32gerpp 3, 36, 35 /* first step: A in vs32/vs33/vs36/vs37, B in vs34/vs35 */
+    xvf32gerpp 2, 37, 35
+    xvf32gerpp 1, 32, 35
+    xvf32gerpp 0, 33, 35
+    xvf32gerpp 7, 36, 34
+    xvf32gerpp 6, 37, 34
+    xvf32gerpp 5, 32, 34
+    xvf32gerpp 4, 33, 34
+.if \Complete==0 /* refill the first-step registers; displacements are Index*64+OffsetB (B) and Index*128+OffsetA (A) */
+    lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
+    lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
+    lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
+.endif
+    xvf32gerpp 3, 42, 39 /* second step: A in vs40..vs43, B in vs38/vs39 */
+    xvf32gerpp 2, 43, 39
+    xvf32gerpp 1, 40, 39
+    xvf32gerpp 0, 41, 39
+    xvf32gerpp 7, 42, 38
+    xvf32gerpp 6, 43, 38
+    xvf32gerpp 5, 40, 38
+    xvf32gerpp 4, 41, 38
+.if \Complete==0 /* refill the second-step registers */
+    lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
+    lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
+    lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG)
+.endif
+.if \IsLast==1 /* pointer advance happens only on the last step of an unrolled run */
+.if \Complete==1 /* nothing was preloaded: advance exactly to the given offsets */
+    addi \BREG, \BREG, DISP8(\Index, \OffsetB)
+    addi \AREG, \AREG, DISP16(\Index, \OffsetA)
+.else /* operands were preloaded: advance by a fixed 64 (B) / 128 (A) past the Index base, ignoring OffsetA/OffsetB */
+    addi \BREG, \BREG, DISP8(\Index, 64)
+    addi \AREG, \AREG, DISP16(\Index, 128)
+.endif
+.endif
+.endm
+
+.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64, 32 +.endm + +.macro SAVE4x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 + SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 + SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 + SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + xxperm vs10, vs38, permute_mask + xxperm vs14, vs46, permute_mask + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + xxperm vs11, vs39, permute_mask + xxperm vs15, vs47, permute_mask + xxperm vs0, vs48, permute_mask + xxperm vs4, vs56, permute_mask + xxperm vs1, vs49, permute_mask + xxperm vs5, vs16, permute_mask + AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 + xxperm vs2, vs50, permute_mask + xxperm vs6, vs58, permute_mask + AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 + xxperm vs3, vs17, permute_mask + xxperm vs7, vs19, permute_mask + AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 + xxperm vs8, vs52, permute_mask + xxperm vs12, vs60, permute_mask + AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 + xxperm vs9, vs53, permute_mask + xxperm vs13, vs61, permute_mask + AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 + xxperm vs10, vs54, permute_mask + xxperm vs14, vs21, permute_mask + AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 + xxperm vs11, vs18, 
permute_mask + xxperm vs15, vs20, permute_mask + AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 + AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 +/*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +#ifndef TRMMKERNEL + lxvp vs32, 0(T2) +#endif + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs40, 32(T2) +#endif + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 +#ifndef TRMMKERNEL + lxvp vs34, 0(T3) +#endif + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs42, 32(T3) +#endif + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + MULT_APLHA_PART1 vs48, vs56, vs0, vs1 + MULT_APLHA_PART1 vs49, vs16, vs2, vs3 + stxvp vs26, 
32(CO) + MULT_APLHA_PART1 vs50, vs58, vs4, vs5 + MULT_APLHA_PART1 vs17, vs19, vs6, vs7 + stxvp vs28, 0(T1) + MULT_APLHA_PART2 vs48, vs56, vs0, vs1 + MULT_APLHA_PART2 vs49, vs16, vs2, vs3 + stxvp vs30, 32(T1) + MULT_APLHA_PART2 vs50, vs58, vs4, vs5 + MULT_APLHA_PART2 vs17, vs19, vs6, vs7 + MULT_APLHA_PART1 vs52, vs60, vs8, vs9 + MULT_APLHA_PART1 vs53, vs61, vs10, vs11 + MULT_APLHA_PART1 vs54, vs21, vs12, vs13 + MULT_APLHA_PART1 vs18, vs20, vs14, vs15 + MULT_APLHA_PART2 vs52, vs60, vs8, vs9 + MULT_APLHA_PART2 vs53, vs61, vs10, vs11 + MULT_APLHA_PART2 vs54, vs21, vs12, vs13 + MULT_APLHA_PART2 vs18, vs20, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs32, vs32, vs3 + xvaddsp vs33, vs33, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs40, vs40, vs7 + xvaddsp vs41, vs41, vs5 + xvaddsp vs34, vs34, vs11 + xvaddsp vs35, vs35, vs9 + xvaddsp vs42, vs42, vs15 + xvaddsp vs43, vs43, vs13 +#else + xxpermdi vs33, vs8, vs0, 2 + xxpermdi vs32, vs10, vs2, 2 + xxpermdi vs41, vs12, vs4, 2 + xxpermdi vs40, vs14, vs6, 2 + xxpermdi vs35, vs0, vs8, 2 + xxpermdi vs34, vs2, vs10, 2 + xxpermdi vs43, vs4, vs12, 2 + xxpermdi vs42, vs6, vs14, 2 +#endif + stxvp vs32, 0(T2) + stxvp vs40, 32(T2) + stxvp vs34, 0(T3) + stxvp vs42, 32(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro ZERO4x4 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD4x4 + LOAD4x4O 0, 0 +.endm + +.macro LOAD4x4O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END4x4_NORMAL + END4x4 AO, BO, 32, 32 +.endm + +.macro END4x4_WITHOUT_ADD + END4x4 AO, BO, 0, 0 +.endm + +.macro END4x4 AREG, BREG, OffsetA, 
OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.endm + +.macro LOAD4x4_2 + LOAD4x4_2O 0, 0 +.endm + +.macro LOAD4x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 +.endm + +.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 36, 38 + xvf32gerpp 2, 37, 38 + xvf32gerpp 1, 36, 39 + xvf32gerpp 0, 37, 39 +.if \Complete==0 + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32, 32 +.endm + +.macro SAVE4x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif 
+ #ifndef TRMMKERNEL + lxvp vs28, 0(T2) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 0(T3) +#endif + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 + xvaddsp vs28, vs28, vs7 + xvaddsp vs29, vs29, vs5 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 + xxpermdi vs29, vs12, vs4, 2 + xxpermdi vs28, vs14, vs6, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + stxvp vs28, 0(T2) + stxvp vs30, 0(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x2 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x2 + LOAD4x2O 0, 0 +.endm + +.macro LOAD4x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x2_NORMAL + END4x2 AO, BO, 16, 32 +.endm + +.macro END4x2_WITHOUT_ADD + END4x2 AO, BO, 0, 0 +.endm + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 1, 34, 32 + xvf32gerpp 0, 35, 32 +.endm + +.macro LOAD4x2_2 + LOAD4x2_2O 0, 0 +.endm + +.macro LOAD4x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x2_2 + /*for load2 offset will be 32 and 
64*/ + KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 +.endm + +.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 1, 34, 33 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 1, 36, 32 + xvf32gerpp 0, 37, 32 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16, 32 +.endm + +.macro SAVE4x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25, 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27, 0(T3) +#endif + GROUP1 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs10, vs2, 0 + xxpermdi vs3, vs0, vs8, 3 + xxpermdi vs11, vs2, vs10, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 + xvaddsp vs25, vs25, vs3 + xvaddsp vs27, vs27, vs11 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs10, vs2, 0 + 
xxpermdi vs25, vs0, vs8, 3 + xxpermdi vs27, vs2, vs10, 3 +#endif + stxv vs24, 0(CO) + stxv vs25, 0(T1) + stxv vs26, 0(T2) + stxv vs27, 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=1 +**********************************************************************************************/ + +.macro ZERO4x1 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x1 + LOAD4x1O 0, 0 +.endm + +.macro LOAD4x1O OffsetA, OffsetB + lxsd v0, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x1_NORMAL + END4x1 AO, BO,8, 32 +.endm + +.macro END4x1_WITHOUT_ADD + END4x1 AO, BO, 0, 0 +.endm + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD4x1_2 + LOAD4x1_2O 0, 0 +.endm + +.macro LOAD4x1_2O OffsetA, OffsetB + lxv vs32, (\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 +.endm + +.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 0, 37, 33 + xvf32gerpp 1, 36, 33 +.if \Complete==0 + lxv vs32, DISP2(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, 
DISP2(\Index, 16) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8, 32 +.endm + +.macro SAVE4x1 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 1 + xxpermdi vs40, vs40, vs44, 1 + xxpermdi vs33, vs33, vs37, 1 + xxpermdi vs41, vs41, vs45, 1 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6, 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7, 0(T3) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + xxspltd vs9, vs2, 0 + xxspltd vs11, vs2, 1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 + xvaddsp vs38, vs38, vs9 + xvaddsp vs39, vs39, vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 + xxspltd vs38, vs2, 0 + xxspltd vs39, vs2, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + stxsd v6, 0(T2) + stxsd v7, 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro ZERO2x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD2x8 + LOAD2x8O 0, 0 +.endm + +.macro LOAD2x8O OffsetA, OffsetB + 
lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END2x8_NORMAL + END2x8 AO, BO, 64, 16 +.endm + +.macro END2x8_WITHOUT_ADD + END2x8 AO, BO, 0, 0 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x8_2 + LOAD2x8_2O 0, 0 +.endm + +.macro LOAD2x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 +.endm + +.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 2, 37, 35 + xvf32gerpp 3, 36, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 + +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 41, 34 + xvf32gerpp 3, 40, 34 + xvf32gerpp 0, 39, 34 + xvf32gerpp 1, 38, 34 + +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64, 16 +.endm + +.macro SAVE2x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + 
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) + stxvp vs28, 0(T1) + stxvp vs30, 32(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro ZERO2x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD2x4 + LOAD2x4O 0, 0 +.endm + +.macro LOAD2x4O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END2x4_NORMAL + END2x4 AO, BO, 32, 16 +.endm + +.macro END2x4_WITHOUT_ADD + END2x4 AO, BO, 0, 0 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA 
!= 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x4_2 + LOAD2x4_2O 0, 0 +.endm + +.macro LOAD2x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 +.endm + +.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 37, 34 + xvf32gerpp 1, 36, 34 +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32, 16 +.endm + +.macro SAVE2x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + GROUP1 + AGG_GROUP1 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 
+ xxpermdi vs26, vs2, vs10, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro ZERO2x2 + xxsetaccz 0 +.endm + +.macro LOAD2x2 + LOAD2x2O 0, 0 +.endm + +.macro LOAD2x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxv vs34, (\OffsetB+0)(BO) +.endm + +.macro END2x2_NORMAL + END2x2 AO, BO, 16, 16 +.endm + +.macro END2x2_WITHOUT_ADD + END2x2 AO, BO, 0, 0 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 32 +.endm + +.macro LOAD2x2_2 + LOAD2x2_2O 0, 0 +.endm + +.macro LOAD2x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) +.endm + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 +.endm + +.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +/* KERNEL2x2_2: two xvf32gerpp updates of accumulator 0, using the vs32/vs33 pair loaded through AREG and the vs34/vs35 pair loaded through BREG. */ +/* Complete==0 reloads both register pairs for the following step; IsLast==1 advances AREG and BREG (by the given offsets when Complete==1, by 32/32 otherwise). */ +.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 32 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + /* the B reload is displaced by OffsetB (it previously reused OffsetA, which only worked because callers pass equal offsets) */ + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16, 16 +.endm + +.macro SAVE2x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + 
xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs8, vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs0, vs8, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs0, vs8, 3 +#endif + stxv vs24, 0(CO) + stxv vs26, 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD2x1 + LOAD2x1O 0, 0 +.endm + +.macro LOAD2x1O OffsetA, OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_NORMAL + END2x1 AO, BO,8, 16 +.endm + +.macro END2x1_WITHOUT_ADD + END2x1 AO, BO, 0, 0 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD2x1_2 + LOAD2x1_2O 0, 0 +.endm + +.macro LOAD2x1_2O OffsetA, OffsetB + lxv vs27, (\OffsetA)(AO) + lxvp vs4, (0+\OffsetB)(BO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 +.endm + +.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x1_L2 
OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetA)(\AREG) + xxspltd vs8, vs27, 1 +.endif +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8, 16 +.endm + +.macro SAVE2x1 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro ZERO1x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD1x8 + LOAD1x8O 0, 0 +.endm + +.macro LOAD1x8O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp 
vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END1x8_NORMAL + END1x8 AO, BO, 64,8 +.endm + +.macro END1x8_WITHOUT_ADD + END1x8 AO, BO, 0, 0 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.endm + +.macro LOAD1x8_2 + LOAD1x8_2O 0, 0 +.endm + +.macro LOAD1x8_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + vspltisb v10, 0 + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 +.endm + +.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.if \Complete==0 + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 39 + xvf32gerpp 1, 35, 38 +.if \Complete==0 + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 35, 41 + xvf32gerpp 3, 35, 40 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif 
+.endm + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + +.macro SAVE1x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs34, vs34, vs38, 0 + xxpermdi vs35, vs35, vs39, 0 + xxpermdi vs40, vs40, vs44, 0 + xxperm vs40, vs40, permute_mask + xxpermdi vs41, vs41, vs45, 0 + xxperm vs41, vs41, permute_mask + xxpermdi vs42, vs42, vs46, 0 + xxperm vs42, vs42, permute_mask + xxpermdi vs43, vs43, vs47, 0 + xxperm vs43, vs43, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 + xxperm vs4, vs5, vs28 + xxperm vs6, vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + xvaddsp vs26, vs26, vs6 + xvaddsp vs27, vs27, vs4 + stxvp vs24, 0(CO) + 
stxvp vs26, 32(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 +**********************************************************************************************/ + +.macro ZERO1x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD1x4 + LOAD1x4O 0, 0 +.endm + +.macro LOAD1x4O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END1x4_NORMAL + END1x4 AO, BO, 32,8 +.endm + +.macro END1x4_WITHOUT_ADD + END1x4 AO, BO, 0, 0 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD1x4_2 + LOAD1x4_2O 0, 0 +.endm + +.macro LOAD1x4_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 +.endm + +.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 37 + xvf32gerpp 1, 35, 36 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, 
DISP2(\Index, 16) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + +.macro SAVE1x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs40, vs40, vs44, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs41, vs41, vs45, 0 + xxperm vs40, vs40, permute_mask + xxperm vs41, vs41, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + stxvp vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro ZERO1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x2 + LOAD1x2O 0, 0 +.endm + +.macro LOAD1x2O OffsetA, OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_NORMAL + END1x2 AO, BO, 16,8 +.endm + +.macro END1x2_WITHOUT_ADD + END1x2 AO, BO, 0, 0 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + 
xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD1x2_2 + LOAD1x2_2O 0, 0 +.endm + +.macro LOAD1x2_2O OffsetA, OffsetB + lxv vs27, (\OffsetB)(BO) + lxvp vs4, (0+\OffsetA)(AO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 +.endm + +.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 + +.if \Complete==0 + xxspltd vs8, vs27, 1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP4(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs0 + stxv vs24, 0(CO) +#else +/* 
reconstruct r, i pairs*/ + stxv vs0, 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro ZERO1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x1 + LOAD1x1O 0, 0 +.endm + +.macro LOAD1x1O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + +.macro END1x1_NORMAL + END1x1 AO, BO,8,8 +.endm + +.macro END1x1_WITHOUT_ADD + END1x1 AO, BO, 0, 0 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs37, vs36 + xvmaddasp vs40, vs37, vs38 +.endm + +.macro LOAD1x1_2 + LOAD1x1_2O 0, 0 +.endm + +.macro LOAD1x1_2O OffsetA, OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 +.endm + +.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +/* KERNEL1x1_2: multiply-adds vs4 (loaded through AREG) with vs8 (loaded through BREG) into vs32, and with vs10 (vs8 permuted by permute_mask) into vs40. */ +/* Complete==0 reloads vs8/vs4 and rebuilds vs10 for the following step; IsLast==1 advances BREG and AREG (by the given offsets when Complete==1, by 16/16 otherwise). */ +.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs4, vs8 + xvmaddasp vs40, vs4, vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index, \OffsetB)(\BREG) + /* the A reload is displaced by OffsetA (it previously reused OffsetB, which only worked because callers pass equal offsets) */ + lxv vs4, DISP2(\Index, \OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP2(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP2(\Index, 16) +.endif +.endif +.endm + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33, vs32, vs32, 2 + 
xxpermdi vs41, vs40, vs40, 2 + xvaddsp vs32, vs32, vs33 + xvaddsp vs40, vs40, vs41 + + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs37, vs1 + MULT_APLHA_PART2 vs32, vs40, vs37, vs1 +/* reconstruct r, i pairs*/ + xxperm vs37, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36, vs36, vs37 + stxsd v4, 0(CO) +#else +/* vs37 is v5 */ + stxsd v5, 0(CO) +#endif + addi CO, CO, 8 +.endm + +/****************************TRMM POINTER REFRESH MACROSES*************************/ +.macro SHIFT_REG REG1,REG2,SHIFT_VAL +.if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 +.elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 +.elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 +.elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 +.elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 +.endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +/* ptrbb = bb;*/ + mr \PTR_B, \B_VAL /* refresh BPOINT */ +#else +/* +// ptrba =ptrba+ off*C_A; +// ptrbb = bb + off*C_B; +*/ + SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */ + SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL, T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ +#endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B + #if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK, \OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ +.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK, \TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK, \TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4, \TEMP_BK, \C_A + SHIFT_REG T2, \TEMP_BK, \C_B + add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B, T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL, \OFF_VAL, \C_A + #endif +.endm diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c new file mode 100644 index 000000000..b3ee301be --- /dev/null +++ b/kernel/power/dgemm_kernel_power10.c @@ -0,0 +1,864 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include <altivec.h> + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +/* SAVE_ACC / SAVE_ACC1 / SAVE2x4_ACC disassemble accumulator ACC into result[] and store it, scaled by alpha, at column offset J of C: */ +/* SAVE_ACC writes rows 0-3 of CO (stride ldc), SAVE_ACC1 rows 4-7, SAVE2x4_ACC rows 0-1; result[] is consumed in reverse order (result[3] first). */ +/* With TRMMKERNEL defined the destination is overwritten (=); otherwise the product is accumulated into it (+=). */ +/* The expansion site must provide result, rowC, CO, ldc and alpha. */ +#ifdef TRMMKERNEL +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + 
__builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +/* PREFETCH1(x, y): issues a dcbt touch for the address x + y. */ +/* dcbt reads a literal 0 instead of the register when its first (RA) operand is r0, so x takes the "b" constraint (base register, never r0) and y the plain "r" constraint. */ +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel 
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
+       FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+       , BLASLONG offset
+#endif
+  )
+{ /* Walks C in groups of 4, 2, then 1 columns (ldc apart) and tiles of 16/8/4/2/1 rows; adds alpha*A*B to C, or overwrites C when TRMMKERNEL is defined. Always returns 0. */
+  BLASLONG N = n; /* initial value is overwritten below before N is read */
+  BLASLONG i1;
+#if defined(TRMMKERNEL)
+  BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+  off = -offset;
+#endif
+  v4sf_t valpha = { alpha, alpha }; /* alpha in both lanes, used by the non-MMA vector paths */
+  N = n >> 2; /* pass 1: groups of 4 columns */
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+      FLOAT *CO;
+      FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      CO = C;
+      C += ldc << 2; /* next group starts 4*ldc elements further on */
+      AO = A; /* every column group re-reads A from its start */
+      PREFETCH1 (A, 128);
+      PREFETCH1 (A, 256);
+      i = m >> 4; /* number of 16-row tiles */
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (16, 4);
+#else
+	  BO = B;
+	  temp = k; /* plain GEMM: use all k steps */
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  BLASLONG l = 0;
+	  PREFETCH1 (CO, 0); /* touch the four destination columns before the k loop */
+	  PREFETCH1 (CO + ldc, 0);
+	  PREFETCH1 (CO + ldc + ldc, 0);
+	  PREFETCH1 (CO + ldc + ldc + ldc, 0);
+	  PREFETCH1 (CO, 128);
+	  PREFETCH1 (CO + ldc, 128);
+	  PREFETCH1 (CO + ldc + ldc, 128);
+	  PREFETCH1 (CO + ldc + ldc + ldc, 128);
+	  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+	  SET_ACC_ZERO8 ();
+	  for (l = 0; l < temp; l++)
+	    {
+	      vec_t *rowA = (vec_t *) & AO[l << 4]; /* 16 A values per k step, read as eight 16-byte vectors */
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & BO[l << 2]; /* 4 B values per k step */
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); /* 32 bytes of B as one vector pair */
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); /* acc<i> accumulates the product of rowB with rowA[i] */
+	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+	      __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+	      __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+	      __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+	      __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+	    }
+	  SAVE_ACC (&acc0, 0); /* acc<i> is stored at row offset 2*i; only the statement order is shuffled */
+	  SAVE_ACC (&acc2, 4);
+	  SAVE_ACC (&acc1, 2);
+	  SAVE_ACC (&acc3, 6);
+	  SAVE_ACC (&acc4, 8);
+	  SAVE_ACC (&acc6, 12);
+	  SAVE_ACC (&acc5, 10);
+	  SAVE_ACC (&acc7, 14);
+	  AO += temp << 4; /* step past the k steps just consumed */
+	  BO += temp << 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (16, 4)
+#endif
+	  CO += 16;
+	}
+      i = (m & 15) >> 3; /* at most one 8-row tile */
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (8, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1, acc2, acc3;
+	  SET_ACC_ZERO4 ();
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      vec_t *rowA = (vec_t *) & AO[l << 3];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & BO[l << 2];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+	    }
+	  SAVE_ACC (&acc0, 0);
+	  SAVE_ACC (&acc2, 4);
+	  SAVE_ACC (&acc1, 2);
+	  SAVE_ACC (&acc3, 6);
+	  CO += 8;
+	  AO += temp << 3;
+	  BO += temp << 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (8, 4)
+#endif
+	}
+      i = (m & 7) >> 2; /* at most one 4-row tile */
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (4, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1;
+	  __builtin_mma_xxsetaccz (&acc0);
+	  __builtin_mma_xxsetaccz (&acc1);
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      vec_t *rowA = (vec_t *) & AO[l << 2];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & BO[l << 2];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+	    }
+	  SAVE_ACC (&acc0, 0);
+	  SAVE_ACC (&acc1, 2);
+	  CO += 4;
+	  AO += temp << 2;
+	  BO += temp << 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (4, 4)
+#endif
+	}
+      i = (m & 3) >> 1; /* at most one 2-row tile */
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (2, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0;
+	  __builtin_mma_xxsetaccz (&acc0);
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      vec_t *rowA = (vec_t *) & AO[l << 1];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & BO[l << 2];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	    }
+	  SAVE_ACC (&acc0, 0);
+	  CO += 2;
+	  AO += temp << 1;
+	  BO += temp << 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (2, 4)
+#endif
+	}
+      i = (m & 1) >> 0; /* last odd row: GCC vector arithmetic, no MMA */
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (1, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 }; /* sums for columns 0-1 */
+	  v4sf_t t1 = { 0, 0 }; /* sums for columns 2-3 */
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowA = { AO[l], AO[l] }; /* the single A value, duplicated */
+	      v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
+	      v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
+	      t += rowA * rowB;
+	      t1 += rowA * rowB1;
+	    }
+	  t = t * valpha;
+	  t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0 * ldc] = t[0];
+	  CO[1 * ldc] = t[1];
+	  CO[2 * ldc] = t1[0];
+	  CO[3 * ldc] = t1[1];
+#else
+	  CO[0 * ldc] += t[0];
+	  CO[1 * ldc] += t[1];
+	  CO[2 * ldc] += t1[0];
+	  CO[3 * ldc] += t1[1];
+#endif
+	  CO += 1;
+	  AO += temp;
+	  BO += temp << 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (1, 4)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 4; // step off past the 4 columns just finished
+#endif
+      B += k << 2; /* next column group of B: 4 values per k step were consumed */
+    }
+  N = (n & 3) >> 1; /* pass 2: at most one group of 2 columns */
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, j, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc << 1;
+      AO = A;
+      i = m >> 4;
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (16, 2);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+	  SET_ACC_ZERO8 ();
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      FLOAT t[4] = { 0, 0, 0, 0 }; /* the 2 B values padded with zeros to the 4 slots a vector pair holds */
+	      t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & t[0]; /* NOTE(review): reads a FLOAT array through a 16-byte vector pointer - confirm the alignment assumption */
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      vec_t *rowA = (vec_t *) & AO[l << 4];
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+	      __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
+	      __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
+	      __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
+	      __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+	    }
+	  SAVE2x4_ACC (&acc0, 0); /* only the two columns that hold real B data are written */
+	  SAVE2x4_ACC (&acc1, 2);
+	  SAVE2x4_ACC (&acc2, 4);
+	  SAVE2x4_ACC (&acc3, 6);
+	  SAVE2x4_ACC (&acc4, 8);
+	  SAVE2x4_ACC (&acc5, 10);
+	  SAVE2x4_ACC (&acc6, 12);
+	  SAVE2x4_ACC (&acc7, 14);
+	  CO += 16;
+	  AO += temp << 4;
+	  BO += temp << 1;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (16, 2)
+#endif
+	}
+      i = (m & 15) >> 3;
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (8, 2);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1, acc2, acc3;
+	  SET_ACC_ZERO4 ();
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      FLOAT t[4] = { 0, 0, 0, 0 };
+	      t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & t[0];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      vec_t *rowA = (vec_t *) & AO[l << 3];
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
+	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
+	    }
+	  SAVE2x4_ACC (&acc0, 0);
+	  SAVE2x4_ACC (&acc1, 2);
+	  SAVE2x4_ACC (&acc2, 4);
+	  SAVE2x4_ACC (&acc3, 6);
+	  CO += 8;
+	  AO += temp << 3;
+	  BO += temp << 1;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (8, 2)
+#endif
+	}
+      i = (m & 7) >> 2;
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (4, 2);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0, acc1;
+	  __builtin_mma_xxsetaccz (&acc0);
+	  __builtin_mma_xxsetaccz (&acc1);
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      FLOAT t[4] = { 0, 0, 0, 0 };
+	      t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & t[0];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      vec_t *rowA = (vec_t *) & AO[l << 2];
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
+	    }
+	  SAVE2x4_ACC (&acc0, 0);
+	  SAVE2x4_ACC (&acc1, 2);
+	  CO += 4;
+	  AO += temp << 2;
+	  BO += temp << 1;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (4, 2)
+#endif
+	}
+      i = (m & 3) >> 1;
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (2, 2);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  v4sf_t *rowC;
+	  v4sf_t result[4];
+	  __vector_quad acc0;
+	  __builtin_mma_xxsetaccz (&acc0);
+	  BLASLONG l = 0;
+	  for (l = 0; l < temp; l++)
+	    {
+	      FLOAT t[4] = { 0, 0, 0, 0 };
+	      t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
+	      __vector_pair rowB;
+	      vec_t *rb = (vec_t *) & t[0];
+	      __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+	      vec_t *rowA = (vec_t *) & AO[l << 1];
+	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+	    }
+	  SAVE2x4_ACC (&acc0, 0);
+	  CO += 2;
+	  AO += temp << 1;
+	  BO += temp << 1;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (2, 2)
+#endif
+	}
+      i = (m & 1) >> 0; /* last odd row of the 2-column group */
+      for (j = 0; j < i; j++)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (1, 2);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 };
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowA = { AO[l], AO[l] };
+	      v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
+	      t += rowA * rowB;
+	    }
+	  t = t * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0 * ldc] = t[0];
+	  CO[1 * ldc] = t[1];
+#else
+	  CO[0 * ldc] += t[0];
+	  CO[1 * ldc] += t[1];
+#endif
+	  CO += 1;
+	  AO += temp;
+	  BO += temp << 1;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (1, 2)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2; // step off past the 2 columns just finished
+#endif
+      B += k << 1;
+    }
+  N = (n & 1) >> 0; /* pass 3: the last single column, computed without MMA */
+  for (i1 = 0; i1 < N; i1++)
+    {
+      BLASLONG i, temp;
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      FLOAT *CO;
+      FLOAT *AO;
+      CO = C;
+      C += ldc;
+      AO = A;
+      i = m; /* rows still to do; the while loops below peel 16, 8, 4, 2, 1 at a time */
+      while (i >= 16)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (16, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 };
+	  v4sf_t t1 = { 0, 0 };
+	  v4sf_t t2 = { 0, 0 };
+	  v4sf_t t3 = { 0, 0 };
+	  v4sf_t t4 = { 0, 0 };
+	  v4sf_t t5 = { 0, 0 };
+	  v4sf_t t6 = { 0, 0 };
+	  v4sf_t t7 = { 0, 0 };
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowB = { BO[l], BO[l] }; /* the single B value, duplicated */
+	      v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
+	      v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
+	      v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
+	      v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
+	      v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
+	      v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
+	      v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
+	      v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
+	      t += rowA * rowB;
+	      t1 += rowA1 * rowB;
+	      t2 += rowA2 * rowB;
+	      t3 += rowA3 * rowB;
+	      t4 += rowA4 * rowB;
+	      t5 += rowA5 * rowB;
+	      t6 += rowA6 * rowB;
+	      t7 += rowA7 * rowB;
+	    }
+	  t = t * valpha;
+	  t1 = t1 * valpha;
+	  t2 = t2 * valpha;
+	  t3 = t3 * valpha;
+	  t4 = t4 * valpha;
+	  t5 = t5 * valpha;
+	  t6 = t6 * valpha;
+	  t7 = t7 * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0] = t[0];
+	  CO[1] = t[1];
+	  CO[2] = t1[0];
+	  CO[3] = t1[1];
+	  CO[4] = t2[0];
+	  CO[5] = t2[1];
+	  CO[6] = t3[0];
+	  CO[7] = t3[1];
+	  CO[8] = t4[0];
+	  CO[9] = t4[1];
+	  CO[10] = t5[0];
+	  CO[11] = t5[1];
+	  CO[12] = t6[0];
+	  CO[13] = t6[1];
+	  CO[14] = t7[0];
+	  CO[15] = t7[1];
+#else
+	  CO[0] += t[0];
+	  CO[1] += t[1];
+	  CO[2] += t1[0];
+	  CO[3] += t1[1];
+	  CO[4] += t2[0];
+	  CO[5] += t2[1];
+	  CO[6] += t3[0];
+	  CO[7] += t3[1];
+	  CO[8] += t4[0];
+	  CO[9] += t4[1];
+	  CO[10] += t5[0];
+	  CO[11] += t5[1];
+	  CO[12] += t6[0];
+	  CO[13] += t6[1];
+	  CO[14] += t7[0];
+	  CO[15] += t7[1];
+#endif
+	  AO += temp << 4;
+	  BO += temp;
+	  CO += 16;
+	  i -= 16;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (16, 1)
+#endif
+	}
+      while (i >= 8)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (8, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 };
+	  v4sf_t t1 = { 0, 0 };
+	  v4sf_t t2 = { 0, 0 };
+	  v4sf_t t3 = { 0, 0 };
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowB = { BO[l], BO[l] };
+	      v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
+	      v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
+	      v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
+	      v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
+	      t += rowA * rowB;
+	      t1 += rowA1 * rowB;
+	      t2 += rowA2 * rowB;
+	      t3 += rowA3 * rowB;
+	    }
+	  t = t * valpha;
+	  t1 = t1 * valpha;
+	  t2 = t2 * valpha;
+	  t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0] = t[0];
+	  CO[1] = t[1];
+	  CO[2] = t1[0];
+	  CO[3] = t1[1];
+	  CO[4] = t2[0];
+	  CO[5] = t2[1];
+	  CO[6] = t3[0];
+	  CO[7] = t3[1];
+#else
+	  CO[0] += t[0];
+	  CO[1] += t[1];
+	  CO[2] += t1[0];
+	  CO[3] += t1[1];
+	  CO[4] += t2[0];
+	  CO[5] += t2[1];
+	  CO[6] += t3[0];
+	  CO[7] += t3[1];
+#endif
+	  AO += temp << 3;
+	  BO += temp;
+	  CO += 8;
+	  i -= 8;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (8, 1)
+#endif
+	}
+      while (i >= 4)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (4, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 };
+	  v4sf_t t1 = { 0, 0 };
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowB = { BO[l], BO[l] };
+	      v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
+	      v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
+	      t += rowA * rowB;
+	      t1 += rowA1 * rowB;
+	    }
+	  t = t * valpha;
+	  t1 = t1 * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0] = t[0];
+	  CO[1] = t[1];
+	  CO[2] = t1[0];
+	  CO[3] = t1[1];
+#else
+	  CO[0] += t[0];
+	  CO[1] += t[1];
+	  CO[2] += t1[0];
+	  CO[3] += t1[1];
+#endif
+	  AO += temp << 2;
+	  BO += temp;
+	  CO += 4;
+	  i -= 4;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (4, 1)
+#endif
+	}
+      while (i >= 2)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (2, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  v4sf_t t = { 0, 0 };
+	  for (l = 0; l < temp; l++)
+	    {
+	      v4sf_t rowB = { BO[l], BO[l] };
+	      v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
+	      t += rowA * rowB;
+	    }
+	  t = t * valpha;
+#if defined(TRMMKERNEL)
+	  CO[0] = t[0];
+	  CO[1] = t[1];
+#else
+	  CO[0] += t[0];
+	  CO[1] += t[1];
+#endif
+	  AO += temp << 1;
+	  BO += temp;
+	  CO += 2;
+	  i -= 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (2, 1)
+#endif
+	}
+      while (i >= 1)
+	{
+	  FLOAT *BO;
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (1, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  BLASLONG l = 0;
+	  FLOAT t = 0; /* plain scalar dot product for the final 1x1 element */
+	  for (l = 0; l < temp; l++)
+	    {
+	      t += AO[l] * BO[l];
+	    }
+	  AO += temp;
+	  BO += temp;
+#if defined(TRMMKERNEL)
+	  CO[0] = t * alpha;
+#else
+	  CO[0] += t * alpha;
+#endif
+	  CO += 1;
+	  i -= 1;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (1, 1)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1; // step off past the column just finished
+#endif
+      B += k;
+    }
+  return 0;
+}
diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c
new file mode 100644
index 000000000..01c122c6d
--- /dev/null
+++ b/kernel/power/sgemm_kernel_power10.c
@@ -0,0 +1,1334 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+/* GCC vector types: vec_t is 16 raw bytes (MMA operand); v4sf_t / v2sf_t are 16 / 8 bytes of FLOAT. SAVE_* macros expect result, rowC, CO, ldc and alpha in scope. */
+typedef unsigned char vec_t __attribute__ ((vector_size (16)));
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+#if defined(TRMMKERNEL)
+#define SAVE_ACC(ACC, J) /* TRMM build: overwrite lines 0-3 of C; result[3] goes to line 0, result[0] to line 3 */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0* ldc+J]; \
+  rowC[0] = result[3] * alpha; \
+  rowC = (v4sf_t *) &CO[1*ldc+J]; \
+  rowC[0] = result[2] * alpha; \
+  rowC = (v4sf_t *) &CO[2*ldc+J]; \
+  rowC[0] = result[1] * alpha; \
+  rowC = (v4sf_t *) &CO[3*ldc+J]; \
+  rowC[0] = result[0] * alpha;
+#define SAVE_ACC1(ACC, J) /* same as SAVE_ACC but targets lines 4-7 of C */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[4* ldc+J]; \
+  rowC[0] = result[3] * alpha; \
+  rowC = (v4sf_t *) &CO[5*ldc+J]; \
+  rowC[0] = result[2] * alpha; \
+  rowC = (v4sf_t *) &CO[6*ldc+J]; \
+  rowC[0] = result[1] * alpha; \
+  rowC = (v4sf_t *) &CO[7*ldc+J]; \
+  rowC[0] = result[0] * alpha;
+#define SAVE4x2_ACC(ACC, J) /* 8-byte stores to lines 0-3 from result[6], [4], [2], [0]; presumably result is v2sf_t[8] here - confirm at call sites */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v2sf_t *) &CO[0* ldc+J]; \
+  rowC[0] = result[6] * alpha; \
+  rowC = (v2sf_t *) &CO[1* ldc+J]; \
+  rowC[0] = result[4] * alpha; \
+  rowC = (v2sf_t *) &CO[2* ldc+J]; \
+  rowC[0] = result[2] * alpha; \
+  rowC = (v2sf_t *) &CO[3* ldc+J]; \
+  rowC[0] = result[0] * alpha;
+#define SAVE4x2_ACC1(ACC, J) /* same as SAVE4x2_ACC but targets lines 4-7 of C */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v2sf_t *) &CO[4* ldc+J]; \
+  rowC[0] = result[6] * alpha; \
+  rowC = (v2sf_t *) &CO[5* ldc+J]; \
+  rowC[0] = result[4] * alpha; \
+  rowC = (v2sf_t *) &CO[6* ldc+J]; \
+  rowC[0] = result[2] * alpha; \
+  rowC = (v2sf_t *) &CO[7* ldc+J]; \
+  rowC[0] = result[0] * alpha;
+#define SAVE2x4_ACC(ACC, J) /* two-line variant: only result[3] and result[2] are stored */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0* ldc+J]; \
+  rowC[0] = result[3] * alpha; \
+  rowC = (v4sf_t *) &CO[1* ldc+J]; \
+  rowC[0] = result[2] * alpha;
+#else
+#define SAVE_ACC(ACC, J) /* GEMM build: accumulate into lines 0-3 of C instead of overwriting */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0* ldc+J]; \
+  rowC[0] += result[3] * alpha; \
+  rowC = (v4sf_t *) &CO[1*ldc+J]; \
+  rowC[0] += result[2] * alpha; \
+  rowC = (v4sf_t *) &CO[2*ldc+J]; \
+  rowC[0] += result[1] * alpha; \
+  rowC = (v4sf_t *) &CO[3*ldc+J]; \
+  rowC[0] += result[0] * alpha;
+#define SAVE_ACC1(ACC, J) /* accumulate into lines 4-7 of C */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[4* ldc+J]; \
+  rowC[0] += result[3] * alpha; \
+  rowC = (v4sf_t *) &CO[5*ldc+J]; \
+  rowC[0] += result[2] * alpha; \
+  rowC = (v4sf_t *) &CO[6*ldc+J]; \
+  rowC[0] += result[1] * alpha; \
+  rowC = (v4sf_t *) &CO[7*ldc+J]; \
+  rowC[0] += result[0] * alpha;
+#define SAVE4x2_ACC(ACC, J) /* accumulate 8-byte pieces result[6], [4], [2], [0] into lines 0-3 */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v2sf_t *) &CO[0* ldc+J]; \
+  rowC[0] += result[6] * alpha; \
+  rowC = (v2sf_t *) &CO[1* ldc+J]; \
+  rowC[0] += result[4] * alpha; \
+  rowC = (v2sf_t *) &CO[2* ldc+J]; \
+  rowC[0] += result[2] * alpha; \
+  rowC = (v2sf_t *) &CO[3* ldc+J]; \
+  rowC[0] += result[0] * alpha;
+#define SAVE4x2_ACC1(ACC, J) /* accumulate 8-byte pieces into lines 4-7 */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v2sf_t *) &CO[4* ldc+J]; \
+  rowC[0] += result[6] * alpha; \
+  rowC = (v2sf_t *) &CO[5* ldc+J]; \
+  rowC[0] += result[4] * alpha; \
+  rowC = (v2sf_t *) &CO[6* ldc+J]; \
+  rowC[0] += result[2] * alpha; \
+  rowC = (v2sf_t *) &CO[7* ldc+J]; \
+  rowC[0] += result[0] * alpha;
+#define SAVE2x4_ACC(ACC, J) /* accumulate result[3] / result[2] into lines 0-1 of C */ \
+  __builtin_mma_disassemble_acc (result, ACC); \
+  rowC = (v4sf_t *) &CO[0* ldc+J]; \
+  rowC[0] += result[3] * alpha; \
+  rowC = (v4sf_t *) &CO[1* ldc+J]; \
+  rowC[0] += result[2] * alpha;
+#endif
+#define KERNEL(i, j) /* one k step of the 8-accumulator tile: even accs use rowB[i], odd accs rowB[i+1], with rowA[j..j+3] */ \
+  __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \
+  __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \
+  __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \
+  __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \
+  __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \
+  __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
+  __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
+  __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
+#define SET_ACC_ZERO4() /* clear acc0-acc3, which must be declared at the expansion site */ \
+  __builtin_mma_xxsetaccz (&acc0); \
+  __builtin_mma_xxsetaccz (&acc1); \
+  __builtin_mma_xxsetaccz (&acc2); \
+  __builtin_mma_xxsetaccz (&acc3);
+
+#define SET_ACC_ZERO8() \
+  __builtin_mma_xxsetaccz (&acc0); \
+  __builtin_mma_xxsetaccz (&acc1); \
+  __builtin_mma_xxsetaccz (&acc2); \
+  __builtin_mma_xxsetaccz (&acc3); \
+  __builtin_mma_xxsetaccz (&acc4); \
+  __builtin_mma_xxsetaccz (&acc5); \
+  __builtin_mma_xxsetaccz (&acc6); \
+  __builtin_mma_xxsetaccz (&acc7);
+/* PREFETCH1: dcbt RA,RB touches address (RA|0)+RB; RA = r0 reads as literal 0, so the pointer x takes the "b" (non-r0) constraint. */
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
+/* REFRESH_TEMP_BK: TRMM-only; sets temp, the k-loop trip count, from off, k and the tile sizes x / y. */
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+  temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+  temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+  temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) ||
(!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + 
KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 
256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], 
rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { 
+ v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], 
rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + 
__vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && 
defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + SAVE2x4_ACC (&acc0, 
0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if 
defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l << 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] 
+ }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + 
REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} From bb2f52844bbcd5c786d7b37f8c4d88dbf7a3b89e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 24 Jun 2020 14:50:12 -0500 Subject: [PATCH 263/593] powerpc: Optimized ZGEMM kernel for POWER10 This patch introduces new optimized version of ZGEMM kernel using power10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA 
v3.1. This patch makes use of new POWER10 compute instructions for matrix multiplication operation. Tested on simulator and there are no new test failures. Cycles count reduced by 30-50% compared to POWER9 version depending on M/N/K sizes. --- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/zgemm_kernel_power10.S | 245 ++++ kernel/power/zgemm_logic_power10.S | 1735 +++++++++++++++++++++++++++ kernel/power/zgemm_macros_power10.S | 1138 ++++++++++++++++++ 4 files changed, 3120 insertions(+), 2 deletions(-) create mode 100644 kernel/power/zgemm_kernel_power10.S create mode 100644 kernel/power/zgemm_logic_power10.S create mode 100644 kernel/power/zgemm_macros_power10.S diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 00d31f8b6..4fc7190b0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -10,7 +10,7 @@ else STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power9.S +ZTRMMKERNEL = zgemm_kernel_power10.S SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -42,7 +42,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMKERNEL = zgemm_kernel_power10.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 000000000..fca389e69 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if 
defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 000000000..1143733e0 --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 
1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 >> 7 (arithmetic shift, i.e. floor(T1 / 128)); the remainder is taken below with andi. L, T1, 127 */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. 
L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (T6-2) >> 5 (arithmetic shift, i.e. floor((T6-2) / 32)) */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (K-2) >> 5 (arithmetic shift, i.e. floor((K-2) / 32)) */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. 
L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (T6-2) >> 5 (arithmetic shift, i.e. floor((T6-2) / 32)) */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (K-2) >> 5 (arithmetic shift, i.e. floor((K-2) / 32)) */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x1_32K + addi BO, BO, -32 + addi AO,AO, -16 + LOAD2x1O 16, 32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -32 + LOAD2x1_2O 32, 64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_E2 32, 64, 7, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_E2 32, 64, 3, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_E2 32, 64, 1, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32, 64, 0, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + 
KERNEL1x8_2 27, 0
+    KERNEL1x8_2 28, 0
+    KERNEL1x8_2 29, 0
+    KERNEL1x8_2 30, 0
+    KERNEL1x8_2 31, 0
+    KERNEL1x8_2 32, 0
+    KERNEL1x8_2 33, 0
+    KERNEL1x8_2 34, 0
+    KERNEL1x8_2 35, 0
+    KERNEL1x8_2 36, 0
+    KERNEL1x8_2 37, 0
+    KERNEL1x8_2 38, 0
+    KERNEL1x8_2 39, 0
+    KERNEL1x8_2 40, 0
+    KERNEL1x8_2 41, 0
+    KERNEL1x8_2 42, 0
+    KERNEL1x8_2 43, 0
+    KERNEL1x8_2 44, 0
+    KERNEL1x8_2 45, 0
+    KERNEL1x8_2 46, 0
+    KERNEL1x8_2 47, 0
+    KERNEL1x8_2 48, 0
+    KERNEL1x8_2 49, 0
+    KERNEL1x8_2 50, 0
+    KERNEL1x8_2 51, 0
+    KERNEL1x8_2 52, 0
+    KERNEL1x8_2 53, 0
+    KERNEL1x8_2 54, 0
+    KERNEL1x8_2 55, 0
+    KERNEL1x8_2 56, 0
+    KERNEL1x8_2 57, 0
+    KERNEL1x8_2 58, 0
+    KERNEL1x8_2 59, 0
+    KERNEL1x8_2 60, 0
+    KERNEL1x8_2 61, 0
+    KERNEL1x8_2 62, 0
+    KERNEL1x8_2 63, 1
+    bdnz ZGEMM_L1x8_LOOP
+    MY_ALIGN
+/* After the last pass: one more KERNEL1x8_2 with IsLast = 1, then return
+   to the caller. */
+ZGEMM_L1x8_LOOP_END:
+/*----------------------------------------*/
+    KERNEL1x8_2 0, 1
+    blr
+    MY_ALIGN
+
+
+/* 1x4 main loop subroutine (called with bl).  T8 is the pass count loaded
+   into CTR; each pass issues KERNEL1x4_2 with Index 0..15, and after the
+   loop one extra KERNEL1x4_2 0,1 runs before blr. */
+ZGEMM_1x4_LMAIN_SUB:
+/*----------------------------------------*/
+    mtctr T8
+    MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP:
+/*----------------------------------------*/
+    KERNEL1x4_2 0, 0
+
+
+/* Second entry point: skips the Index 0 call above.  The short-K code in
+   ZGEMM_L1x4_SUB0 branches here with CTR = 1, so the bdnz falls through. */
+ZGEMM_L1x4_K32:
+/*----------------------------------------*/
+    KERNEL1x4_2 1, 0
+    KERNEL1x4_2 2, 0
+    KERNEL1x4_2 3, 0
+    KERNEL1x4_2 4, 0
+    KERNEL1x4_2 5, 0
+    KERNEL1x4_2 6, 0
+    KERNEL1x4_2 7, 0
+    KERNEL1x4_2 8, 0
+    KERNEL1x4_2 9, 0
+    KERNEL1x4_2 10, 0
+    KERNEL1x4_2 11, 0
+    KERNEL1x4_2 12, 0
+    KERNEL1x4_2 13, 0
+    KERNEL1x4_2 14, 0
+    KERNEL1x4_2 15, 1
+    bdnz ZGEMM_L1x4_LOOP
+    MY_ALIGN
+
+
+ZGEMM_L1x4_LOOP_END:
+/*----------------------------------------*/
+    KERNEL1x4_2 0, 1
+    blr
+    MY_ALIGN
+
+
+/* 1x2 main loop subroutine; same structure as the 1x4 one above. */
+ZGEMM_1x2_LMAIN_SUB:
+/*----------------------------------------*/
+    mtctr T8
+    MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP:
+/*----------------------------------------*/
+    KERNEL1x2_2 0, 0
+
+
+ZGEMM_L1x2_K32:
+/*----------------------------------------*/
+    KERNEL1x2_2 1, 0
+    KERNEL1x2_2 2, 0
+    KERNEL1x2_2 3, 0
+    KERNEL1x2_2 4, 0
+    KERNEL1x2_2 5, 0
+    KERNEL1x2_2 6, 0
+    KERNEL1x2_2 7, 0
+    KERNEL1x2_2 8, 0
+    KERNEL1x2_2 9, 0
+    KERNEL1x2_2 10, 0 +
KERNEL1x2_2 11, 0
+    KERNEL1x2_2 12, 0
+    KERNEL1x2_2 13, 0
+    KERNEL1x2_2 14, 0
+    KERNEL1x2_2 15, 1
+    bdnz ZGEMM_L1x2_LOOP
+    MY_ALIGN
+
+
+ZGEMM_L1x2_LOOP_END:
+/*----------------------------------------*/
+    KERNEL1x2_2 0, 1
+    blr
+    MY_ALIGN
+
+
+/* 1x1 main loop subroutine (called with bl).  Unlike the wider tiles it
+   issues LOAD1x1_2 before the loop and finishes with END1x1_2. */
+ZGEMM_1x1_LMAIN_SUB:
+/*----------------------------------------*/
+    mtctr T8
+    LOAD1x1_2
+    MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP:
+/*----------------------------------------*/
+    KERNEL1x1_L2 32, 32, 0, 0
+
+
+/* Second entry point used by ZGEMM_L1x1_SUB0 with CTR = 1. */
+ZGEMM_L1x1_K32:
+/*----------------------------------------*/
+    KERNEL1x1_L2 32, 32, 1, 0
+    KERNEL1x1_L2 32, 32, 2, 0
+    KERNEL1x1_L2 32, 32, 3, 0
+    KERNEL1x1_L2 32, 32, 4, 0
+    KERNEL1x1_L2 32, 32, 5, 0
+    KERNEL1x1_L2 32, 32, 6, 0
+    KERNEL1x1_L2 32, 32, 7, 0
+    KERNEL1x1_L2 32, 32, 8, 0
+    KERNEL1x1_L2 32, 32, 9, 0
+    KERNEL1x1_L2 32, 32, 10, 0
+    KERNEL1x1_L2 32, 32, 11, 0
+    KERNEL1x1_L2 32, 32, 12, 0
+    KERNEL1x1_L2 32, 32, 13, 0
+    KERNEL1x1_L2 32, 32, 14, 0
+    KERNEL1x1_L2 32, 32, 15, 1
+    bdnz ZGEMM_L1x1_LOOP
+    MY_ALIGN
+
+
+ZGEMM_L1x1_LOOP_END:
+/*----------------------------------------*/
+    END1x1_2
+    blr
+    MY_ALIGN
+
+
+/*----------------------N1 BEGINS---------*/
+/* The N=1 pass runs only when bit 0 of N is set. */
+ZGEMM_L1:
+/*----------------------------------------*/
+    andi. T1, N, 1
+    ble ZGEMM_L1_END
+
+ZGEMM_L1_BEGIN:
+/*----------------------------------------*/
+    mr CO, C
+
+    add T2,C,LDC
+    mr AO, A
+/* NOTE(review): T1 still holds the result of "andi. T1, N, 1" here, so C
+   is advanced by that value rather than by a multiple of LDC.  C is not
+   read again in this section; confirm the intent. */
+    add C, C, T1
+#if defined(TRMMKERNEL) && defined(LEFT)
+    mr TEMP_REG, OFFSET /*off = offset;*/
+#endif
+/* I = M >> 3: number of 1x8 tiles; skip the 1x8 loop when it is zero. */
+    srawi. I, M, 3
+    ble ZGEMM_L1x8_END
+    dcbt CO,r0 /*just prefetch*/
+    dcbt T2,r0
+
+
+/* One 1x8 tile per trip; the loop is closed by the bgt in
+   ZGEMM_L1x8_SAVE. */
+ZGEMM_L1x8_BEGIN:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1
+#else
+    mr BO, B
+    dcbt B, r0
+#endif
+    dcbt AO, r0
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1
+    mr T1, T6
+/* TEMPS FOR PREFETCH */
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+    li T4, 2048
+    li T5, 2048+512
+    srawi. T8, T1, 7 /* T8 = (T6-2) >> 7: number of 128-step loop passes */
+#else
+    mr T1, K
+/* TEMPS FOR PREFETCH */
+    li T2, 1024
+    li T3, 1024+512
+    addi T1,T1, -2
+/* TEMPS FOR PREFETCH */
+    li T4, 2048
+    li T5, 2048+512
+    srawi. T8, T1, 7 /* T8 = (K-2) >> 7: number of 128-step loop passes */
+#endif
+    KERNEL1x8_ZERO_AND_PRIME_MMA
+/* NOTE(review): this ble relies on the macro above leaving CR0 from the
+   srawi. intact; the macro is defined outside this section. */
+    ble ZGEMM_L1x8_SUB0
+    bl ZGEMM_L1x8_LMAIN_SUB
+    andi. L, T1, 127
+    ble ZGEMM_L1x8_SAVE
+    b ZGEMM_L1x8_SUB2
+
+
+/* 1x8 short-K dispatch: count == 129 or 128 enters ZGEMM_L1x8_K128 with
+   CTR = 1, otherwise falls to ZGEMM_L1x8_SUB2. */
+ZGEMM_L1x8_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    andi. L, T6, 255
+    cmpwi T6, 129
+#else
+    andi. L, K, 255
+    cmpwi K, 129
+#endif
+    li T8, 1
+    bne CMP1x8_128K
+    LOAD_END_1x8 -128, -16
+    mtctr T8
+    bl ZGEMM_L1x8_K128
+    b ZGEMM_L1x8_SAVE
+    CMP1x8_128K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    cmpwi T6, 128
+#else
+    cmpwi K, 128
+#endif
+    bne ZGEMM_L1x8_SUB2
+    MY_ALIGN
+    mtctr T8
+    addi BO, BO, -32
+    addi AO,AO, -256
+    bl ZGEMM_L1x8_K128
+    b ZGEMM_L1x8_SAVE
+    MY_ALIGN
+
+
+/* 1x8 remainder: bits 64, 32, 16, 8, 4, 2 of L run 32, 16, 8, 4, 2, 1
+   KERNEL1x8_2 calls respectively (the larger groups keep the dcbt
+   prefetches of the main loop); bit 1 is a single LOAD_END_1x8. */
+ZGEMM_L1x8_SUB2:
+/*----------------------------------------*/
+    andi. T1,L, 64
+    ble ZGEMM_L1x8_SUB2_32
+    dcbt AO, PRE
+    dcbt BO, PRE
+    KERNEL1x8_2 0, 0
+    KERNEL1x8_2 1, 0
+    dcbt AO, T2
+    KERNEL1x8_2 2, 0
+    KERNEL1x8_2 3, 0
+    dcbt AO, T3
+    dcbt BO, T2
+    KERNEL1x8_2 4, 0
+    KERNEL1x8_2 5, 0
+    dcbt AO, T4
+    KERNEL1x8_2 6, 0
+    KERNEL1x8_2 7, 0
+    dcbt AO, T5
+    dcbt BO, T3
+    KERNEL1x8_2 8, 0
+    KERNEL1x8_2 9, 0
+    KERNEL1x8_2 10, 0
+    KERNEL1x8_2 11, 0
+    dcbt BO, T4
+    KERNEL1x8_2 12, 0
+    KERNEL1x8_2 13, 0
+    KERNEL1x8_2 14, 0
+    KERNEL1x8_2 15, 0
+    KERNEL1x8_2 16, 0
+    KERNEL1x8_2 17, 0
+    KERNEL1x8_2 18, 0
+    KERNEL1x8_2 19, 0
+    KERNEL1x8_2 20, 0
+    KERNEL1x8_2 21, 0
+    KERNEL1x8_2 22, 0
+    KERNEL1x8_2 23, 0
+    KERNEL1x8_2 24, 0
+    KERNEL1x8_2 25, 0
+    KERNEL1x8_2 26, 0
+    KERNEL1x8_2 27, 0
+    KERNEL1x8_2 28, 0
+    KERNEL1x8_2 29, 0
+    KERNEL1x8_2 30, 0
+    KERNEL1x8_2 31, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_32:
+/*----------------------------------------*/
+    andi. T1,L, 32
+    ble ZGEMM_L1x8_SUB2_16
+    dcbt AO, PRE
+    dcbt BO, PRE
+    KERNEL1x8_2 0, 0
+    KERNEL1x8_2 1, 0
+    dcbt AO, T2
+    KERNEL1x8_2 2, 0
+    KERNEL1x8_2 3, 0
+    dcbt AO, T3
+    dcbt BO, T2
+    KERNEL1x8_2 4, 0
+    KERNEL1x8_2 5, 0
+    dcbt AO, T4
+    KERNEL1x8_2 6, 0
+    KERNEL1x8_2 7, 0
+    dcbt AO, T5
+    dcbt BO, T3
+    KERNEL1x8_2 8, 0
+    KERNEL1x8_2 9, 0
+    KERNEL1x8_2 10, 0
+    KERNEL1x8_2 11, 0
+    dcbt BO, T4
+    KERNEL1x8_2 12, 0
+    KERNEL1x8_2 13, 0
+    KERNEL1x8_2 14, 0
+    KERNEL1x8_2 15, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_16:
+/*----------------------------------------*/
+    andi. T1,L, 16
+    ble ZGEMM_L1x8_SUB2_8
+    dcbt AO, PRE
+    dcbt BO, PRE
+    KERNEL1x8_2 0, 0
+    KERNEL1x8_2 1, 0
+    dcbt AO, T2
+    KERNEL1x8_2 2, 0
+    KERNEL1x8_2 3, 0
+    dcbt AO, T3
+    dcbt BO, T2
+    KERNEL1x8_2 4, 0
+    KERNEL1x8_2 5, 0
+    dcbt AO, T4
+    KERNEL1x8_2 6, 0
+    KERNEL1x8_2 7, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_8:
+/*----------------------------------------*/
+    andi. T1,L, 8
+    ble ZGEMM_L1x8_SUB2_4
+    KERNEL1x8_2 0, 0
+    KERNEL1x8_2 1, 0
+    KERNEL1x8_2 2, 0
+    KERNEL1x8_2 3, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_4:
+/*----------------------------------------*/
+    andi. T1,L, 4
+    ble ZGEMM_L1x8_SUB2_2
+    KERNEL1x8_2 0, 0
+    KERNEL1x8_2 1, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_2:
+/*----------------------------------------*/
+    andi. T1,L, 2
+    ble ZGEMM_L1x8_SUB2_1
+    KERNEL1x8_2 0, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x8_SUB2_1:
+/*----------------------------------------*/
+    andi. T1,L, 1
+    ble ZGEMM_L1x8_SAVE
+    LOAD_END_1x8 128, 16
+
+
+/* Store the 1x8 tile and loop while I > 0.  The addic. sets CR0 for the
+   bgt further down (NOTE(review): assumes the macros in between do not
+   write CR0).  After the last tile, M & 7 == 0 skips all remainder tiles
+   and M & 4 selects the 1x4 tile. */
+ZGEMM_L1x8_SAVE:
+/*----------------------------------------*/
+    addic. I, I, -1
+    KERNEL1x8_UNPRIME_MMA
+    SAVE1x8
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1
+#endif
+    bgt ZGEMM_L1x8_BEGIN
+    andi. T2, M, 7
+    ble ZGEMM_L1x1_END
+    andi. T1, M, 4
+    ble ZGEMM_L1x4_END
+    b ZGEMM_L1x4_BEGIN
+    MY_ALIGN
+
+
+ZGEMM_L1x8_END:
+/*----------------------------------------*/
+
+
+/* 1x4 tile: executed only when bit 2 of M is set; the M & 7 test is
+   repeated because this label is also reached by falling through from
+   ZGEMM_L1x8_END. */
+ZGEMM_L1x4_BEGIN:
+/*----------------------------------------*/
+    andi. T2, M, 7
+    ble ZGEMM_L1x1_END
+    andi. T1, M, 4
+    ble ZGEMM_L1x4_END
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1
+#else
+    mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1
+    mr T1, T6
+    addi T1,T1, -2
+    srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of 32-step loop passes */
+#else
+    mr T1, K
+    addi T1,T1, -2
+    srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of 32-step loop passes */
+#endif
+    KERNEL1x4_ZERO_AND_PRIME_MMA
+    ble ZGEMM_L1x4_SUB0
+    bl ZGEMM_1x4_LMAIN_SUB
+    andi. L, T1, 31
+    ble ZGEMM_L1x4_SAVE
+    b ZGEMM_L1x4_SUB2
+
+
+/* 1x4 short-K dispatch: count == 33 or 32 enters ZGEMM_L1x4_K32 with
+   CTR = 1, otherwise falls to ZGEMM_L1x4_SUB2. */
+ZGEMM_L1x4_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    andi. L, T6, 63
+    cmpwi T6, 33
+#else
+    andi. L, K, 63
+    cmpwi K, 33
+#endif
+    li T8, 1
+    bne CMP1x4_32K
+    LOAD_END_1x4 -64, -16
+    mtctr T8
+    bl ZGEMM_L1x4_K32
+    b ZGEMM_L1x4_SAVE
+    CMP1x4_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    cmpwi T6, 32
+#else
+    cmpwi K, 32
+#endif
+    bne ZGEMM_L1x4_SUB2
+    MY_ALIGN
+    mtctr T8
+    addi BO, BO, -32
+    addi AO,AO, -128
+    bl ZGEMM_L1x4_K32
+    b ZGEMM_L1x4_SAVE
+    MY_ALIGN
+    MY_ALIGN
+
+
+/* 1x4 remainder: bits 16, 8, 4, 2 of L as groups of KERNEL1x4_2 calls,
+   bit 1 as a single LOAD_END_1x4. */
+ZGEMM_L1x4_SUB2:
+/*----------------------------------------*/
+    andi. T1,L, 16
+    ble ZGEMM_L1x4_SUB2_8
+    KERNEL1x4_2 0, 0
+    KERNEL1x4_2 1, 0
+    KERNEL1x4_2 2, 0
+    KERNEL1x4_2 3, 0
+    KERNEL1x4_2 4, 0
+    KERNEL1x4_2 5, 0
+    KERNEL1x4_2 6, 0
+    KERNEL1x4_2 7, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_8:
+/*----------------------------------------*/
+    andi. T1,L, 8
+    ble ZGEMM_L1x4_SUB2_4
+    KERNEL1x4_2 0, 0
+    KERNEL1x4_2 1, 0
+    KERNEL1x4_2 2, 0
+    KERNEL1x4_2 3, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_4:
+/*----------------------------------------*/
+    andi. T1,L, 4
+    ble ZGEMM_L1x4_SUB2_2
+    KERNEL1x4_2 0, 0
+    KERNEL1x4_2 1, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_2:
+/*----------------------------------------*/
+    andi. T1,L, 2
+    ble ZGEMM_L1x4_SUB2_1
+    KERNEL1x4_2 0, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x4_SUB2_1:
+/*----------------------------------------*/
+    andi. T1,L, 1
+    ble ZGEMM_L1x4_SAVE
+    LOAD_END_1x4 64,16
+
+
+
+ZGEMM_L1x4_SAVE:
+/*----------------------------------------*/
+    KERNEL1x4_UNPRIME_MMA
+    SAVE1x4
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1
+#endif
+
+
+ZGEMM_L1x4_END:
+/*----------------------------------------*/
+
+
+/* 1x2 tile: executed only when bit 1 of M is set. */
+ZGEMM_L1x2_BEGIN:
+/*----------------------------------------*/
+    andi. T1, M, 2
+    ble ZGEMM_L1x2_END
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1
+#else
+    mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1
+    mr T1, T6
+    addi T1,T1, -2
+    srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of 32-step loop passes */
+#else
+    mr T1, K
+    addi T1,T1, -2
+    srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of 32-step loop passes */
+#endif
+    KERNEL1x2_ZERO_AND_PRIME_MMA
+    ble ZGEMM_L1x2_SUB0
+    bl ZGEMM_1x2_LMAIN_SUB
+    andi. L, T1, 31
+    ble ZGEMM_L1x2_SAVE
+    b ZGEMM_L1x2_SUB2
+
+
+/* 1x2 short-K dispatch: count == 33 or 32 enters ZGEMM_L1x2_K32 with
+   CTR = 1, otherwise falls to ZGEMM_L1x2_SUB2. */
+ZGEMM_L1x2_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    andi. L, T6, 63
+    cmpwi T6, 33
+#else
+    andi. L, K, 63
+    cmpwi K, 33
+#endif
+    li T8, 1
+    bne CMP1x2_32K
+    LOAD_END_1x2 -32, -16
+    mtctr T8
+    bl ZGEMM_L1x2_K32
+    b ZGEMM_L1x2_SAVE
+    CMP1x2_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    cmpwi T6, 32
+#else
+    cmpwi K, 32
+#endif
+    bne ZGEMM_L1x2_SUB2
+    MY_ALIGN
+    mtctr T8
+    addi BO, BO, -32
+    addi AO,AO, -64
+    bl ZGEMM_L1x2_K32
+    b ZGEMM_L1x2_SAVE
+    MY_ALIGN
+    MY_ALIGN
+
+
+/* 1x2 remainder: bits 16, 8, 4, 2 of L as groups of KERNEL1x2_2 calls,
+   bit 1 as a single LOAD_END_1x2. */
+ZGEMM_L1x2_SUB2:
+/*----------------------------------------*/
+    andi. T1,L, 16
+    ble ZGEMM_L1x2_SUB2_8
+    KERNEL1x2_2 0, 0
+    KERNEL1x2_2 1, 0
+    KERNEL1x2_2 2, 0
+    KERNEL1x2_2 3, 0
+    KERNEL1x2_2 4, 0
+    KERNEL1x2_2 5, 0
+    KERNEL1x2_2 6, 0
+    KERNEL1x2_2 7, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_8:
+/*----------------------------------------*/
+    andi. T1,L, 8
+    ble ZGEMM_L1x2_SUB2_4
+    KERNEL1x2_2 0, 0
+    KERNEL1x2_2 1, 0
+    KERNEL1x2_2 2, 0
+    KERNEL1x2_2 3, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_4:
+/*----------------------------------------*/
+    andi. T1,L, 4
+    ble ZGEMM_L1x2_SUB2_2
+    KERNEL1x2_2 0, 0
+    KERNEL1x2_2 1, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_2:
+/*----------------------------------------*/
+    andi. T1,L, 2
+    ble ZGEMM_L1x2_SUB2_1
+    KERNEL1x2_2 0, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x2_SUB2_1:
+/*----------------------------------------*/
+    andi. T1,L, 1
+    ble ZGEMM_L1x2_SAVE
+    LOAD_END_1x2 32,16
+
+
+ZGEMM_L1x2_SAVE:
+/*----------------------------------------*/
+    KERNEL1x2_UNPRIME_MMA
+    SAVE1x2
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1
+#endif
+
+
+ZGEMM_L1x2_END:
+/*----------------------------------------*/
+
+
+/* 1x1 tile: executed only when bit 0 of M is set.  Uses the ZERO1x1 /
+   LOAD1x1* / KERNEL1x1* macros rather than the *_PRIME_MMA family. */
+ZGEMM_L1x1_BEGIN:
+/*----------------------------------------*/
+    andi. T1, M, 1
+    ble ZGEMM_L1x1_END
+#if defined(TRMMKERNEL)
+    REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1
+#else
+    mr BO, B
+#endif
+#if defined(TRMMKERNEL)
+    REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1
+    mr T1, T6
+    addi T1,T1, -2
+    srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of 32-step loop passes */
+#else
+    mr T1, K
+    addi T1,T1, -2
+    srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of 32-step loop passes */
+#endif
+    ZERO1x1
+    ble ZGEMM_L1x1_SUB0
+    bl ZGEMM_1x1_LMAIN_SUB
+    andi. L, T1, 31
+    ble ZGEMM_L1x1_SAVE
+    b ZGEMM_L1x1_SUB2
+
+
+/* 1x1 short-K dispatch: count == 33 or 32 enters ZGEMM_L1x1_K32 with
+   CTR = 1, otherwise falls to ZGEMM_L1x1_SUB2. */
+ZGEMM_L1x1_SUB0:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    andi. L, T6, 63
+    cmpwi T6, 33
+#else
+    andi. L, K, 63
+    cmpwi K, 33
+#endif
+    li T8, 1
+    bne CMP1x1_32K
+    addi BO, BO, -16
+    addi AO,AO, -16
+    LOAD1x1O 16, 16
+    END1x1_WITHOUT_ADD
+    LOAD1x1_2O 32, 32
+    mtctr T8
+    bl ZGEMM_L1x1_K32
+    b ZGEMM_L1x1_SAVE
+    CMP1x1_32K:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL)
+    cmpwi T6, 32
+#else
+    cmpwi K, 32
+#endif
+    bne ZGEMM_L1x1_SUB2
+    MY_ALIGN
+    mtctr T8
+    addi BO, BO, -32
+    addi AO,AO, -32
+    LOAD1x1_2O 32, 32
+    bl ZGEMM_L1x1_K32
+    b ZGEMM_L1x1_SAVE
+    MY_ALIGN
+
+
+/* 1x1 remainder: for bits 16, 8, 4, 2 of L each group starts with
+   LOAD1x1_2 and ends with a KERNEL1x1_E2 call whose last argument is 1;
+   bit 1 is a single KERNEL1x1. */
+ZGEMM_L1x1_SUB2:
+/*----------------------------------------*/
+    andi. T1, L, 16
+    ble ZGEMM_L1x1_SUB2_8
+    LOAD1x1_2
+    KERNEL1x1_L2 32, 32, 0, 0
+    KERNEL1x1_L2 32, 32, 1, 0
+    KERNEL1x1_L2 32, 32, 2, 0
+    KERNEL1x1_L2 32, 32, 3, 0
+    KERNEL1x1_L2 32, 32, 4, 0
+    KERNEL1x1_L2 32, 32, 5, 0
+    KERNEL1x1_L2 32, 32, 6, 0
+    KERNEL1x1_E2 32, 32, 7, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_8:
+/*----------------------------------------*/
+    andi. T1, L, 8
+    ble ZGEMM_L1x1_SUB2_4
+    LOAD1x1_2
+    KERNEL1x1_L2 32, 32, 0, 0
+    KERNEL1x1_L2 32, 32, 1, 0
+    KERNEL1x1_L2 32, 32, 2, 0
+    KERNEL1x1_E2 32, 32, 3, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_4:
+/*----------------------------------------*/
+    andi. T1,L, 4
+    ble ZGEMM_L1x1_SUB2_2
+    LOAD1x1_2
+    KERNEL1x1_L2 32, 32, 0, 0
+    KERNEL1x1_E2 32, 32, 1, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_2:
+/*----------------------------------------*/
+    andi. T1,L, 2
+    ble ZGEMM_L1x1_SUB2_1
+    LOAD1x1_2
+    KERNEL1x1_E2 32, 32, 0, 1
+    MY_ALIGN
+
+
+ZGEMM_L1x1_SUB2_1:
+/*----------------------------------------*/
+    andi. T1,L, 1
+    ble ZGEMM_L1x1_SAVE
+    KERNEL1x1
+
+
+ZGEMM_L1x1_SAVE:
+/*----------------------------------------*/
+    SAVE1x1
+#if defined(TRMMKERNEL)
+    REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1
+#endif
+
+
+ZGEMM_L1x1_END:
+/*----------------------------------------*/
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi TEMP_REG, TEMP_REG, 1
+#endif
+
+
+ZGEMM_L1_END:
+/*----------------------------------------*/
diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S
new file mode 100644
index 000000000..42f9c5ad4
--- /dev/null
+++ b/kernel/power/zgemm_macros_power10.S
@@ -0,0 +1,1138 @@
+/***************************************************************************
+Copyright (c) 2013-2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/* Byte-displacement helpers: DISPn(ind,disp) = ind * unit_size * n + disp.
+   unit_size is 16 bytes, the size of one {real,imag} pair of doubles as
+   loaded by a single lxv below. */
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+/* HELPERS FOR SAVE */
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+/* Loads two consecutive 16-byte vectors from LOFFSET(REG) and regroups
+   them.  Expands to nothing under TRMMKERNEL. */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+    lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+    lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+    xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+    xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+    xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+    xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+    xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+    xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+/* The add/subtract pattern ("op") is chosen at build time by the
+   NN/NT/.../RR variant macros; results are written in place to
+   VSINR_OUT1 (reals) and VSINI_OUT2 (imaginaries). */
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+    xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+    xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+    xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+    xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+    xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+    /* we will assume {-alpha_r,-alpha_i} for this case */
+    /* i1i2-r1r2, so we will negate alpha real instead to fix the sign */
+    xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+    /* we will negate alpha imaginary instead to fix the sign */
+    xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+/* Under TRMMKERNEL the outputs are overwritten with the plain products
+   instead of being combined with their previous contents.  "APLHA" is a
+   misspelling of ALPHA that is kept because the SAVE macros call these
+   names. */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+    xvmsubadp \VSOUT1,\VSINII, alpha_i
+    xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+    xvmuldp \VSOUT1,\VSINII, alpha_i
+    xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+    xvmsubadp \VSOUT1,\VSINRR, alpha_r
+    xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+    xxmrghd \VSOUT1,\VSIN2,\VSIN1
+    xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+/* Stores two 16-byte vectors at LOFFSET(REG) and LOFFSET+16(REG). */
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+    stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+    stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+
+/* SAVE8: scale sixteen result registers by alpha and store four couples
+   (128 bytes) at LOFFSET, +32, +64 and +96 from BASE_REG.  The steps for
+   the four couples are interleaved.  vs34-vs59 are used as scratch, and
+   VSRes1..VSRes4 are overwritten as temporaries for the fourth couple. */
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
+    LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
+    LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
+    LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64)
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
+    LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
+    AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45
+    AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+    MULT_APLHA_PART1 vs34,vs36, vs46,vs47
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+    MULT_APLHA_PART1 vs38,vs40,vs48,vs49
+    MULT_APLHA_PART2 vs34,vs36,vs46,vs47
+    AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45
+    MULT_APLHA_PART2 vs38,vs40,vs48,vs49
+    AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+    UNPACK_FOR_STORE vs46,vs47,vs39,vs41
+    MULT_APLHA_PART1 vs42,vs44, vs56,vs57
+    UNPACK_FOR_STORE vs48,vs49,vs35,vs37
+    MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59
+    STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
+    MULT_APLHA_PART2 vs42,vs44,vs56,vs57
+    STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
+    MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59
+    UNPACK_FOR_STORE vs56,vs57,vs42,vs44
+    UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3
+    STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44
+    STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+
+/* SAVE4: as SAVE8 but for eight result registers / two couples (64 bytes
+   at LOFFSET and LOFFSET+32). */
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
+    LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
+    LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
+    AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
+    AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
+    MULT_APLHA_PART1 vs34,vs36, vs46,vs47
+    MULT_APLHA_PART1 vs38,vs40, vs48,vs49
+    MULT_APLHA_PART2 vs34,vs36, vs46,vs47
+    MULT_APLHA_PART2 vs38,vs40,vs48,vs49
+    UNPACK_FOR_STORE vs46,vs47,vs39,vs41
+    UNPACK_FOR_STORE vs48,vs49,vs35,vs37
+    STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
+    STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
+.endm
+
+
+/* SAVE2: one couple (32 bytes at LOFFSET) from four result registers. */
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
+    LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
+    AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
+    MULT_APLHA_PART1 vs34,vs36, vs46,vs47
+    MULT_APLHA_PART2 vs34,vs36, vs46,vs47
+    UNPACK_FOR_STORE vs46,vs47,vs39,vs41
+    STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
+.endm
+
+
+/* SAVE1: a single 16-byte element at LOFFSET from two result registers;
+   each input is merged with itself so the two-lane helpers can be reused. */
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+    RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
+#ifndef TRMMKERNEL
+    lxv vs50, (\LOFFSET)(\BASE_REG)
+    xxmrgld vs46,vs50,vs50
+    xxmrghd vs47,vs50,vs50
+#endif
+    RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
+    AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
+    MULT_APLHA_PART1 vs34,vs36, vs46,vs47
+    MULT_APLHA_PART2 vs34,vs36, vs46,vs47
+    UNPACK_FOR_STORE vs46,vs47,vs39,vs41
+    /* NOTE(review): UNPACK_FOR_STORE above already performs
+       "xxmrghd vs39,vs47,vs46"; the next instruction repeats it and looks
+       redundant. */
+    xxmrghd vs39,vs47,vs46
+    stxv vs39, (\LOFFSET)(\BASE_REG)
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro KERNEL2x8_ZERO_AND_PRIME_MMA
+    /* zero out and prime the MMA accumulators */
+    xxsetaccz 0
+    xxsetaccz 1
+    xxsetaccz 2
+    xxsetaccz 3
+    xxsetaccz 4
+    xxsetaccz 5
+    xxsetaccz 6
+    xxsetaccz 7
+.endm
+
+
+/* Initial loads for the 2x8 kernel: four register pairs from AO
+   (vs32..vs39) and one pair from BO (vs48/vs49). */
+.macro KERNEL2x8_PRELOAD
+    lxvp vs32, 0(AO) // load real,imag from A
+    lxvp vs34, 32(AO) // load real,imag from A
+    lxvp vs36, 64(AO) // load real,imag from A
+    lxvp vs38, 96(AO) // load real,imag from A
+    lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+/* Two k-steps of the 2x8 kernel.  The first step multiplies the registers
+   loaded earlier (vs32..vs39, vs48/vs49) while vs40..vs47 and vs50/vs51
+   are loaded for the second step; the second step reloads vs32.. for the
+   next call.  Accumulators 0-3 take the odd B register, 4-7 the even one.
+   AO/BO are advanced only when IsLast == 1. */
+.macro KERNEL2x8_2 Index, IsLast
+    lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
+    lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
+    lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
+    lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
+    lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
+    xvf64gerpp 0, vs32, vs49
+    xvf64gerpp 1, vs34, vs49
+    xvf64gerpp 2, vs36, vs49
+    xvf64gerpp 3, vs38, vs49
+    xvf64gerpp 4, vs32, vs48 +
xvf64gerpp 5, vs34, vs48
+    xvf64gerpp 6, vs36, vs48
+    xvf64gerpp 7, vs38, vs48
+    /* second k-step: reload vs32../vs48 for the next call while consuming
+       vs40..vs47 and vs50/vs51 */
+    lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
+    lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
+    lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
+    lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
+    lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
+    xvf64gerpp 0, vs40, vs51
+    xvf64gerpp 1, vs42, vs51
+    xvf64gerpp 2, vs44, vs51
+    xvf64gerpp 3, vs46, vs51
+    xvf64gerpp 4, vs40, vs50
+    xvf64gerpp 5, vs42, vs50
+    xvf64gerpp 6, vs44, vs50
+    xvf64gerpp 7, vs46, vs50
+.if \IsLast==1
+    addi AO, AO, DISP16(\Index,256)
+    addi BO, BO, DISP4(\Index,64)
+.endif
+.endm
+
+
+/* One final k-step using the registers already loaded (no new loads),
+   then advance BO by OffsetB and AO by OffsetA. */
+.macro LOAD_END_2x8 OffsetA,OffsetB
+    xvf64gerpp 0, vs32, vs49
+    xvf64gerpp 1, vs34, vs49
+    xvf64gerpp 2, vs36, vs49
+    xvf64gerpp 3, vs38, vs49
+    xvf64gerpp 4, vs32, vs48
+    xvf64gerpp 5, vs34, vs48
+    xvf64gerpp 6, vs36, vs48
+    xvf64gerpp 7, vs38, vs48
+    addi BO, BO, \OffsetB
+    addi AO, AO, \OffsetA
+.endm
+
+
+.macro KERNEL2x8_UNPRIME_MMA
+    /* "unprime" MMA accumulators */
+    xxmfacc 0
+    xxmfacc 1
+    xxmfacc 2
+    xxmfacc 3
+    xxmfacc 4
+    xxmfacc 5
+    xxmfacc 6
+    xxmfacc 7
+.endm
+
+
+/* Store the 2x8 tile.  T1 = CO + LDC addresses the second output line.
+   Each xxpermdi pair (selectors 0b01 / 0b10) takes one doubleword from
+   each of two adjacent result registers into vs32..vs47; the xxlor moves
+   copy them back with neighbouring register pairs exchanged (vs32/33 ->
+   vs2/3, vs34/35 -> vs0/1, ...).  This is done for vs0..vs15 and then for
+   vs16..vs31 before SAVE8 writes 128 bytes to CO and to T1; CO then
+   advances by 128. */
+.macro SAVE2x8
+    add T1, CO ,LDC
+    xxpermdi vs32, vs0, vs1, 0b01
+    xxpermdi vs33, vs0, vs1, 0b10
+    xxpermdi vs34, vs2, vs3, 0b01
+    xxpermdi vs35, vs2, vs3, 0b10
+    xxpermdi vs36, vs4, vs5, 0b01
+    xxpermdi vs37, vs4, vs5, 0b10
+    xxpermdi vs38, vs6, vs7, 0b01
+    xxpermdi vs39, vs6, vs7, 0b10
+    xxpermdi vs40, vs8, vs9, 0b01
+    xxpermdi vs41, vs8, vs9, 0b10
+    xxpermdi vs42, vs10, vs11, 0b01
+    xxpermdi vs43, vs10, vs11, 0b10
+    xxpermdi vs44, vs12, vs13, 0b01
+    xxpermdi vs45, vs12, vs13, 0b10
+    xxpermdi vs46, vs14, vs15, 0b01
+    xxpermdi vs47, vs14, vs15, 0b10
+
+    xxlor vs2, vs32, vs32
+    xxlor vs3, vs33, vs33
+    xxlor vs0, vs34, vs34
+    xxlor vs1, vs35, vs35
+    xxlor vs6, vs36, vs36
+    xxlor vs7, vs37, vs37
+    xxlor vs4, vs38, vs38
+    xxlor vs5, vs39, vs39
+    xxlor vs10, vs40, vs40
+    xxlor vs11, vs41, vs41
+    xxlor vs8, vs42, vs42
+    xxlor vs9, vs43, vs43
+    xxlor vs14, vs44, vs44
+    xxlor vs15, vs45, vs45
+    xxlor vs12, vs46, vs46
+    xxlor vs13, vs47, vs47
+
+    xxpermdi vs32, vs16, vs17, 0b01
+    xxpermdi vs33, vs16, vs17, 0b10
+    xxpermdi vs34, vs18, vs19, 0b01
+    xxpermdi vs35, vs18, vs19, 0b10
+    xxpermdi vs36, vs20, vs21, 0b01
+    xxpermdi vs37, vs20, vs21, 0b10
+    xxpermdi vs38, vs22, vs23, 0b01
+    xxpermdi vs39, vs22, vs23, 0b10
+    xxpermdi vs40, vs24, vs25, 0b01
+    xxpermdi vs41, vs24, vs25, 0b10
+    xxpermdi vs42, vs26, vs27, 0b01
+    xxpermdi vs43, vs26, vs27, 0b10
+    xxpermdi vs44, vs28, vs29, 0b01
+    xxpermdi vs45, vs28, vs29, 0b10
+    xxpermdi vs46, vs30, vs31, 0b01
+    xxpermdi vs47, vs30, vs31, 0b10
+
+    xxlor vs18, vs32, vs32
+    xxlor vs19, vs33, vs33
+    xxlor vs16, vs34, vs34
+    xxlor vs17, vs35, vs35
+    xxlor vs22, vs36, vs36
+    xxlor vs23, vs37, vs37
+    xxlor vs20, vs38, vs38
+    xxlor vs21, vs39, vs39
+    xxlor vs26, vs40, vs40
+    xxlor vs27, vs41, vs41
+    xxlor vs24, vs42, vs42
+    xxlor vs25, vs43, vs43
+    xxlor vs30, vs44, vs44
+    xxlor vs31, vs45, vs45
+    xxlor vs28, vs46, vs46
+    xxlor vs29, vs47, vs47
+
+    SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
+    SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
+    addi CO, CO, 128
+.endm
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro KERNEL2x4_ZERO_AND_PRIME_MMA
+    /* zero out and prime the MMA accumulators */
+    xxsetaccz 0
+    xxsetaccz 1
+    xxsetaccz 2
+    xxsetaccz 3
+.endm
+
+
+/* Initial loads for the 2x4 kernel: two register pairs from AO
+   (vs32..vs35) and one pair from BO (vs48/vs49). */
+.macro KERNEL2x4_PRELOAD
+    lxvp vs32, 0(AO) // load real,imag from A
+    lxvp vs34, 32(AO) // load real,imag from A
+    lxvp vs48, 0(BO) // load real imag from B
+.endm
+
+
+/* Two k-steps of the 2x4 kernel, software-pipelined in the same way as
+   KERNEL2x8_2: loads for one step are issued while the previous step's
+   registers are multiplied.  AO/BO advance only when IsLast == 1. */
+.macro KERNEL2x4_2 Index, IsLast
+    lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
+    lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A +
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x4 OffsetA, OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 + addi CO, CO, 64 +.endm + 
+/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + +.macro KERNEL2x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL2x2_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x2_2 Index, IsLast + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x2 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + SAVE2 vs4,vs5,vs6,vs7,T1,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=1 
+**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + lxv vs52, (\OffsetB+32)(BO) // load real,imag from B + lxv vs54, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xxswapd vs55, vs54 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + 
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 + xxswapd vs51, vs50 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs2, vs40, vs54 + xvmaddadp vs1, vs40, vs53 + xvmaddadp vs3, vs40, vs55 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs0,vs1,CO,0 + SAVE1 vs2,vs3,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 +**********************************************************************************************/ + +.macro KERNEL1x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL1x8_2 Index,IsLast + lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A + lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, 
vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x8 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE1x8 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + addi CO, CO, 128 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro KERNEL1x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL1x4_2 Index,IsLast + lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x4 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE1x4 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + addi CO, CO, 64 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=2 +**********************************************************************************************/ + +.macro KERNEL1x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + 
xxsetaccz 0 +.endm + + +.macro KERNEL1x2_2 Index,IsLast + lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x2 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 +.endm + + +.macro SAVE1x2 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=1 +**********************************************************************************************/ + +.macro ZERO1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs49, vs48 + +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs52, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + + 
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs1, vs40, vs53 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs0,vs1,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + 
off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + 
SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm + From 3446e58dafd054ec7bf1736272c32c73f56fc5be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:31:35 +0200 Subject: [PATCH 264/593] Fix handling of uname output on AIX --- c_check | 1 + 1 file changed, 1 insertion(+) diff --git a/c_check b/c_check index 8234c2081..dd700b8b4 100644 --- a/c_check +++ b/c_check @@ -6,6 +6,7 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -p` if ($hostos eq "AIX"); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); From 72a0ec8e757a8db7323295585fd28f309a36d575 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:55:10 +0200 Subject: [PATCH 265/593] Fix reading of CPU name from prtconf output on AIX --- cpuid_power.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index b36aa4945..ed51df211 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -57,7 +57,6 @@ #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 #define CPUTYPE_POWER9 9 -#define CPUTYPE_POWER10 10 char *cpuname[] = { "UNKNOWN", @@ -83,8 +82,8 @@ char *lowercpuname[] = { "cell", "ppcg4", "power8", - "power9", - "power10" + "power9", + "power10" }; char *corename[] = { @@ -97,8 +96,8 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER9", - "POWER10" + "POWER9", + "POWER10" }; int detect(void){ @@ -154,17 +153,17 @@ int detect(void){ pclose(infile); - if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; - if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; - if (!strncasecmp(p, "PPC970", 6)) return 
CPUTYPE_PPC970; - if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; - if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; - if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; - if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; - if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; + if (strstr(p, "POWER3")) return CPUTYPE_POWER3; + if (strstr(p, "POWER4")) return CPUTYPE_POWER4; + if (strstr(p, "PPC970")) return CPUTYPE_PPC970; + if (strstr(p, "POWER5")) return CPUTYPE_POWER5; + if (strstr(p, "POWER6")) return CPUTYPE_POWER6; + if (strstr(p, "POWER7")) return CPUTYPE_POWER6; + if (strstr(p, "POWER8")) return CPUTYPE_POWER8; + if (strstr(p, "POWER9")) return CPUTYPE_POWER9; + if (strstr(p, "POWER10")) return CPUTYPE_POWER10; + if (strstr(p, "Cell")) return CPUTYPE_CELL; + if (strstr(p, "7447")) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; #endif From 3f613b130114ffe226b4068b35793eb46e072a48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:57:00 +0200 Subject: [PATCH 266/593] Tentative changes for building on AIX --- Makefile.power | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile.power b/Makefile.power index 5c431860f..b2fa04386 100644 --- a/Makefile.power +++ b/Makefile.power @@ -34,8 +34,11 @@ ifeq ($(USE_OPENMP), 1) COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 
-frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +ifeq ($(OSNAME), AIX) +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +endif endif endif @@ -78,6 +81,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mpowerpc64 -maix64 +endif ifeq ($(COMPILER_F77), xlf) FCOMMON_OPT += -q64 endif From c592f0f80a75251e9ddda7c4b00dcc0b263083d4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:58:13 +0200 Subject: [PATCH 267/593] Fix utest build on AIX --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index 0b9892411..31d4ccf00 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -34,6 +34,9 @@ endif ifeq ($(C_COMPILER), PGI) OBJS = utest_main2.o endif +ifeq ($(OSNAME), AIX) +OBJS = utest_main2.o +endif all : run_test From c0afc11742a388fbc7ad91928b1566cd6bd28388 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 13:12:36 +0200 Subject: [PATCH 268/593] Fix POWERPC builds on AIX (gcc/gfortran 7) 1. macro preprocessing for POWER8 and later kernels only 2. 
default buffer size used by AIX version of m4 is too small --- kernel/Makefile.L3 | 144 ++++++++++++++++++++++----------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0cb02ef85..c7865480f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -482,7 +482,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ @@ -497,7 +497,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ @@ -513,7 +513,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ @@ -529,7 +529,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE 
-UCOMPLEX $< -o sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ @@ -541,7 +541,7 @@ endif endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ @@ -559,7 +559,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ @@ -602,7 +602,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ @@ -625,7 +625,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ @@ -657,7 +657,7 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) -ifeq ($(OS), 
AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -669,7 +669,7 @@ endif ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -680,7 +680,7 @@ endif endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -693,9 +693,9 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s - m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + m4 -B 16384 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s else @@ -703,9 +703,9 @@ else endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) 
+ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s - m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + m4 -B 16384 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s else @@ -713,9 +713,9 @@ else endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + m4 -B 16384 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s else @@ -723,9 +723,9 @@ else endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s - m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + m4 -B 16384 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s else @@ -733,9 +733,9 @@ else endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s - m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + m4 -B 16384 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s else @@ -743,9 +743,9 @@ else 
endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s - m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + m4 -B 16384 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s else @@ -753,9 +753,9 @@ else endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s - m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + m4 -B 16384 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s else @@ -763,9 +763,9 @@ else endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s - m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + m4 -B 16384 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s else @@ -787,7 +787,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) 
$(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ @@ -797,7 +797,7 @@ else endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ @@ -807,7 +807,7 @@ else endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ @@ -817,7 +817,7 @@ else endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ @@ -827,7 +827,7 @@ else endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ @@ -837,7 +837,7 @@ else endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ @@ -847,7 +847,7 @@ else endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ @@ -857,7 +857,7 @@ else endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ @@ -879,9 +879,9 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s - m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + m4 -B 16384 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s 
ctrmm_kernel_ln_nomacros.s else @@ -889,9 +889,9 @@ else endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s - m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + m4 -B 16384 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s else @@ -899,9 +899,9 @@ else endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s - m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + m4 -B 16384 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s else @@ -909,9 +909,9 @@ else endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s - m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + m4 -B 16384 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s else @@ -919,9 +919,9 @@ else endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 
POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s - m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + m4 -B 16384 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s else @@ -929,9 +929,9 @@ else endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s - m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + m4 -B 16384 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s else @@ -939,9 +939,9 @@ else endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s - m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + m4 -B 16384 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s else @@ -949,9 +949,9 @@ else endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s - m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + m4 -B 16384 ctrmm_kernel_RC.s > 
ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s else @@ -959,9 +959,9 @@ else endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s - m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + m4 -B 16384 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s else @@ -969,9 +969,9 @@ else endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s - m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + m4 -B 16384 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s else @@ -979,9 +979,9 @@ else endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s - m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + m4 -B 16384 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s else @@ -989,9 +989,9 @@ else 
endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s - m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + m4 -B 16384 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s else @@ -999,9 +999,9 @@ else endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s - m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + m4 -B 16384 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s else @@ -1009,9 +1009,9 @@ else endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s - m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + m4 -B 16384 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s else @@ -1019,7 +1019,7 @@ else endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL 
-DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ @@ -1029,9 +1029,9 @@ else endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s - m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + m4 -B 16384 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else @@ -1049,9 +1049,9 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + m4 -B 16384 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else @@ -1183,9 +1183,9 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o 
dtrsm_kernel_lt.s - m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + m4 -B 16384 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s else @@ -2459,7 +2459,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ @@ -2505,7 +2505,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ From c854ef5471e7b1673b408673239ee1b917518496 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 13:29:52 +0200 Subject: [PATCH 269/593] Fix variable names in conditional --- kernel/Makefile.L3 | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index c7865480f..3d63ff861 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -482,7 +482,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : 
$(KERNELDIR)/$(SHGEMMOTCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ @@ -497,7 +497,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ @@ -513,7 +513,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ @@ -529,7 +529,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ @@ -541,7 +541,7 @@ endif 
endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ @@ -559,7 +559,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ @@ -602,7 +602,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ @@ -625,7 +625,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ @@ -657,7 +657,7 @@ endif endif 
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -669,7 +669,7 @@ endif ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -680,7 +680,7 @@ endif endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -693,7 +693,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s m4 -B 
16384 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ @@ -703,7 +703,7 @@ else endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s m4 -B 16384 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ @@ -713,7 +713,7 @@ else endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 -B 16384 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ @@ -723,7 +723,7 @@ else endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s m4 -B 16384 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ @@ -733,7 +733,7 @@ else endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s m4 -B 16384 zgemm_kernel_n.s > 
zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ @@ -743,7 +743,7 @@ else endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s m4 -B 16384 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ @@ -753,7 +753,7 @@ else endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s m4 -B 16384 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ @@ -763,7 +763,7 @@ else endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s m4 -B 16384 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ @@ -787,7 +787,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT 
-UTRANSA $< -o strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ @@ -797,7 +797,7 @@ else endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ @@ -807,7 +807,7 @@ else endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ @@ -817,7 +817,7 @@ else endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ @@ -827,7 +827,7 @@ else endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 
POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ @@ -837,7 +837,7 @@ else endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ @@ -847,7 +847,7 @@ else endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ @@ -857,7 +857,7 @@ else endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ @@ -879,7 +879,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ 
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s m4 -B 16384 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ @@ -889,7 +889,7 @@ else endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s m4 -B 16384 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ @@ -899,7 +899,7 @@ else endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s m4 -B 16384 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ @@ -909,7 +909,7 @@ else endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o 
ctrmm_kernel_lc.s m4 -B 16384 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ @@ -919,7 +919,7 @@ else endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s m4 -B 16384 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ @@ -929,7 +929,7 @@ else endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s m4 -B 16384 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ @@ -939,7 +939,7 @@ else endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s m4 -B 16384 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ @@ -949,7 +949,7 @@ else endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter 
$(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s m4 -B 16384 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ @@ -959,7 +959,7 @@ else endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s m4 -B 16384 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ @@ -969,7 +969,7 @@ else endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s m4 -B 16384 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ @@ -979,7 +979,7 @@ else endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s m4 -B 16384 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX 
-DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ @@ -989,7 +989,7 @@ else endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s m4 -B 16384 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ @@ -999,7 +999,7 @@ else endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s m4 -B 16384 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ @@ -1009,7 +1009,7 @@ else endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s m4 -B 16384 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ @@ -1019,7 +1019,7 @@ else endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) 
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ @@ -1029,7 +1029,7 @@ else endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s m4 -B 16384 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ @@ -1049,7 +1049,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 -B 16384 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ @@ -1183,7 +1183,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT 
-UCONJ $< -o dtrsm_kernel_lt.s m4 -B 16384 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ @@ -2459,7 +2459,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ @@ -2505,7 +2505,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ From 2a91452bdd1d735b11156add482b9f35c3d01c69 Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Thu, 25 Jun 2020 11:32:09 -0400 Subject: [PATCH 270/593] Add cpu detection support for comet lake U Comet Lake U CPUs have family: 6, model: 6, extended family: 0, and extended model: 10 were not being correctly detected by GETARCH during openblas builds and would show CORE=UNKNOWN and LIBCORE=unknown. This commit adds the necessary information to cpuid_x86 to detect extended family 10 model 6 and return the proper core information. 
It's essentially just a skylake cpu, not skylake x, so I just used the same return fields as skylake. --- cpuid_x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index e29adecae..1fe5ca152 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1955,6 +1955,19 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 10: + switch (model) { + case 6: + // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + } case 5: switch (model) { case 6: From f37e941d5270e396ed27e4ad5fd484fb257b742b Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Thu, 25 Jun 2020 11:56:49 -0400 Subject: [PATCH 271/593] Add support to driver/others/dynamic.c too --- driver/others/dynamic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 38eb76643..7677f265a 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + case 10: + if (model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } return NULL; } case 0xf: From 2f9c10810c932fc015cb4e5078cab7117bc120b6 Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Thu, 25 Jun 2020 15:53:56 -0400 Subject: [PATCH 272/593] Also set CPUTYPE in get_cpuname() --- cpuid_x86.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 1fe5ca152..3538690b9 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1406,6 +1406,16 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + } + case 10: //family 6 exmodel 10 + switch (model) { + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } From d23419accc2f60a27b95cb29f11f76443a82d111 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 25 Jun 2020 22:19:08 -0500 Subject: [PATCH 273/593] powerpc: Optimized SHGEMM kernel for POWER10 This patch introduces new optimized version of SHGEMM kernel using power10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. This patch makes use of new POWER10 compute instructions for matrix multiplication operation. Tested on simulator and there are no new test failures. 
--- kernel/generic/gemm_ncopy_16.c | 32 +- kernel/generic/gemm_ncopy_8.c | 44 +- kernel/generic/gemm_tcopy_16.c | 26 +- kernel/generic/gemm_tcopy_8.c | 46 +- kernel/power/KERNEL.POWER10 | 11 + kernel/power/shgemm_kernel_power10.c | 1044 ++++++++++++++++++++++++++ param.h | 13 + 7 files changed, 1142 insertions(+), 74 deletions(-) create mode 100644 kernel/power/shgemm_kernel_power10.c diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c index 5f91d0dbe..d3ab46472 100644 --- a/kernel/generic/gemm_ncopy_16.c +++ b/kernel/generic/gemm_ncopy_16.c @@ -39,24 +39,24 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; - FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; - - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, 
ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c index a49a778e6..aaf9c8917 100644 --- a/kernel/generic/gemm_ncopy_8.c +++ b/kernel/generic/gemm_ncopy_8.c @@ -39,30 +39,30 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; 
+ IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c index 56268ebf2..14252599a 100644 --- a/kernel/generic/gemm_tcopy_16.c +++ b/kernel/generic/gemm_tcopy_16.c @@ -39,22 +39,22 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2; - FLOAT *boffset; - - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; + + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c index b28f3d219..3e8a839db 100644 --- a/kernel/generic/gemm_tcopy_8.c +++ b/kernel/generic/gemm_tcopy_8.c @@ -39,32 +39,32 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - - FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; - - FLOAT 
ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 4fc7190b0..39f5e9414 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,6 +7,17 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = shgemm_kernel_power10.c +SHGEMMINCOPY = 
../generic/gemm_ncopy_16.c +SHGEMMITCOPY = ../generic/gemm_tcopy_16.c +SHGEMMONCOPY = ../generic/gemm_ncopy_8.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) + STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c CTRMMKERNEL = cgemm_kernel_power10.S diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c new file mode 100644 index 000000000..7455f925c --- /dev/null +++ b/kernel/power/shgemm_kernel_power10.c @@ -0,0 +1,1044 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include +#if defined(HALF) && defined(HALFCONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short *q = (unsigned short *) (&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} + +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +vector char mask = + { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, + 0xf +}; + +/* + * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of + * bfloat16 floating-point values as input. Hence this + * merging is needed on A and B matrices. 
+ */ +#define MERGE_ROW(x) vec_perm(x, x, mask) +#define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) +#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) + +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; + +#define MMA __builtin_mma_xvbf16ger2pp + +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + 
__builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); +/************************************************************************************* +* SHGEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, + IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + BLASLONG N = n; + BLASLONG i1; + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + N = n >> 3; + /* Loop for n >= 8. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); + vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); + vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); + vec_t rowA2_l = MERGE_LOW (rowA[1], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); + CO += 16; + + AO += (k << 4); + BO += (k << 3); + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + CO += 8; + AO += (k << 3); + BO += (k << 3); + } + i = (m & 7) >> 2; + /* Loop for m >= 4. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowA_mrg = MERGE_ROW (rowA[0]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (k << 2); + BO += (k << 3); + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 2)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (k << 1); + BO += (k << 3); + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 } + , t1 = + { + 0, 0, 0, 0}; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), + BF16TOF32 (BO[(l << 3) + 2]), + BF16TOF32 (BO[(l << 3) + 3]) + }; + v4sf_t rowB1 = + { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), + BF16TOF32 (BO[(l << 3) + 6]), + BF16TOF32 (BO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; + CO += 1; + AO += k; + BO += (k << 3); + } + B += k << 3; + } + N = (n & 7) >> 2; + /* Loop for n >= 4. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; + i = m >> 5; + /* Loop for m >= 32. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + IFLOAT *A1 = AO + (16 * k); + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += k << 4; + BO += k << 2; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 2; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 2; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += k << 1; + BO += k << 2; + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), + BF16TOF32 (BO[(l << 2) + 2]), + BF16TOF32 (BO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + AO += k; + BO += (k << 2); + CO += 1; + } + + B += k << 2; + } + N = (n & 3) >> 1; + /* Loop for n >= 2. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 5; + /* Loop for m >= 32. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + IFLOAT *A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowA1 = (vec_t *) & (A1[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += k << 4; + BO += k << 1; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 1; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 1; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (k << 1); l += 2) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l + 1]), + BF16TOF32 (AO[l + 1]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l + 1]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l + 1]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; + CO += 2; + AO += k << 1; + BO += k << 1; + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), 0, 0 }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 1]), BF16TOF32 (BO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + CO[0 * ldc] += t[0] * alpha; + CO[1 * ldc] += t[1] * alpha; + CO += 1; + AO += k; + BO += k << 1; + } + B += k << 1; + } + N = (n & 1) >> 0; + /* Loop for n = 1. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + /* Loop for m >= 16. 
*/ + while (i >= 16) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), + BF16TOF32 (AO[(l << 4) + 2]), + BF16TOF32 (AO[(l << 4) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), + BF16TOF32 (AO[(l << 4) + 6]), + BF16TOF32 (AO[(l << 4) + 7]) + }; + v4sf_t rowA2 = + { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), + BF16TOF32 (AO[(l << 4) + 10]), + BF16TOF32 (AO[(l << 4) + 11]) + }; + v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), + BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), + BF16TOF32 (AO[(l << 4) + 15]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; + AO += k << 4; + BO += k; + CO += 16; + i -= 16; + } + /* Loop for m >= 8. 
*/ + while (i >= 8) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), + BF16TOF32 (AO[(l << 3) + 2]), + BF16TOF32 (AO[(l << 3) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), + BF16TOF32 (AO[(l << 3) + 6]), + BF16TOF32 (AO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + AO += k << 3; + BO += k; + CO += 8; + i -= 8; + } + /* Loop for m >= 4. */ + while (i >= 4) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), + BF16TOF32 (AO[(l << 2) + 2]), + BF16TOF32 (AO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + AO += k << 2; + BO += k; + CO += 4; + i -= 4; + } + /* Loop for m >= 2. */ + while (i >= 2) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), 0, 0 }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 1]), BF16TOF32 (AO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + AO += k << 1; + BO += k; + CO += 2; + i -= 2; + } + /* Loop for m = 1. 
*/ + while (i >= 1) + { + IFLOAT *BO = B; + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < k; l++) + { + t += BF16TOF32 (AO[l]) * BF16TOF32 (BO[l]); + } + AO += k; + BO += k; + CO[0] += t * alpha; + CO += 1; + i -= 1; + } + + B += k; + } + + return 0; +} diff --git a/param.h b/param.h index fd0ea7599..e8cf53f0a 100644 --- a/param.h +++ b/param.h @@ -2297,6 +2297,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(POWER10) +#undef SHGEMM_DEFAULT_UNROLL_N +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_P +#undef SHGEMM_DEFAULT_R +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_UNROLL_M 16 +#define SHGEMM_DEFAULT_UNROLL_N 8 +#define SHGEMM_DEFAULT_P 832 +#define SHGEMM_DEFAULT_Q 1026 +#define SHGEMM_DEFAULT_R 4096 +#endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 From e30ad0e521e77d3b72b8d46c18434cc911374f8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Jun 2020 09:00:43 +0200 Subject: [PATCH 274/593] Strip UTF8 byte order marker from source --- kernel/x86_64/sgemm_kernel_8x4_haswell_2.c | 2 +- kernel/x86_64/strsm_kernel_8x4_haswell_LN.c | 2 +- kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c index 5ab3e6d1f..a2e78c58d 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -1,4 +1,4 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ /* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ diff --git 
a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c index 4131debb1..5410bd4ae 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" #include #include "strsm_kernel_8x4_haswell_L_common.h" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h index cfa56da97..2862a5b8d 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -1,4 +1,4 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ From 584ef8d4ae57d9eda3a8e27b84d2d37c60e8e4a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Jun 2020 14:36:37 +0200 Subject: [PATCH 275/593] Add support for Comet Lake H & S --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7677f265a..c03b0b21d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -619,7 +619,7 @@ static gotoblas_t *get_coretype(void){ } } case 10: - if (model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From 83f47468254c5bca8e86a659e709de3f2cc4ffd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Jun 2020 14:41:24 +0200 Subject: [PATCH 276/593] Add support for Comet Lake H and S --- cpuid_x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 3538690b9..356800b78 100644 --- a/cpuid_x86.c +++ 
b/cpuid_x86.c @@ -1409,6 +1409,7 @@ int get_cpuname(void){ } case 10: //family 6 exmodel 10 switch (model) { + case 5: // Comet Lake H and S case 6: // Comet Lake U if(support_avx2()) return CPUTYPE_HASWELL; @@ -1967,16 +1968,16 @@ int get_coretype(void){ break; case 10: switch (model) { - case 6: - // Comet Lake U + case 5: // Comet Lake H and S + case 6: // Comet Lake U if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else - return CORE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; #endif else - return CORE_NEHALEM; + return CORE_NEHALEM; } case 5: switch (model) { From 634e1305f9caf640dfa42e61d4da564d8aedf16b Mon Sep 17 00:00:00 2001 From: EGuesnet <51407514+EGuesnet@users.noreply.github.com> Date: Tue, 30 Jun 2020 15:16:39 +0200 Subject: [PATCH 277/593] Update cgemm_kernel_8x4_power8.S --- kernel/power/cgemm_kernel_8x4_power8.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 2bc99974f..6be8c128c 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -424,7 +424,7 @@ L999: lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) - addi r11, 224 + addi r11, SP, 224 #endif lvx v20, r11, r0 addi r11, r11, 16 @@ -459,4 +459,4 @@ L999: blr EPILOGUE -#endif^ +#endif From 4ab3651591d231c69f0f16dbeae26e2cc7ee819f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 2 Jul 2020 17:00:15 +0200 Subject: [PATCH 278/593] Option -mavx2 requires at least gcc 4.7 --- Makefile.x86_64 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2de51ef4..2676bd258 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -31,14 +31,24 @@ ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut 
-f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) CCOMMON_OPT += -mavx2 endif +endif ifeq ($(F_COMPILER), GFORTRAN) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) FCOMMON_OPT += -mavx2 endif endif endif endif +endif From 10a2923f640e9b1aa3f8bca34e71481586aa3acd Mon Sep 17 00:00:00 2001 From: Jussi Enkovaara Date: Tue, 7 Jul 2020 13:35:43 +0300 Subject: [PATCH 279/593] fixes #2238 Always obey omp_get_max_threads() when build with USE_OPENMP --- common_thread.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common_thread.h b/common_thread.h index 6ec40e096..ec0c65b22 100644 --- a/common_thread.h +++ b/common_thread.h @@ -132,18 +132,18 @@ extern int blas_server_avail; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP - int openmp_nthreads=0; + int openmp_nthreads=omp_get_max_threads(); #endif +#ifndef USE_OPENMP if (blas_cpu_number == 1 - +#endif #ifdef USE_OPENMP - || omp_in_parallel() + if (openmp_nthreads == 1 || omp_in_parallel() #endif - ) return 1; + ) return 1; #ifdef USE_OPENMP - openmp_nthreads=omp_get_max_threads(); if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } From 8751a69271721b0593eafecd1cdd974d2839c864 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Jul 2020 15:46:32 +0200 Subject: [PATCH 280/593] Obtain actual cpu count on AIX and suppress spurious NO_AVX512 on non-x86 --- getarch.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/getarch.c b/getarch.c index 164947f3e..2cdf77259 100644 --- a/getarch.c +++ b/getarch.c @@ -90,11 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #endif +#if defined(AIX) +#include +#endif +#if defined(__x86_64__) || defined(_M_X64) #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else #define NO_AVX512 #endif +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -1297,6 +1302,11 @@ static int get_num_cores(void) { sysctl(m, 2, &count, &len, NULL, 0); return count; + +#elif defined(AIX) + //returns the number of processors which are currently online + return sysconf(_SC_NPROCESSORS_ONLN); + #else return 2; #endif From 45d819ca82f6a562de04cc5cfd3b70fd513fd4b8 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 7 Jul 2020 11:25:20 -0500 Subject: [PATCH 281/593] Changing mcpu option as power10 As compiler enabled mcpu option as power10, changing it from future. --- Makefile.power | 8 ++++---- driver/others/dynamic_power.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.power b/Makefile.power index 5c431860f..beb311945 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,11 +11,11 @@ endif ifeq ($(CORE), POWER10) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math endif endif diff --git 
a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 811a5fae3..f625b9431 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -45,7 +45,7 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER9; #endif #if (!defined __GNUC__) || ( __GNUC__ >= 11) - if (__builtin_cpu_is("isa_3_1") && __builtin_cpu_supports ("mma")) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif return NULL; From 1d63631afe3da02ade6aa7ca7698b08754c148a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 11:42:02 +0200 Subject: [PATCH 282/593] Add lapack-test --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb5322a1d..7cdc4181a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 10.dev) - +set(OpenBLAS_PATCH_VERSION 9.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -236,6 +235,10 @@ if (NOT MSVC AND NOT NOFORTRAN) endif() endif() +if (NOT NOFORTRAN) + add_subdirectory(lapack-netlib/TESTING) +endif() + set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} From 60188a8c82398281794956f41c3e7232f0004532 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 11:44:31 +0200 Subject: [PATCH 283/593] Append crude hack for enabling lapack tests in the OpenBLAS build --- lapack-netlib/TESTING/CMakeLists.txt | 393 +++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index d5ca95013..755826bfe 100644 --- 
a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -1,3 +1,5 @@ +enable_testing() + if(MSVC_VERSION) # string(REPLACE "/STACK:10000000" "/STACK:900000000000000000" # CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") @@ -168,3 +170,394 @@ if(PYTHONINTERP_FOUND) COMMAND ${PYTHON_EXECUTABLE} "lapack_testing.py" ) endif() + + + +# $1 exec, $2 input, $3 output_result +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh +"rm -f $3\n" +"$1 < $2\n" +"grep -q FATAL $3\n" +"if [ $? -eq 0 ]; then\n" +"echo Error\n" +"exit 1\n" +"else\n" +"exit 0\n" +"fi\n" +) + + +add_test(NAME "REAL_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" +) +add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" +) +add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" +) +add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" +) + +add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" +) +# ======== COMPLEX-COMPLEX16 LIN TESTS ======================== + 
+add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" +) + +# ======== SINGLE RFP LIN TESTS ======================== + +add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" +) + +# ======== DOUBLE RFP LIN TESTS ======================== + +add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" +) +# ======== COMPLEX RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" +) + +# ======== COMPLEX16 RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" +) +# +# +# ======== SINGLE EIG TESTS =========================== +# + +add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" 
" ${CMAKE_CURRENT_BINARY_DIR}/snep.out" +) + +add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" +) + +add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" +) + +add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" +) + +add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" +) + +add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" +) + +add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" +) + +add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" +) + +add_test(NAME 
"SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" +) + +add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" +) + +add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" +) + +add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" +) + +add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" +) + +add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" +) + +add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" +) + +add_test(NAME 
"SGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" +) + +add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" +) + +add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" +) + +add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" +) + +add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" +) + +# ======== COMPLEX EIG TESTS =========================== + +add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" +) + +add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" +) + +add_test(NAME 
"CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" +) + +add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" +) + +add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" +) + +add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" +) + +add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" +) + +add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" +) + +add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" +) + +add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" +) + +add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" +) + +add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" +) + +add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" +) + +add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" +) + +add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" +) + +add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" +) + +add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" +) + +add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" +) + +add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" +) + +add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" +) + +# ======== DOUBLE EIG TESTS =========================== + +add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" +) + +add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" +) + +add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" +) + +add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" +) + +add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" +) + +add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" +) + +add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" +) + +add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" +) + +add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" +) + +add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" +) + +add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" +) + +add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" +) + +add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" +) + +add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" +) + +add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" +) + +add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" +) + +add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" +) + +add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND 
sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" +) + +add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" +) + +add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" +) + +# ======== COMPLEX16 EIG TESTS =========================== + +add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" +) + +add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" +) + +add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" +) + +add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" +) + +add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" +) + +add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" +) + +add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" +) + +add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" +) + +add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" +) + +add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" +) + +add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" +) + +add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" +) + +add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" +) + +add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" +) + +add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" +) + +add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" +) + +add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" +) + +add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" +) + +add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" +) + +add_test(NAME "Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" +) From 29b5887d5f00bd94478fe84ac4518c4cb0391941 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:12:35 +0200 Subject: [PATCH 284/593] Modify for building with OpenBLAS --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 20fd25b4a..70eea8443 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,7 @@ set(ZEIGTST zchkee.f macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas) endmacro() if(BUILD_SINGLE) From c502760befbb25e6a9415dbd6b1e811f711e7cf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:13:16 +0200 Subject: [PATCH 285/593] Modify for building with OpenBLAS --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index c941d3577..954cab193 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,8 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas) +#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) From 
f76602474945ce7d5f930080e2b3fd016e945bc9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:44:25 +0200 Subject: [PATCH 286/593] enable fortran for cmake --- lapack-netlib/TESTING/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index 755826bfe..80e6b3232 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -1,3 +1,5 @@ +enable_language(Fortran) + enable_testing() if(MSVC_VERSION) From d4a0299e166b33ed9d61a488018f3e1bb5491d30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:57:27 +0200 Subject: [PATCH 287/593] Do not build lapack-test on MSVC for now (same as with BLAS test) --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cdc4181a..7e51e7e38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,9 +233,6 @@ if (NOT MSVC AND NOT NOFORTRAN) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() -endif() - -if (NOT NOFORTRAN) add_subdirectory(lapack-netlib/TESTING) endif() From af1e140e35cbdbb4d1f98addf3b817b1369460a3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 9 Jul 2020 21:46:06 -0500 Subject: [PATCH 288/593] Change minimum gcc version for POWER10 As the MMA patches for POWER10 are backported to gcc10.2, changing the minimum gcc version needed to build OpenBLAS for POWER10. 
--- Makefile.system | 7 ++++++- driver/others/dynamic_power.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1b473c59d..61ae264bf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -286,6 +286,8 @@ GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) # Note that the behavior of -dumpversion is compile-time-configurable for # gcc-7.x and newer. Use -dumpfullversion there ifeq ($(GCCVERSIONGTEQ7),1) @@ -619,9 +621,12 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 +else ifeq ($(GCCVERSIONEQ10), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) +DYNAMIC_CORE += POWER10 +endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
endif diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index f625b9431..ca1d42408 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,7 +6,11 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ + || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) +#define HAVE_P10_SUPPORT 1 +#endif +#ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif @@ -28,7 +32,7 @@ char *gotoblas_corename(void) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#ifdef HAVE_P10_SUPPORT if (gotoblas == &gotoblas_POWER10) return corename[4]; #endif return corename[0]; @@ -44,7 +48,7 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#ifdef HAVE_P10_SUPPORT if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif @@ -73,7 +77,7 @@ static gotoblas_t *force_coretype(char * coretype) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#ifdef HAVE_P10_SUPPORT case 4: return (&gotoblas_POWER10); #endif default: return NULL; From ae3a90f78f7c34a7d53b3650637f5c442b19940c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jul 2020 18:51:58 +0200 Subject: [PATCH 289/593] merge overwritten part of power10 support --- cpuid_power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_power.c b/cpuid_power.c index ed51df211..8f578d68f 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -57,6 +57,7 @@ #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 #define CPUTYPE_POWER9 9 +#define CPUTYPE_POWER10 10 char *cpuname[] = { "UNKNOWN", From 
5865c7d4d6bc3a5a32a477d181a1568e95b7c167 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jul 2020 18:59:01 +0200 Subject: [PATCH 290/593] Make 32bit POWER8 use POWER6 kernels for now --- Makefile.system | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.system b/Makefile.system index 1b473c59d..e3b644cf3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -109,6 +109,9 @@ endif ifeq ($(TARGET), ARMV8) GETARCH_FLAGS := -DFORCE_ARMV7 endif +ifeq ($(TARGET), POWER8) +GETARCH_FLAGS := -DFORCE_POWER6 +endif endif From bd2498c88643834f49f6d0bc764754631a71ee50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:07:58 +0200 Subject: [PATCH 291/593] Use POWER6 GEMM parameters on 32bit POWER8 --- param.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index e8cf53f0a..efe0e1096 100644 --- a/param.h +++ b/param.h @@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL - +#if defined(__32BIT__) +#warning using BINARY32==POWER6 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 - +#endif #define SGEMM_DEFAULT_P 1280UL #define DGEMM_DEFAULT_P 640UL #define CGEMM_DEFAULT_P 640UL From b144423f0f4d91e0f642b4c4c66b1cf919fcae0e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:10:12 +0200 Subject: [PATCH 292/593] Do not define USE_TRMM for 32bit POWER8 --- kernel/Makefile.L3 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index dfdaf5cf4..1904264be 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,8 +44,10 @@ USE_TRMM = 1 endif ifeq ($(CORE), POWER8) +ifeq ($(BINARY64),1) USE_TRMM = 1 endif +endif ifeq ($(CORE), POWER9) USE_TRMM = 1 @@ -514,7 +516,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -530,7 +532,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s From f8c2697701dfbcc3cba307245aab06134c86f53f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:11:19 +0200 Subject: [PATCH 293/593] Use POWER6 GEMM, TRMM and DTRSM on 32bit POWER8 --- kernel/power/KERNEL.POWER8 | 84 ++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 
b/kernel/power/KERNEL.POWER8 index 7fba5b4d6..dc6646d50 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,51 @@ +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +$(info baue power6) +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S + +CAXPYKERNEL = zaxpy.S + +else + +$(info baue power8) #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -47,16 +95,21 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +#DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +#DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +#DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +#DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -153,15 +206,15 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power8.S -else -CAXPYKERNEL = caxpy.c -endif -else -CAXPYKERNEL = caxpy.c -endif +#ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +#ifneq ($(GCCVERSIONGTEQ9),1) +#CAXPYKERNEL = caxpy_power8.S +#else +#CAXPYKERNEL = caxpy.c +#endif +#else +#CAXPYKERNEL = caxpy.c +#endif # ZAXPYKERNEL = zaxpy.c # @@ -173,7 +226,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = cdot.c +CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -183,7 +236,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -CROTKERNEL = crot.c +#CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c @@ -239,4 +292,3 @@ IDAMINKERNEL = ../arm/iamin.c IZAMAXKERNEL = ../arm/izamax.c IZAMINKERNEL = ../arm/izamin.c endif - From 
da17abec871ed96e1c959eee4ad11a1346d25b2d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:20:03 +0200 Subject: [PATCH 294/593] fix trailing whitespace --- kernel/Makefile.L3 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1904264be..d5de070a5 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -516,7 +516,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -532,7 +532,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s From 417c4e8af8ab1a985ddd8d7fe15cf13d47cd82a3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 14 Jul 2020 11:54:04 -0500 Subject: [PATCH 295/593] Add new linker option for POWER10 While building with DYNAMIC_ARCH on POWER9 with POWER10 aware toolchain, new LDFLAG is needed to avoid POWER10 instructions on PLT calls . 
--- Makefile.system | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 4f17c25b9..3312a0be3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -617,6 +617,7 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 +override LDFLAGS += -Wl,-no-power10-stubs endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -626,9 +627,11 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 -else ifeq ($(GCCVERSIONEQ10), 1) +override LDFLAGS += -Wl,-no-power10-stubs +else ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1) DYNAMIC_CORE += POWER10 +override LDFLAGS += -Wl,-no-power10-stubs endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) From f308e741b2cad79196b096fde3aad9b562b1410a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Jul 2020 10:00:07 +0200 Subject: [PATCH 296/593] remove debug output and revert changes to cdot and crot --- kernel/power/KERNEL.POWER8 | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index dc6646d50..bb93a6a23 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,5 +1,4 @@ ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) -$(info baue power6) SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = SGEMMITCOPY = @@ -45,7 +44,6 @@ CAXPYKERNEL = zaxpy.S else -$(info baue power8) #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -226,7 +224,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -236,7 +234,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = 
crot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c From 0033f8be0d8fcc5c8ae9ba8f0cae556297015c81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Jul 2020 23:32:54 +0200 Subject: [PATCH 297/593] Use vec_vsx_ld/st to fix misaligned accesses flagged by asan --- kernel/power/saxpy.c | 96 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc..360d64146 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -28,6 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 +#define offset_4 64 +#define offset_5 80 +#define offset_6 96 +#define offset_7 112 +#define offset_8 128 +#define offset_9 144 +#define offset_10 160 +#define offset_11 176 +#define offset_12 192 +#define offset_13 208 +#define offset_14 224 +#define offset_15 240 @@ -37,12 +53,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG i = 0; - __vector float v_a = {alpha,alpha,alpha,alpha}; - __vector float * v_y=(__vector float *)y; - __vector float * v_x=(__vector float *)x; + __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; + __vector float * vptr_y =(__vector float *)y; + __vector float * vptr_x =(__vector float *)x; for(; i Date: Thu, 16 Jul 2020 22:17:39 +0000 Subject: [PATCH 298/593] handle missing lack of fortran compiler more gracefully --- cmake/f_check.cmake | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index f877fc3e1..1fd6c2ad2 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -21,7 +21,14 @@ # NEED2UNDERSCORES if (NOT NO_LAPACK) - enable_language(Fortran) + check_language(Fortran) + if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + else() + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + set (NOFORTRAN 1) + set (NO_LAPACK 1) + endif() else() include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) From 9d000ecaa2c888d2e777c7223602e5811858f8a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Jul 2020 22:36:35 +0000 Subject: [PATCH 299/593] include CheckLanguage module --- cmake/f_check.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 1fd6c2ad2..0f5d0e15d 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -21,6 +21,7 @@ # NEED2UNDERSCORES if (NOT NO_LAPACK) + include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) enable_language(Fortran) From 26b7f24d164150d80f3672018c836e8a4f20260b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 12:51:37 +0000 Subject: [PATCH 300/593] Update cross-compiling example to reflect change in Loongson gcc for #2723 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md 
b/README.md index 6dc3c7b42..4e5e3e956 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,10 @@ Examples: ```sh make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A ``` + or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI: + ```sh + make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A + ``` * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: ```sh From 4afd11dae5c254b3c78cd0fa241fe14305e599dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 16:57:41 +0000 Subject: [PATCH 301/593] Add a check for C11 atomics and stdatomic.h --- c_check | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/c_check b/c_check index dd700b8b4..314c2b157 100644 --- a/c_check +++ b/c_check @@ -249,6 +249,28 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } } +$c11_atomics = 0; +if ($data =~ /HAVE_C11/) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; + $c11_atomics = 0; + } else { + ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); + print $tmpf "#include \nint main(void){}\n"; + $args = " -c -o $tmpf.o $tmpf"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? 
!= 0) { + $c11_atomics = 0; + } else { + $c11_atomics = 1; + } + unlink("$tmpf.o"); + } +} + + $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -352,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; +print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; + if ($os eq "LINUX") { From 97d6eb97b15d2ece319da9c741ca13b2976013cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 16:59:33 +0000 Subject: [PATCH 302/593] Report availability of C11 support --- ctest.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest.c b/ctest.c index 5e869b901..cd84ab1bb 100644 --- a/ctest.c +++ b/ctest.c @@ -153,3 +153,6 @@ ARCH_ARM ARCH_ARM64 #endif +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +HAVE_C11 +#endif From 94bab9d1f92325aec79aecc9daacfaef8903d359 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:03:31 +0000 Subject: [PATCH 303/593] Update conditional for atomics to use HAVE_C11 --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 04b614a6e..756e51b5d 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -141,7 +141,7 @@ typedef struct { } thread_status_t; -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else From 791e046744116bbf06649ae43adf0febdcebb6a9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:05:59 +0000 Subject: [PATCH 304/593] Update conditional for atomics to use HAVE_C11 --- 
driver/others/blas_server_omp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4255852c8..b4eb27c25 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -55,7 +55,7 @@ int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #else static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -320,7 +320,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Bool inuse = false; if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { #else @@ -345,7 +345,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ exec_threads(&queue[i], buf_index); } -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 atomic_store(&blas_buffer_inuse[buf_index], false); #else blas_buffer_inuse[buf_index] = false; From 09eb9d2584bd978815571b2860f06dedc9f606d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:07:38 +0000 Subject: [PATCH 305/593] Update conditional for atomics to HAVE_C11 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index a5595aed4..9b6c226a1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1095,7 +1095,7 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static _Atomic int memory_initialized = 0; #else static volatile int memory_initialized = 0; From 6f38de06d2a0ce372c631b01380be58932ec159a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:09:01 +0000 Subject: [PATCH 306/593] Update conditional for atomics to use HAVE_C11 --- 
driver/level3/level3_gemm3m_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 9216daaed..39824fc5a 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile From ce45af8151c96fcb1c75d3985d96c5b64a68f823 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:09:56 +0000 Subject: [PATCH 307/593] Update conditional for atomics to use HAVE_C11 --- driver/level3/level3_syrk_threaded.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 574f825b0..a041abac3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile From a36eb19ae0dfab6714f82abf90b1394012888ff3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:13:24 +0000 Subject: [PATCH 308/593] Update conditional for C11 atomics to use HAVE_C11 --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 00b34a3f7..d6637abe4 100644 --- a/common.h +++ b/common.h @@ -681,7 +681,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #if defined(C_GCC) && ( __GNUC__ < 7) // workaround for GCC bug 65467 #ifndef _Atomic From f4f74941bd5fadfe3fd662f4da8355f2c6250949 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:14:50 +0000 Subject: [PATCH 309/593] Update conditional for atomics to use HAVE_C11 --- lapack/getrf/getrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c602822a8..fc410b0e7 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -68,7 +68,7 @@ double sqrt(double); #define GETRF_FACTOR 1.00 -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else From bbe119ee3bc0393dbc1d3422690c5f628576a3b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:19:59 +0000 Subject: [PATCH 310/593] Update conditional for atomics to use HAVE_C11 --- lapack/getrf/potrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c index 312509685..008fcb8cc 100644 --- a/lapack/getrf/potrf_parallel.c +++ b/lapack/getrf/potrf_parallel.c @@ -101,7 +101,7 @@ static FLOAT dm1 = -1.; #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile From 9e21a100e32059adf102b300d2f52085cc25adb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 20 Jul 2020 22:52:09 +0000 Subject: [PATCH 311/593] Add trivial check for stdatomic.h --- cmake/system_check.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94eb0a9c6..4382ffc4e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -116,3 +116,10 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.c" "avx512.o") endif() + +include(CheckIncludeFile) +CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) +if (HAVE_C11 EQUAL 1) +message (STATUS found stdatomic.h) +set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") +endif() From 9ae154ba899c0c2b98a999432c8b480f8ec2da53 Mon Sep 17 00:00:00 2001 From: Wileam Phan Date: Mon, 20 Jul 2020 23:30:28 -0400 Subject: [PATCH 312/593] Patch for building on Summit --- Makefile.power | 43 
++++++++++++++++++++++++++++++++++++++----- Makefile.system | 22 ++++++++++++++++++++++ exports/Makefile | 4 ++++ f_check | 3 +++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/Makefile.power b/Makefile.power index ea84f5945..bf7037995 100644 --- a/Makefile.power +++ b/Makefile.power @@ -21,23 +21,56 @@ endif ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp +endif +else +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +else +FCOMMON_OPT += -O2 -Mrecursive +endif endif endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power 
-fno-fast-math +FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp +endif +else +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +else +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +endif +else +FCOMMON_OPT += -O2 -Mrecursive endif endif endif diff --git a/Makefile.system b/Makefile.system index 3312a0be3..d62c66ad3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -796,8 +796,19 @@ endif ifeq ($(C_COMPILER), PGI) ifdef BINARY64 +ifeq ($(ARCH), x86_64) CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +CCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +CCOMMON_OPT += -tp pwr9 +endif +endif +endif +else CCOMMON_OPT += -tp p7 endif endif @@ -960,8 +971,19 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif +ifeq ($(ARCH), x86_64) FCOMMON_OPT += -tp p7-64 else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +FCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +FCOMMON_OPT += -tp pwr9 +endif +endif +endif +else FCOMMON_OPT += -tp p7 endif FCOMMON_OPT += -Mrecursive diff --git a/exports/Makefile b/exports/Makefile index 01a313b35..75901586c 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -55,6 +55,10 @@ endif endif endif +ifeq ($(C_COMPILER), PGI) +EXTRALIB += -pgf90libs +endif + ifneq (,$(filter 1 2,$(NOFORTRAN))) FEXTRALIB = endif diff --git a/f_check b/f_check index 17d863224..dd4d3475c 100644 --- a/f_check +++ b/f_check @@ -82,6 +82,9 @@ if ($compiler eq "") { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; + } elsif ($compiler =~ /pgf/) { + $vendor = PGI; + $openmp = "-mp"; } else { $vendor = G77; $openmp = ""; From 6c33764ca43c7311bdd61e2371b08395cf3e3f01 Mon Sep 17 00:00:00 
2001 From: Martin Kroeker Date: Wed, 22 Jul 2020 17:30:55 +0000 Subject: [PATCH 313/593] Unify BUFFER_SIZE settings for x86_64 again to fix potentially fatal mismatch in DYNAMIC_ARCH builds --- common_x86_64.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 15d0c30aa..bee7e8cdb 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -228,14 +228,8 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ #define HUGE_PAGESIZE ( 2 << 20) #ifndef BUFFERSIZE -#if defined(SKYLAKEX) -#define BUFFER_SIZE (32 << 21) -#elif defined(HASWELL) || defined(ZEN) #define BUFFER_SIZE (32 << 22) #else -#define BUFFER_SIZE (32 << 20) -#endif -#else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif From 9796e552eaa8dff68bba3bbb45f2039032a1fb99 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:03:28 +0200 Subject: [PATCH 314/593] Avoid undefining NAME,CNAME etc for pgcc as it makes it ignore the new defininitions --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index d62c66ad3..cc72c02e8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1241,7 +1241,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) From 661c6bfa5a245fdcfd0788d29dff4ce83a508e1e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:08:20 +0200 Subject: [PATCH 315/593] Exclude altivec code paths if the compiler does not support them --- kernel/power/casum.c | 2 ++ kernel/power/ccopy.c | 2 ++ kernel/power/cdot.c | 4 ++++ kernel/power/cgemv_n.c | 7 +++++-- kernel/power/cgemv_t.c | 7 +++++-- kernel/power/crot.c | 2 ++ kernel/power/cswap.c | 2 ++ 
kernel/power/dasum.c | 2 ++ kernel/power/daxpy.c | 2 ++ kernel/power/dcopy.c | 2 ++ kernel/power/ddot.c | 2 ++ kernel/power/dgemv_n.c | 2 ++ kernel/power/dgemv_t.c | 7 ++++++- kernel/power/drot.c | 2 ++ kernel/power/dscal.c | 2 ++ kernel/power/dswap.c | 2 ++ kernel/power/idamax.c | 9 +++++++++ kernel/power/idamin.c | 7 ++++++- kernel/power/izamax.c | 6 +++++- kernel/power/izamin.c | 8 +++++--- kernel/power/sasum.c | 2 ++ kernel/power/saxpy.c | 4 ++++ kernel/power/scopy.c | 2 ++ kernel/power/sdot.c | 3 +++ kernel/power/sgemv_n.c | 4 ++++ kernel/power/sgemv_t.c | 5 +++++ kernel/power/srot.c | 2 ++ kernel/power/sscal.c | 2 ++ kernel/power/sswap.c | 2 ++ kernel/power/zasum.c | 2 ++ kernel/power/zaxpy.c | 2 ++ kernel/power/zcopy.c | 2 ++ kernel/power/zdot.c | 12 +++++++++++- kernel/power/zgemv_n_4.c | 3 +++ kernel/power/zgemv_t_4.c | 3 +++ kernel/power/zrot.c | 5 ++++- kernel/power/zscal.c | 2 ++ kernel/power/zswap.c | 2 ++ 38 files changed, 126 insertions(+), 12 deletions(-) diff --git a/kernel/power/casum.c b/kernel/power/casum.c index 3478a39ef..06982bfba 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "casum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index cbe5b48d2..5e58034dd 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ccopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index d5b18729a..ef5e4710f 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zdot.c" +#else #include "common.h" #ifndef HAVE_KERNEL_8 @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA return (result); } +#endif diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index eec3fa37c..8663039c5 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_n.c" +#else #include #include @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 691f7a3d3..1bfc235db 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else #include "common.h" @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 5c1d44620..fb4860dcd 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) } #endif +#endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 88cb1d638..5144a2e93 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "cswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 09e06d909..999dc677a 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index 018beafd1..2de4e0911 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "daxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index cf203e71e..24279f8a2 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index bd9e1fb97..c5493015a 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ddot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index b4dfda550..ac365b3b2 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dgemv_n_microk_power8.c" #endif +#endif #define NBMAX 4096 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 5d43f673f..09abd5a43 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" +#else + #include "common.h" #define NBMAX 1024 //#define PREFETCH 1 + #include #define HAVE_KERNEL4x8_ASM 1 - #if defined(HAVE_KERNEL4x8_ASM) static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return (0); } +#endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index b808ab566..951c2f9c9 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "drot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 7e0fe48c0..39293252b 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_8) diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 795bb10b4..ff3f95c79 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 195a8c68e..5016f67dd 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include +#if defined(__VEC__) || defined(__ALTIVEC__) #include +#endif + #if defined(DOUBLE) #define ABS fabs @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { return index; } +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) > maxf) { diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 8a5538821..e37718c48 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find minimum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } - +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) < minf) { diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index 7149da28b..fe9d5bf95 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find maximum index @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { } - +#endif @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 692315b89..94f2383e0 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ABS fabs #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find minimum index @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } +#endif @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - - diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index b259d7d76..733137012 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc..d005427b5 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -30,6 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#if defined(__VEC__) || defined(__ALTIVEC__) #ifndef HAVE_KERNEL_8 #include @@ -62,6 +63,7 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) } } #endif +#endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { @@ -74,11 +76,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { BLASLONG n1 = n & -64; +#if defined(__VEC__) || defined(__ALTIVEC__) if ( n1 ) saxpy_kernel_64(n1, x, y, da); i = n1; +#endif while(i < n) { diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 5207d386e..8ff8cb329 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "scopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index 8de434e41..ffeab6638 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + #include "sdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 81ac031a3..5dfb18f5b 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" +#else #include "common.h" @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } +#endif diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 3d8a442dc..62c517a9d 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else #include "common.h" @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } +#endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 9638a59eb..a53342f61 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "srot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index ddd5b2c5b..de37e10a5 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_16) diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index a56434444..44522f0a0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index 8383e39ab..305e50ede 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index 4a7c26c69..3064d5435 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zaxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_4 diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index bb80decd2..453f4e551 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index 9086ef35b..690765797 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) - { + { /* __real__ result = 0.0 ; __imag__ result = 0.0 ; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) + /* __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; + */ + result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else + /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; + */ + result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index ba019d6a5..1f7199c89 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif +#endif // #define NBMAX 4096 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index b34199af6..4ed27d96b 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #define NBMAX 4096 +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 +#endif #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index c6d666178..5e7ca3b23 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zrot.c" +#else #include "common.h" @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - \ No newline at end of file +#endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 16b584bca..5526f4d67 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index c6508f032..3a5a8eb83 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 From 7c6e56b5dfa0dee6e39eef9cc17c10ea92c39ac2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:10:59 +0200 Subject: [PATCH 316/593] Rewrite assignment to complex for better portability --- kernel/arm/zdot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 733c235c6..a9f46dde7 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,10 +48,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; - +/* CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; - +*/ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -71,8 +72,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; + /*CREAL(result) = dot[0]; + CIMAG(result) = dot[1];*/ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); return(result); } From 21072e502ae620186dea2293e91b5685906bdc25 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:34:56 +0000 Subject: [PATCH 317/593] Typo fix --- kernel/power/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index 690765797..fe0e9284e 100644 --- a/kernel/power/zdot.c +++ 
b/kernel/power/zdot.c @@ -157,13 +157,13 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; */ - result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; */ - result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif From ca3561cab9d698b7816544a08848306853c17c01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 18:30:42 +0000 Subject: [PATCH 318/593] Add ifdefs around call to altivec microkernel --- kernel/power/crot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/crot.c b/kernel/power/crot.c index fb4860dcd..84ba5d913 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -185,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { - +#if defined(__VEC__) || defined(__ALTIVEC__) BLASLONG n1 = n & -8; if ( n1 > 0 ) { @@ -193,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; ix=2*n1; } - +#endif while(i < n) { temp[0] = c*x[ix] + s*y[ix] ; From bf1f0734ff8c90261bc0f3b0f3887b489a10f8b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 20:40:13 +0000 Subject: [PATCH 319/593] Use OPENBLAS_MAKE_COMPLEX_FLOAT on PPC only --- kernel/arm/zdot.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index a9f46dde7..ba0e57eb5 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,11 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -/* +#if !defined(__PPC__) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; -*/ 
+#else result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); +#endif if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -72,9 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - /*CREAL(result) = dot[0]; - CIMAG(result) = dot[1];*/ +#if !defined(__POWER__) + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; +#else result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); +#endif return(result); } From 95d37e15754955f5c73195d2ca09208e99600ab9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Jul 2020 10:13:46 +0000 Subject: [PATCH 320/593] Regroup the 32 and 64bit sections and restore 64bit CAXPY --- kernel/power/KERNEL.POWER8 | 49 ++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index bb93a6a23..cbf285913 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,4 @@ +# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = @@ -35,12 +36,6 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = trsm_kernel_power6_LN.S -DTRSMKERNEL_LT = trsm_kernel_power6_LT.S -DTRSMKERNEL_RN = trsm_kernel_power6_LT.S -DTRSMKERNEL_RT = trsm_kernel_power6_RT.S - -CAXPYKERNEL = zaxpy.S else @@ -93,10 +88,6 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif 
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c @@ -104,10 +95,17 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -#DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -#DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -#DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -#DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -204,15 +202,20 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -#ifneq ($(GCCVERSIONGTEQ9),1) -#CAXPYKERNEL = caxpy_power8.S -#else -#CAXPYKERNEL = caxpy.c -#endif -#else -#CAXPYKERNEL = caxpy.c -#endif +ä +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +CAXPYKERNEL = zaxpy.S +else +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power8.S +else +CAXPYKERNEL = caxpy.c +endif +else +CAXPYKERNEL = caxpy.c +endif +endif # ZAXPYKERNEL = zaxpy.c # From 251a09ec903fb05a93bbd36bd4138a73b330f09a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Jul 2020 16:04:58 +0000 Subject: [PATCH 321/593] Typo fix --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index cbf285913..c2f4cd204 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -202,7 +202,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ä 
+# ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) CAXPYKERNEL = zaxpy.S else From 9be2688c78e1646e406e425b4c79e6f82db9f94e Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 24 Jul 2020 23:08:11 -0500 Subject: [PATCH 322/593] Fix to store results in correct order for POWER10 GEMM kernels There is a recent compiler change in __builtin_mma_disassemble_acc() which affects the order of storing result in POWER10. Also removing new LDFLAG -mno-power10-stub as it is handled by linker automatically. --- Makefile.system | 3 - kernel/power/dgemm_kernel_power10.c | 54 ++++++++-------- kernel/power/sgemm_kernel_power10.c | 94 ++++++++++++++-------------- kernel/power/shgemm_kernel_power10.c | 48 +++++++------- 4 files changed, 98 insertions(+), 101 deletions(-) diff --git a/Makefile.system b/Makefile.system index cc72c02e8..db651ef99 100644 --- a/Makefile.system +++ b/Makefile.system @@ -617,7 +617,6 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -627,11 +626,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs else ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b3ee301be..a0bc1a777 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) 
&CO[2*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define SET_ACC_ZERO4() \ diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 01c122c6d..81a5ec76b 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #if defined(TRMMKERNEL) #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[6] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[4] * alpha; \ - rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[4] * alpha; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[6] * alpha; #define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void 
*)result, ACC); \ rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[6] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v2sf_t *) &CO[5* ldc+J]; \ - rowC[0] = result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) 
&CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[5* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define KERNEL(i, j) \ __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c index 7455f925c..1ae9e04bf 100644 --- a/kernel/power/shgemm_kernel_power10.c +++ b/kernel/power/shgemm_kernel_power10.c @@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) #define BF16TOF32(x) x #endif -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); @@ -64,54 +64,54 @@ vector char mask = #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) #define 
SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[5* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC[0] += result[2] * alpha; \ + 
rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #define SET_ACC_ZERO4() \ __builtin_mma_xxsetaccz (&acc0); \ From 4fda217f99f611df04f4dcec8378ee0441fdf6e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jul 2020 06:42:39 +0000 Subject: [PATCH 323/593] Delete potrf_parallel.c (moving it to ../potrf) --- lapack/getrf/potrf_parallel.c | 667 ---------------------------------- 1 file changed, 667 deletions(-) delete mode 100644 lapack/getrf/potrf_parallel.c diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c deleted file mode 100644 index 008fcb8cc..000000000 --- a/lapack/getrf/potrf_parallel.c +++ /dev/null @@ -1,667 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. 
*/ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" - -#ifndef USE_SIMPLE_THREADED_LEVEL3 - -//The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. 
-#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD -#define USE_ALLOC_HEAP -#endif - - -static FLOAT dm1 = -1.; - -#ifndef KERNEL_FUNC -#ifndef LOWER -#define KERNEL_FUNC SYRK_KERNEL_U -#else -#define KERNEL_FUNC SYRK_KERNEL_L -#endif -#endif - -#ifndef LOWER -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_LT -#else -#define TRSM_KERNEL TRSM_KERNEL_LC -#endif -#else -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_RN -#else -#define TRSM_KERNEL TRSM_KERNEL_RR -#endif -#endif - -#ifndef CACHE_LINE_SIZE -#define CACHE_LINE_SIZE 8 -#endif - -#ifndef DIVIDE_RATE -#define DIVIDE_RATE 2 -#endif - -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - -#ifndef LOWER -#define TRANS -#endif - -#ifndef SYRK_LOCAL -#if !defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_UN -#elif !defined(LOWER) && defined(TRANS) -#define SYRK_LOCAL SYRK_UT -#elif defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_LN -#else -#define SYRK_LOCAL SYRK_LT -#endif -#endif - -typedef struct { -#ifdef HAVE_C11 - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; -} job_t; - - -#ifndef KERNEL_OPERATION -#ifndef COMPLEX -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#else -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#endif -#endif - -#ifndef ICOPY_OPERATION -#ifndef TRANS -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef OCOPY_OPERATION -#ifdef TRANS -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + 
(Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef S -#define S args -> a -#endif -#ifndef A -#define A args -> b -#endif -#ifndef C -#define C args -> c -#endif -#ifndef LDA -#define LDA args -> lda -#endif -#ifndef N -#define N args -> m -#endif -#ifndef K -#define K args -> k -#endif - -static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - - FLOAT *buffer[DIVIDE_RATE]; - - BLASLONG k, lda; - BLASLONG m_from, m_to; - - FLOAT *alpha; - FLOAT *a, *c; - job_t *job = (job_t *)args -> common; - BLASLONG xxx, bufferside; - - BLASLONG jjs, min_jj; - BLASLONG is, min_i, div_n; - - BLASLONG i, current; - - k = K; - - a = (FLOAT *)A; - c = (FLOAT *)C; - - lda = LDA; - - alpha = (FLOAT *)args -> alpha; - - m_from = range_n[mypos + 0]; - m_to = range_n[mypos + 1]; - -#if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); -#endif - - div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); - for (i = 1; i < DIVIDE_RATE; i++) { - buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; - } - -#ifndef LOWER - TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#else - TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#endif - - for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - - for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ - - min_jj = MIN(m_to, xxx + div_n) - jjs; - -#ifndef LOWER - if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; -#else - if (min_jj > GEMM_P) min_jj = GEMM_P; -#endif - -#ifndef LOWER - OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (k, min_jj, k, 
dm1, -#ifdef COMPLEX - ZERO, -#endif - sb, - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - a + jjs * lda * COMPSIZE, lda, 0); -#else - ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (min_jj, k, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - sb, - a + jjs * COMPSIZE, lda, 0); -#endif - } - -#ifndef LOWER - for (i = 0; i <= mypos; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#else - for (i = mypos; i < args -> nthreads; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#endif - - WMB; - } - - min_i = m_to - m_from; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - /* thread has to wait */ - if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, m_from, xxx); - - if (m_from + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } - -#ifndef LOWER - current ++; -#else - current --; -#endif - } - - for(is = m_from + min_i; is < m_to; is += min_i){ - min_i = m_to - is; - - if (min_i >= GEMM_P * 2) { - min_i 
= GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, is, xxx); - - if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } -#ifndef LOWER - current ++; -#else - current --; -#endif - } - } - - for (i = 0; i < args -> nthreads; i++) { - if (i != mypos) { - for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; - } - } - } - - return 0; - } - -static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ - - blas_arg_t newarg; - -#ifndef USE_ALLOC_HEAP - job_t job[MAX_CPU_NUMBER]; -#else - job_t * job = NULL; -#endif - - blas_queue_t queue[MAX_CPU_NUMBER]; - - BLASLONG range[MAX_CPU_NUMBER + 100]; - - BLASLONG num_cpu; - - BLASLONG nthreads = args -> nthreads; - - BLASLONG width, i, j, k; - BLASLONG n, n_from, n_to; - int mode, mask; - double dnum; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; - mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_REAL; - mask = 
MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; - mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; - mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif -#endif - - newarg.m = args -> m; - newarg.k = args -> k; - newarg.a = args -> a; - newarg.b = args -> b; - newarg.c = args -> c; - newarg.lda = args -> lda; - newarg.alpha = args -> alpha; - -#ifdef USE_ALLOC_HEAP - job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); - if(job==NULL){ - fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); - exit(1); - } -#endif - - newarg.common = (void *)job; - - n_from = 0; - n_to = args -> m; - -#ifndef LOWER - - range[MAX_CPU_NUMBER] = n_to - n_from; - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - - for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; - -#else - - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = 
((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[num_cpu + 1] = range[num_cpu] + width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - queue[num_cpu].range_n = range; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - -#endif - - newarg.nthreads = num_cpu; - - if (num_cpu) { - - for (j = 0; j < num_cpu; j++) { - for (i = 0; i < num_cpu; i++) { - for (k = 0; k < DIVIDE_RATE; k++) { - job[j].working[i][CACHE_LINE_SIZE * k] = 0; - } - } - } - - queue[0].sa = sa; - queue[0].sb = sb; - queue[num_cpu - 1].next = NULL; - - exec_blas(num_cpu, queue); - } - -#ifdef USE_ALLOC_HEAP - free(job); -#endif - - return 0; -} - -#endif - -blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - - BLASLONG n, bk, i, blocking, lda; - BLASLONG info; - int mode; - blas_arg_t newarg; - FLOAT *a; - FLOAT alpha[2] = { -ONE, ZERO}; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; -#else - mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif - - if (args -> nthreads == 1) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); -#endif - return info; - } - - n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - - if (range_n) n = range_n[1] - range_n[0]; - - if (n <= GEMM_UNROLL_N * 2) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, range_n, sa, 
sb, 0); -#endif - return info; - } - - newarg.lda = lda; - newarg.ldb = lda; - newarg.ldc = lda; - newarg.alpha = alpha; - newarg.beta = NULL; - newarg.nthreads = args -> nthreads; - - blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; - if (blocking > GEMM_Q) blocking = GEMM_Q; - - for (i = 0; i < n; i += blocking) { - bk = n - i; - if (bk > blocking) bk = blocking; - - newarg.m = bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - - info = CNAME(&newarg, NULL, NULL, sa, sb, 0); - if (info) return info + i; - - if (n - i - bk > 0) { -#ifndef USE_SIMPLE_THREADED_LEVEL3 - newarg.m = n - i - bk; - newarg.k = bk; -#ifndef LOWER - newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; -#else - newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; -#endif - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - - thread_driver(&newarg, sa, sb); -#else - -#ifndef LOWER - newarg.m = bk; - newarg.n = n - i - bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; - - gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); -#endif -#else - newarg.m = n - i - bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + bk + i * lda) * COMPSIZE; - - gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + (i + bk + i * lda) * COMPSIZE; - newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | 
BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); -#endif -#endif - -#endif - } - } - return 0; -} From f194ad59e1399a7fc99e877a3ec26a8d7ff5c585 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jul 2020 08:52:24 +0200 Subject: [PATCH 324/593] Use _Atomic instead of volatile where available (file moved from ../getrf) must have misplaced this in ../getrf when I made that change in March 2018 (40160ff) the only changes since then were RFC : Add half precision gemm for bfloat16 in OpenBLAS Rajalakshmi Srinivasaraghavan Rajalakshmi Srinivasaraghavan committed on 14 Apr 2020 as 7ebbb50 Change _STDC_VERSION__ to __STDC_VERSION__ Zhiyong Dang committed on 11 May 2018 as 3716267 --- lapack/potrf/potrf_parallel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index e61e8decb..008fcb8cc 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#ifdef HAVE_C11 + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#elif defined(HALF) + mode = BLAS_HALF | BLAS_REAL; + mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; From 4e1be0e4813df72c26d94f8a452611b62576fcf9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 11 Jun 2020 04:12:49 -0700 Subject: [PATCH 325/593] ARM64: Add THUNDERX3T110 Target --- Makefile.arm64 | 10 ++ Makefile.system | 1 + TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 27 +++++ 
cpuid_arm64.c | 27 ++++- driver/others/dynamic_arm64.c | 8 +- getarch.c | 18 +++ interface/swap.c | 2 +- interface/zswap.c | 2 +- kernel/arm64/KERNEL.THUNDERX3T110 | 184 ++++++++++++++++++++++++++++++ param.h | 29 +++++ 12 files changed, 305 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/KERNEL.THUNDERX3T110 diff --git a/Makefile.arm64 b/Makefile.arm64 index a7cd82e3a..1091edfe5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(CORE), THUNDERX3T110) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +else +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +endif +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.system b/Makefile.system index db651ef99..d7e71d00a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 +DYNAMIC_CORE += THUNDERX3T110 endif ifeq ($(ARCH), zarch) diff --git a/TargetList.txt b/TargetList.txt index 4e54e3077..8ea2df9b7 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -96,6 +96,7 @@ FALKOR THUNDERX THUNDERX2T99 TSV110 +THUNDERX3T110 9.System Z: ZARCH_GENERIC diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d56ba99cb..5388156bc 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) endif () if (POWER) diff --git a/cmake/prebuild.cmake 
b/cmake/prebuild.cmake index 30256870c..e50483a2f 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -338,6 +338,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX3T110") + file(APPEND ${TARGET_CONF_TEMP} + "#define THUNDERX3T110\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t94371840\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "TSV110") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 4103216e6..6f41be604 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -40,6 +40,7 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +#define CPU_THUNDERX3T110 12 //Hisilicon #define CPU_TSV110 9 // Ampere @@ -57,7 +58,8 @@ static char *cpuname[] = { "THUNDERX2T99", "TSV110", "EMAG8180", - "NEOVERSEN1" + "NEOVERSEN1", + "THUNDERX3T110" }; static char *cpuname_lower[] = { @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { "thunderx2t99", "tsv110", "emag8180", - "neoversen1" + "neoversen1", + "thunderx3t110" }; int get_feature(char *search) @@ -158,6 +161,8 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8")) 
+ return CPU_THUNDERX3T110; // HiSilicon else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) return CPU_TSV110; @@ -372,7 +377,25 @@ void get_cpuconfig(void) printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); + break; + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } get_cpucount(); } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 11ef2725c..157b03365 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +extern gotoblas_t gotoblas_THUNDERX3T110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 11 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -82,6 +83,7 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "thunderx3t110", "unknown" }; @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; return corename[NUM_CORETYPES]; } @@ 
-127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); + case 11: return (&gotoblas_THUNDERX3T110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX; case 0x0af: // ThunderX2 return &gotoblas_THUNDERX2T99; + case 0x0b8: // ThunderX3 + return &gotoblas_THUNDERX3T110; } break; case 0x48: // HiSilicon diff --git a/getarch.c b/getarch.c index 2cdf77259..51c9a84e5 100644 --- a/getarch.c +++ b/getarch.c @@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_THUNDERX3T110 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "THUNDERX3T110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTHUNDERX3T110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "thunderx3t110" +#define CORENAME "THUNDERX3T110" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/interface/swap.c b/interface/swap.c index 17a9868a9..ea40b1fc2 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git 
a/interface/zswap.c b/interface/zswap.c index 372b15447..43971b73e 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 new file mode 100644 index 000000000..a20d0d4a6 --- /dev/null +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -0,0 +1,184 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S 
+DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = 
../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifeq 
($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif diff --git a/param.h b/param.h index efe0e1096..476f237a1 100644 --- a/param.h +++ b/param.h @@ -2779,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX3T110) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(NEOVERSEN1) #define SGEMM_DEFAULT_UNROLL_M 16 From d557584b71578620520e7bcdea7e0f029d0a76e7 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 27 Jul 2020 14:11:07 -0500 Subject: [PATCH 326/593] Fix compilation issues with clang on POWER As gcc defaults to -malign-power, removing that option. Also adding -fno-integrated-as to use GNU assembler for powerpc assembly optimization files. Fixed other compilation errors reported in dgemv_t.c file. 
--- Makefile.power | 26 +++++++++++++------------- kernel/Makefile | 5 +++++ kernel/power/dgemv_t.c | 4 ++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Makefile.power b/Makefile.power index bf7037995..c1556fe82 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,34 +11,34 @@ endif ifeq ($(CORE), POWER10) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp else FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp endif else ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif 
ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math else FCOMMON_OPT += -O2 -Mrecursive endif @@ -48,26 +48,26 @@ endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp else FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp endif else ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) -FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math else -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else FCOMMON_OPT += -O2 -Mrecursive diff --git a/kernel/Makefile b/kernel/Makefile index 9b468a6af..db3282c05 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) endif +ifeq ($(ARCH), power) +ifeq ($(C_COMPILER), CLANG) + override CFLAGS += -fno-integrated-as +endif +endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 diff 
--git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 09abd5a43..c07b3c223 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -359,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "stxvd2x 39, %[off], %[y] \n\t" "stxvd2x 40, %[off2], %[y] \n\t" - : [memy] "+m" (*(const double (*)[8])y), + : [memy] "+m" (*(double (*)[8])y), [n] "+&r" (n), [a0] "=b" (a0), [a1] "=&b" (a1), @@ -373,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do [off2]"=&b" (off2), [temp] "=&b" (tempR) : [memx] "m" (*(const double (*)[n])x), - [mem_ap] "m" (*(const double (*)[]) ap), + [mem_ap] "m" (*(const double (*)[n*8]) ap), [alpha] "d" (alpha), "[a0]" (ap), [x] "b" (x), From 921ec4e9e2ae5b1d32bcad04a19cdffe06e145c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jul 2020 19:54:46 +0000 Subject: [PATCH 327/593] Adjust A53 SGEMM parameters to reflect move to 8x8 kernel --- cmake/prebuild.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 30256870c..ff7715c4b 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFP\n" "#define HAVE_NEON\n" "#define ARMV8\n") +if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) +else + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) +endif set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) From 64e2e4aaf3d396740c0c0b66b5a10baf8fdef167 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jul 2020 20:19:22 +0000 Subject: [PATCH 328/593] missing braces --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ff7715c4b..4067138b4 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -198,10 +198,10 @@ if (DEFINED CORE AND 
CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) -else +else () set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 8) -endif +endif () set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) From 200f5c44cc14f356d7dba6af257044016a0573da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 13:45:23 +0000 Subject: [PATCH 329/593] Add AMD Renoir models and preliminary support for ZEN3 as ZEN2 also remap erroneous family 16 entry to BOBCAT and reclaim erroneous family 25 "Barcelona" for Zen3 --- cpuid_x86.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 356800b78..ea846a392 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1454,10 +1454,11 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 3: - case 7: - case 10: +// case 7: +// case 10: return CPUTYPE_BARCELONA; case 5: + case 7: return CPUTYPE_BOBCAT; case 6: switch (model) { @@ -1507,6 +1508,8 @@ int get_cpuname(void){ // AMD Ryzen case 8: // AMD Ryzen2 + default: + // Matisse/Renoir and other recent Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -1516,6 +1519,16 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } + break; + case 10: // Zen3 + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; } break; } @@ -2107,7 +2120,7 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. 
- case 5: // New EXCAVATOR + case 5: // New EXCAVATOR if(support_avx()) return CORE_EXCAVATOR; else @@ -2135,12 +2148,14 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8) { + } else if (exfamily == 8 || exfamily == 10) { switch (model) { case 1: // AMD Ryzen case 8: - // Ryzen 2 + // Ryzen 2 + default: + // Matisse,Renoir Ryzen2 models if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 12918358aa52aa9cdc194057d5e4b556933988aa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 13:53:17 +0000 Subject: [PATCH 330/593] Add AMD Renoir/Matisse and preliminary support for Zen3 as Zen2 also support AMD family 22 Jaguar/Puma as Bobcat --- driver/others/dynamic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index c03b0b21d..5d71b1b2c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else if (exfamily == 5) { + } else if (exfamily == 5 || exfamily == 7) { return &gotoblas_BOBCAT; } else if (exfamily == 6) { if(model == 1){ @@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1 || model == 8) { + /* if (model == 1 || model == 8) */ { if(support_avx()) return &gotoblas_ZEN; else{ @@ -718,16 +718,24 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } - } else if (exfamily == 9) { + } else if (exfamily == 9) { if(support_avx()) return &gotoblas_ZEN; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
- } + } + } else if (exfamily == 10) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } }else { return &gotoblas_BARCELONA; } + } } From 5fa581c87e0f3979d0fc70b4ea485fc0d898ffb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 14:22:41 +0000 Subject: [PATCH 331/593] Put hint to use git develop rather than master branch in README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e5e3e956..f8226f5cb 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code -using Git from https://github.com/xianyi/OpenBLAS.git. +using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be +sure to use the develop branch - master is several years out of date due to a change of maintainership.) Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Most can also be given directly on the make or cmake command line. 
From 39724e8128cee3ab49aaa1f508e97bf9f56db61e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Jul 2020 01:14:08 +0200 Subject: [PATCH 332/593] Separate OpenMP handling and allow compilation of Power9 code with older gcc --- Makefile.power | 54 ++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/Makefile.power b/Makefile.power index c1556fe82..37a02d692 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,54 +10,36 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp -else COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif -endif ifeq ($(CORE), POWER9) -ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -else -CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp -endif -ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -mcpu=power8 -mtune=power8 else -FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp +CCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else -ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math -else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old 
to fully support POWER9, getting a newer version of gcc is recommended) +FCOMMON_OPT += -mcpu=power8 -mtune=power8 else -FCOMMON_OPT += -O2 -Mrecursive +FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif +else +FCOMMON_OPT += -O2 -Mrecursive endif endif ifeq ($(CORE), POWER8) -ifeq ($(USE_OPENMP), 1) -ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -else -CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp -endif -ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp -else -FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp -endif -else ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else @@ -73,6 +55,18 @@ else FCOMMON_OPT += -O2 -Mrecursive endif endif + +ifeq ($(USE_OPENMP), 1) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +CCOMMON_OPT += -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +FCOMMON_OPT += -DUSE_OPENMP -mp +endif endif # workaround for C->FORTRAN ABI violation in LAPACKE From f77b6a83f4c20ca4e4769a999a69b0f47f7f4bb1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 29 Jul 2020 18:59:32 -0500 Subject: [PATCH 333/593] dgemv optimization for POWER10 Making use of new vector pair POWER10 instructions in dgemv_n and dgemv_t. Also adding a new block 4x128 to make use of Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. Tested on simulator and there are no new test failures. 
--- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/dgemv_n_microk_power10.c | 268 ++++++++ kernel/power/dgemv_n_power10.c | 565 +++++++++++++++++ kernel/power/dgemv_t_power10.c | 840 ++++++++++++++++++++++++++ 4 files changed, 1675 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dgemv_n_microk_power10.c create mode 100644 kernel/power/dgemv_n_power10.c create mode 100644 kernel/power/dgemv_t_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 39f5e9414..f390fac61 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -187,12 +187,12 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c -DGEMVTKERNEL = dgemv_t.c +DGEMVTKERNEL = dgemv_t_power10.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c new file mode 100644 index 000000000..4be8a5f9b --- /dev/null +++ b/kernel/power/dgemv_n_microk_power10.c @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + double *a0; + double *a1; + double *a2; + double *a3; + + __asm__ + ( + "lxvp 40, 0(%10) \n\t" // x0, x1 + XXSPLTD_S(32,%x9,0) // alpha, alpha + + "sldi %6, %13, 3 \n\t" // lda * sizeof (double) + + "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha + + "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda + "add %6, %6, %6 \n\t" // 2 * lda + + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + + "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda + + "dcbt 0, %3 \n\t" + "dcbt 0, %4 \n\t" + "dcbt 
0, %5 \n\t" + "dcbt 0, %6 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "dcbt 0, %2 \n\t" + + "addi %3, %3, 32 \n\t" + "addi %4, %4, 32 \n\t" + "addi %5, %5, 32 \n\t" + "addi %6, %6, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" + "#a0=%3 a1=%4 a2=%5 a3=%6" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3) // 6 + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 9 + "r" (x), // 10 + "b" (16), // 11 + "3" (ap), // 12 + "4" (lda) // 13 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c new file mode 100644 index 000000000..ad5f1ba0d --- /dev/null +++ b/kernel/power/dgemv_n_power10.c @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef __vector_pair __attribute__((aligned(8))) vecp_t; + +#include "dgemv_n_microk_power10.c" + +#define MMA(X, APTR, ACC) \ + rX = (vec_t *) & X; \ + rowA = *((vecp_t*)((void*)&APTR)); \ + __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); + +#define SAVE(ACC, Z) \ + rowC = (v4sf_t *) &y[Z]; \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][1] = result[1][0]; \ + result[2][1] = result[3][0]; \ + rowC[0] += valpha * result[0]; \ + rowC[1] += valpha * result[2]; + +void +dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, + FLOAT * y, FLOAT alpha) +{ + BLASLONG i, j, tmp; + FLOAT *a0 = a_ptr; + FLOAT *x1 = xo; + vector double valpha = { alpha, alpha }; + v4sf_t *rowC; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + v4sf_t result[4]; + vecp_t rowA; + vec_t *rX; + tmp = (n / 32) * 32; + for (i = 0; i < tmp; i += 32) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + 
__builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], 
&acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i + 0); + SAVE (&acc1, i + 4); + SAVE (&acc2, i + 8); + SAVE (&acc3, i + 12); + SAVE (&acc4, i + 16); + SAVE (&acc5, i + 20); + SAVE (&acc6, i + 24); + SAVE (&acc7, i + 28); + + } + for (i = tmp; i < n; i += 4) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i); + } +} + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + FLOAT *a0 = a_ptr; + FLOAT *a1 = a0 + lda; + FLOAT *a2 = a1 + lda; + FLOAT *a3 = a2 + lda; + + + for ( i=0; i<4; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + + for ( i=0; i<2; i++) + x[i] = xo[i] * alpha; 
+ + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + + for ( i=0; i<1; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 7; + n1 = (n - (n128 * 128)) >> 2; + n2 = (n - (n128 * 128)) & 3; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + for( i = 0; i < n128 ; i++) + { + dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda128; + x_ptr += 128; + } + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + for( i = 0; i < n128 ; i++) + { + FLOAT xbuffer[128] __attribute__ ((aligned (16))); + BLASLONG j; + for ( j = 0; j < 128 ; j++) + { + xbuffer[j] = x_ptr[0]; + x_ptr += inc_x; + } + dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda128; + } + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr 
+= inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] 
* x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c new file mode 100644 index 000000000..3db4d5785 --- /dev/null +++ b/kernel/power/dgemv_t_power10.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + +#define NBMAX 1024 +//#define PREFETCH 1 +#include + +#define HAVE_KERNEL4x8_ASM 1 + + +#if defined(HAVE_KERNEL4x8_ASM) +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + BLASLONG off2; + BLASLONG tempR; + __asm__( + + "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 + "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) + "xxlxor 34,34,34 \n\t" + "xxlxor 35,34,34 \n\t" + "add %[a2], %[a0], %[temp] \n\t" + "add %[a1], %[a0], %[off] \n\t" + "xxlxor 4,34,34 \n\t" + "xxlxor 5,34,34 \n\t" + "xxlxor 6,34,34 \n\t" + "xxlxor 7,34,34 \n\t" + "add %[a3], %[a2], %[off] \n\t" + "add %[a4], %[a2], %[temp] \n\t" + + "xxlxor 8,34,34 \n\t" + "xxlxor 9,34,34 \n\t" + "add %[a5], %[a3], %[temp] \n\t" + "li %[off],0 \n\t" + "li %[off2],16 \n\t" + + "add %[a6], %[a4], %[temp] \n\t" + "add %[a7], %[a5], %[temp] \n\t" + + + + + "lxvp 32, 0(%[x]) \n\t" + "lxvp 36, 0(%[a0]) \n\t" + "lxvp 38, 0(%[a1]) \n\t" + "lxvp 40, 0(%[a2]) \n\t" + "lxvp 42, 0(%[a3]) \n\t" + "lxvp 44, 0(%[a4]) \n\t" + "lxvp 46, 0(%[a5]) \n\t" + 
"lxvp 48, 0(%[a6]) \n\t" + "lxvp 50, 0(%[a7]) \n\t" +#if defined(PREFETCH) + "li %[temp],896 \n\t" +#endif + "addic. %[n],%[n],-4 \n\t" + + "li %[off],32 \n\t" + + + "ble- two%= \n\t" + + //-------------------------------------------------- + ".align 5 \n\t" + "one%=: \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 32(%[a0]) \n\t" + "lxvp 38, 32(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 32(%[a2]) \n\t" + "lxvp 42, 32(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 32(%[a4]) \n\t" + "lxvp 46, 32(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. %[n],%[n],-4 \n\t" + "lxvp 48, 32(%[a6]) \n\t" + "lxvp 50, 32(%[a7]) \n\t" + "lxvp 32, 32(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 64(%[a0]) \n\t" + "lxvp 38, 64(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 64(%[a2]) \n\t" + "lxvp 42, 64(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 64(%[a4]) \n\t" + "lxvp 46, 64(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 64(%[a6]) \n\t" + "lxvp 50, 64(%[a7]) \n\t" + "lxvp 32, 64(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" +#if defined(PREFETCH) + "addi %[temp],%[temp],128 \n\t" +#endif + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a0] \n\t" +#endif + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 96(%[a0]) \n\t" + "lxvp 38, 96(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a1] \n\t" +#endif + "lxvp 40, 96(%[a2]) \n\t" + "lxvp 42, 96(%[a3]) \n\t" + "addi %[off], %[off],32 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 96(%[a4]) \n\t" + "lxvp 46, 96(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a3] \n\t" +#endif + "lxvp 48, 96(%[a6]) \n\t" + "lxvp 50, 96(%[a7]) \n\t" + "lxvp 32, 96(%[x]) \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "ble- two%= \n\t" + + "addi %[off2], %[off2],32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a2] \n\t" +#endif + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a4] \n\t" +#endif + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a5] \n\t" +#endif + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 128(%[a0]) \n\t" + "lxvp 38, 128(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 128(%[a2]) \n\t" + "lxvp 42, 128(%[a3]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a6] \n\t" +#endif + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 128(%[a4]) \n\t" + "lxvp 46, 128(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a7] \n\t" +#endif + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 128(%[a6]) \n\t" + "lxvp 50, 128(%[a7]) \n\t" + "lxvp 32, 128(%[x]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[x] \n\t" +#endif + "addi %[a0], %[a0], 128 \n\t" + "addi %[a1], %[a1], 128 \n\t" + "addi %[a2], %[a2], 128 \n\t" + "addi %[a3], %[a3], 128 \n\t" + "addi %[a4], %[a4], 128 \n\t" + "addi %[a5], %[a5], 128 \n\t" + "addi %[a6], %[a6], 128 \n\t" + "addi %[a7], %[a7], 128 \n\t" + "addi %[x], %[x], 128 \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" + //-------------------------------------------- + + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + XXSPLTD_S(36,%x[alpha],0) + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "lxvp 38, 0(%[y]) \n\t" + "lxvp 40, 32(%[y]) \n\t" + + + + XXMRGLD_S(42,35,34) + XXMRGHD_S(43,35,34) + + XXMRGLD_S(44,5,4) + XXMRGHD_S(45,5,4) + + "xvadddp 42,42,43 \n\t" + + XXMRGLD_S(46,7,6) + XXMRGHD_S(47,7,6) + + "xvadddp 44,44,45 \n\t" + + XXMRGLD_S(48,9,8) + XXMRGHD_S(49,9,8) + + "xvadddp 46,46,47 \n\t" + + "xvmaddadp 39,42,36 \n\t" + "xvmaddadp 38,44,36 \n\t" + + "xvadddp 48,48,49 \n\t" + + "xvmaddadp 41,46,36 \n\t" + + "stxvp 38, 0(%[y]) \n\t" + "xvmaddadp 40,48,36 \n\t" + "stxvp 40, 32(%[y]) \n\t" + + : [memy] "+m" (*(double (*)[8])y), + [n] "+&r" (n), + [a0] "=b" (a0), + [a1] "=&b" (a1), + [a2] "=&b" (a2), + [a3] "=&b" (a3), + [a4] "=&b" (a4), + [a5] "=&b" (a5), + [a6] "=&b" (a6), + [a7] "=&b" (a7), + [off] "+&b" (lda), + [off2]"=&b" (off2), + [temp] "=&b" (tempR) + : [memx] "m" (*(const double (*)[n])x), + [mem_ap] "m" (*(const double (*)[n*8]) ap), + [alpha] "d" (alpha), + "[a0]" (ap), + [x] "b" (x), + [y] "b" (y) + : 
"cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", + "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + return; +} +#else +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; +#if defined(PREFETCH) + BLASLONG j, c, k; +#endif + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector double*) a0; + va1 = (__vector double*) a1; + va2 = (__vector double*) a2; + va3 = (__vector double*) a3; + va4 = (__vector double*) a4; + va5 = (__vector double*) a5; + va6 = (__vector double*) a6; + va7 = (__vector double*) a7; + v_x = (__vector double*) x; + +#if defined(PREFETCH) + + c = n >> 1; + + for (j = 0; j < c; j += 64) { + k = (c - j) > 64 ? 
64 : (c - j); + __builtin_prefetch(v_x + 64); + __builtin_prefetch(va0 + 64); + __builtin_prefetch(va1 + 64); + __builtin_prefetch(va2 + 64); + __builtin_prefetch(va3 + 64); + __builtin_prefetch(va4 + 64); + __builtin_prefetch(va5 + 64); + __builtin_prefetch(va6 + 64); + __builtin_prefetch(va7 + 64); + for (i = 0; i < k; i += 2) { +#else + + for (i = 0; i < n/2; i += 2) { +#endif + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + temp0 += v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i + 1] * va1[i + 1]; + temp2 += v_x[i + 1] * va2[i + 1]; + temp3 += v_x[i + 1] * va3[i + 1]; + + temp4 += v_x[i + 1] * va4[i + 1]; + temp5 += v_x[i + 1] * va5[i + 1]; + temp6 += v_x[i + 1] * va6[i + 1]; + temp7 += v_x[i + 1] * va7[i + 1]; + } +#if defined(PREFETCH) + va0 += 64; + va1 += 64; + va2 += 64; + va3 += 64; + va4 += 64; + va5 += 64; + va6 += 64; + va7 += 64; + v_x += 64; + + } +#endif + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + + y[4] += alpha * (temp4[0] + temp4[1]); + y[5] += alpha * (temp5[0] + temp5[1]); + y[6] += alpha * (temp6[0] + temp6[1]); + y[7] += alpha * (temp7[0] + temp7[1]); + +} + +#endif + + +static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* va2 = (__vector double*) a2; + __vector double* va3 = (__vector double*) a3; + __vector double* v_x = (__vector double*) x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register 
__vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i + 1] * va0[i + 1]; + temp5 += v_x[i + 1] * va1[i + 1]; + temp6 += v_x[i + 1] * va2[i + 1]; + temp7 += v_x[i + 1] * va3[i + 1]; + } + + temp0 += temp4; + temp1 += temp5; + temp2 += temp6; + temp3 += temp7; + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + +} + + +static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + __vector double temp1 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]); + y[inc_y] += alpha * (temp1[0] + temp1[1]); +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector double* va0 = (__vector double*) a0; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + } + + *y += alpha * (temp0[0] + temp0[1]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, 
FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; +#if defined(PREFETCH) + __builtin_prefetch(y_ptr+64); +#endif + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, lda, 
a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j 
< (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + From 104aa678b0f4bc4dd9f65959d0b6f1aeb7b6f6d3 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Thu, 30 Jul 2020 11:40:52 +0200 Subject: [PATCH 334/593] Fix inadvertent version number reversal to 0.3.9.dev caused by #2710 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e51e7e38..4bef6570c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 589c74aed38bb7923d6653fa9370b81e4fe95b4a Mon Sep 17 00:00:00 2001 From: Kevin Adler Date: Thu, 30 Jul 2020 20:52:16 -0500 Subject: [PATCH 335/593] Use systemcfg APIs for CPU detection on AIX AIX libc already provides ready access to an integer that contains a bit identifying the CPU it's running on, so there's no need to call a program and grep its output. Additionally, prtconf is not available in the PASE runtime, which provides an AIX emulation layer on the IBM i operating system. The AIX systemcfg.h also provides macro definitions like POWER_8, POWER_9, etc for all the bits defining the CPUs as well as macros like __power_8(), __power_9_andup() that return booleans, but I did not use them. Since these macros depend on the level of the OS in which it is built, they may not be defined and instead the associated hex literals are used directly. 
--- cpuid_power.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index 8f578d68f..df3dc8668 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -38,6 +38,7 @@ #include #ifdef _AIX +#include #include #endif #ifdef __APPLE__ @@ -137,35 +138,19 @@ int detect(void){ #endif #ifdef _AIX - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Pro", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - pclose(infile); + // Cast from int to unsigned to ensure comparisons work for all bits in + // the bit mask, even the top bit + unsigned implementation = (unsigned) _system_configuration.implementation; - if (strstr(p, "POWER3")) return CPUTYPE_POWER3; - if (strstr(p, "POWER4")) return CPUTYPE_POWER4; - if (strstr(p, "PPC970")) return CPUTYPE_PPC970; - if (strstr(p, "POWER5")) return CPUTYPE_POWER5; - if (strstr(p, "POWER6")) return CPUTYPE_POWER6; - if (strstr(p, "POWER7")) return CPUTYPE_POWER6; - if (strstr(p, "POWER8")) return CPUTYPE_POWER8; - if (strstr(p, "POWER9")) return CPUTYPE_POWER9; - if (strstr(p, "POWER10")) return CPUTYPE_POWER10; - if (strstr(p, "Cell")) return CPUTYPE_CELL; - if (strstr(p, "7447")) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; + if (implementation >= 0x40000u) return CPUTYPE_POWER10; + else if (implementation & 0x20000) return CPUTYPE_POWER9; + else if (implementation & 0x10000) return CPUTYPE_POWER8; + else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 + else if (implementation & 0x04000) return CPUTYPE_POWER6; + else if (implementation & 0x02000) return CPUTYPE_POWER5; + else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 + else if (implementation & 0x00800) return CPUTYPE_POWER4; + else return CPUTYPE_POWER3; #endif #ifdef __APPLE__ 
From da9e2a7adafc2e0d321e6f2f90beaffed2853372 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Jul 2020 16:03:33 +0200 Subject: [PATCH 336/593] Add SYMBOLPREFIX and/or SYMBOLSUFFIX to cblas prototypes --- Makefile | 3 ++- Makefile.install | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index e113026dd..c1d943fac 100644 --- a/Makefile +++ b/Makefile @@ -365,11 +365,12 @@ clean :: @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 ifeq ($(OSNAME), Darwin) @rm -rf getarch.dSYM getarch_2nd.dSYM endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h diff --git a/Makefile.install b/Makefile.install index dad869f4c..12713a6db 100644 --- a/Makefile.install +++ b/Makefile.install @@ -45,7 +45,16 @@ install : lib.grd ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" + @cp cblas.h cblas.tmp +ifdef SYMBOLPREFIX + @sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2 + @sed 's/(void)/$(SYMBOLSUFFIX)(void)/g' cblas.tmp2 > cblas.tmp +endif + @sed 's/common/openblas_config/g' cblas.tmp > 
"$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifneq ($(OSNAME), AIX) @@ -168,4 +177,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - From 60cd5e55fc2b8d50b52ebc54c701cb7315ad74ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Aug 2020 12:31:39 +0200 Subject: [PATCH 337/593] Protect against inadvertent activation of USE_CUDA --- driver/others/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/Makefile b/driver/others/Makefile index 5653f3c25..7558ec058 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -47,8 +47,10 @@ endif endif ifdef USE_CUDA +ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif +endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) From ecf4b9e0fca35ed15e3b0354002584fbd29a6166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Aug 2020 17:06:03 +0200 Subject: [PATCH 338/593] Improve substitution rules for SYMBOLPREFIX and -SUFFIX addition --- Makefile.install | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Makefile.install b/Makefile.install index 12713a6db..01c0b1226 100644 --- a/Makefile.install +++ b/Makefile.install @@ -47,12 +47,18 @@ ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @cp cblas.h cblas.tmp ifdef SYMBOLPREFIX - @sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2 - @sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp endif ifdef SYMBOLSUFFIX - @sed 
's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2 - @sed 's/(void)/$(SYMBOLSUFFIX)(void)/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp endif @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif From 6794ac34153d9def9a1056738090160868417702 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 11:20:08 +0200 Subject: [PATCH 339/593] Add SYMBOLPREFIX and/or -SUFFIX to cblas.h if needed --- CMakeLists.txt | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e51e7e38..c324e2241 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -249,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -358,10 +358,21 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + 
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() From 53add6a80df77fecac8b2b2e0c81a913a50eda42 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 17:57:12 +0200 Subject: [PATCH 340/593] Apply library name suffix to openblas if any --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 70eea8443..e877b1422 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,7 @@ set(ZEIGTST zchkee.f macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) endmacro() if(BUILD_SINGLE) From aaf1a17168f50ce689b69a87b6643abcd0c1de51 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 17:58:33 +0200 Subject: [PATCH 341/593] Apply current library name suffix --- 
lapack-netlib/TESTING/LIN/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 954cab193..0d0bb5418 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,7 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From aa3a1e7d8ce7049605807375fb52331d000cd0cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:22:31 +0200 Subject: [PATCH 342/593] Multiply by two rather than left shift by one place fixes GCC ubsan report of "left shift of negative value -2" in the BLAS tests --- kernel/x86_64/cdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 93fca0a0d..f71d7b6b4 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -141,8 +141,8 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i=0; ix=0; iy=0; - inc_x <<= 1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { From aa53a8a5cb8cfadb4b1230c4b4596dec7fcd75ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:25:09 +0200 Subject: [PATCH 343/593] Multiply by two instead of left-shifting one place fixes GCC ubsan report of "left shift of negative value -2" in the BLAS tests --- kernel/x86_64/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 01169e8e6..423a6f23e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -140,8 +140,8 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO i=0; ix=0; iy=0; - inc_x <<= 1; - 
inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { From 0ef4b3f1f2b8c4ea20afbd50c35d29971ea1c3e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:27:40 +0200 Subject: [PATCH 344/593] Multiply instead of doing a left shift of a potentially negative number fixes GCC ubsan report in the BLAS tests --- kernel/x86_64/cgemv_t_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 6bdea6787..f44fe7247 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -233,9 +233,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda *= 2; lda4 = lda << 2; xbuffer = buffer; From 81dcfdcf397dd93b03376ea1e17bd7d0d0c7a335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:29:56 +0200 Subject: [PATCH 345/593] Multiply by 2 instead of left-shifting a potentially negative number fixes GCC ubsan warning in the BLAS tests --- kernel/x86_64/zgemv_t_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 2ab7a671b..6221471f7 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -235,9 +235,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda <<= 1; lda4 = lda << 2; xbuffer = buffer; From 475b5c95b9ffb6a249bb8d8f99a8b9a6d5ec7441 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 7 Aug 2020 15:27:44 -0500 Subject: [PATCH 346/593] Remove extra symbol in Makefile While trying out different unroll values, noted that make failed due to this extra symbol. 
--- kernel/Makefile.L3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d5de070a5..8df306d5f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -2351,7 +2351,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(D Date: Sat, 8 Aug 2020 18:05:20 +0200 Subject: [PATCH 347/593] Create Jenkinsfile for OSUOSL PowerCI --- Jenkinsfile | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 000000000..2b61bed9f --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,9 @@ +node { + stage('Checkout') { + checkout + } + + stage('Build') { + sh("make") + } +} From 6f5ca44c1afd3fe39cb3e18e34af7ad733b513e0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Aug 2020 12:59:20 +0200 Subject: [PATCH 348/593] Expand TAU array as SGEMQR/DGEMQR read elements 2 and 3 --- lapack-netlib/TESTING/LIN/derrtsqr.f | 4 +++- lapack-netlib/TESTING/LIN/serrtsqr.f | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/derrtsqr.f b/lapack-netlib/TESTING/LIN/derrtsqr.f index c8ad30257..d1d0ff02d 100644 --- a/lapack-netlib/TESTING/LIN/derrtsqr.f +++ b/lapack-netlib/TESTING/LIN/derrtsqr.f @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. DOUBLE PRECISION A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, DGEQR, @@ -137,6 +137,8 @@ * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'DGEMQR' NB=1 INFOT = 1 diff --git a/lapack-netlib/TESTING/LIN/serrtsqr.f b/lapack-netlib/TESTING/LIN/serrtsqr.f index f00f3e14b..7f91a3c39 100644 --- a/lapack-netlib/TESTING/LIN/serrtsqr.f +++ b/lapack-netlib/TESTING/LIN/serrtsqr.f @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. REAL A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. 
External Subroutines .. EXTERNAL ALAESM, CHKXER, SGEQR, @@ -137,6 +137,8 @@ * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'SGEMQR' NB=1 INFOT = 1 From 64259d521a29514a77eea9ca8681884e7c59eb8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Aug 2020 13:02:27 +0200 Subject: [PATCH 349/593] Fix use of unallocated array in workspace query and wrong type of argument to xSCAL --- lapack-netlib/TESTING/LIN/cdrvls.f | 6 +++--- lapack-netlib/TESTING/LIN/zdrvls.f | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index d24e3885b..f43c10b72 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -372,13 +372,13 @@ END IF * Compute workspace needed for CGELSY CALL CGELSY( M, N, NRHS, A, LDA, B, LDB, - $ IWQ, RCOND, CRANK, WQ, -1, RWORK, + $ IWQ, RCOND, CRANK, WQ, -1, RWQ, $ INFO ) LWORK_CGELSY = INT( WQ( 1 ) ) LRWORK_CGELSY = 2*N * Compute workspace needed for CGELSS CALL CGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_CGELSS = INT( WQ( 1 ) ) LRWORK_CGELSS = 5*MNMIN * Compute workspace needed for CGELSD @@ -564,7 +564,7 @@ CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL CSCAL( NCOLS*NRHS, - $ ONE / REAL( NCOLS ), WORK, + $ CONE / REAL( NCOLS ), WORK, $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 4587c5686..1313c853b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -372,12 +372,12 @@ END IF * Compute workspace needed for ZGELSY CALL ZGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_ZGELSY = INT( WQ( 1 ) ) LRWORK_ZGELSY = 2*N * Compute workspace needed for ZGELSS CALL ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1 , RWORK, + $ 
RCOND, CRANK, WQ, -1 , RWQ, $ INFO ) LWORK_ZGELSS = INT( WQ( 1 ) ) LRWORK_ZGELSS = 5*MNMIN @@ -564,7 +564,7 @@ CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL ZSCAL( NCOLS*NRHS, - $ ONE / DBLE( NCOLS ), WORK, + $ CONE / DBLE( NCOLS ), WORK, $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, From c9d32674eaa2602184c2719dde15ac3fbebf41b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Aug 2020 19:17:04 +0200 Subject: [PATCH 350/593] Add memory barrier to the blas_lock implementation for Linux as recommended by cparrott73 in #2760 --- common_power.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_power.h b/common_power.h index aa19794b5..e0685f760 100644 --- a/common_power.h +++ b/common_power.h @@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ " bne- 1f\n" " stwcx. %2,0, %1\n" " bne- 0b\n" + " isync\n" "1: " : "=&r"(ret) : "r"(address), "r" (val) From e2828e30aa5fc5670d0f4d4d42fc26649a4c3c64 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:42 +0200 Subject: [PATCH 351/593] s390x: Optimize SGEMM/DGEMM blocks for z14 with explicit loop unrolling/interleaving Improve performance of SGEMM and DGEMM on z14 and z15 by unrolling and interleaving the inner loop of the SGEMM 16x4 and DGEMM 8x4 blocks. Specifically, we explicitly interleave vector register loads and computation of two iterations. Note that this change only adds one C function, since SGEMM 16x4 and DGEMM 8x4 actually map to the same C code: they both hold intermediate results in a 4x4 grid of vector registers, and the C implementation is built around that. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 213 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 212 insertions(+), 1 deletion(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index eb6d7700b..eae2e4d69 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -249,7 +249,6 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { #if UNROLL_M == 16 -VECTOR_BLOCK(16, 4) VECTOR_BLOCK(16, 2) VECTOR_BLOCK(16, 1) #endif @@ -257,7 +256,9 @@ VECTOR_BLOCK(16, 1) VECTOR_BLOCK(8, 8) VECTOR_BLOCK(4, 8) #endif +#ifndef DOUBLE VECTOR_BLOCK(8, 4) +#endif VECTOR_BLOCK(8, 2) VECTOR_BLOCK(8, 1) VECTOR_BLOCK(4, 4) @@ -267,8 +268,218 @@ VECTOR_BLOCK(4, 1) #ifdef DOUBLE VECTOR_BLOCK(2, 4) VECTOR_BLOCK(2, 2) +VECTOR_BLOCK(2, 1) +#endif + + +/** + * Calculate a row-block that fits 4x4 vector registers using a loop + * unrolled-by-2 with explicit interleaving to better overlap loads and + * computation. + * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM. + */ +#ifdef DOUBLE +static inline void GEBP_block_8_4( +#else // float +static inline void GEBP_block_16_4( +#endif + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { +#define VEC_ROWS 4 +#define VEC_COLS 4 +#define ROWS VEC_ROWS * VLEN_FLOATS +#define COLS (VEC_COLS) + + /* + * Hold intermediate results in vector registers. + * Since we need to force the compiler's hand in places, we need to use + * individual variables in contrast to the generic implementation's + * arrays. 
+ */ +#define INIT_ROW_OF_C(ROW) \ + vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \ + vector_float C_##ROW##_0 = A##ROW * B[0]; \ + vector_float C_##ROW##_1 = A##ROW * B[1]; \ + vector_float C_##ROW##_2 = A##ROW * B[2]; \ + vector_float C_##ROW##_3 = A##ROW * B[3]; + + INIT_ROW_OF_C(0) + INIT_ROW_OF_C(1) + INIT_ROW_OF_C(2) + INIT_ROW_OF_C(3) +#undef INIT_ROW_OF_C + + if (bk > 1) { + BLASLONG k = 1; + vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS]; + vector_float Bk[VEC_COLS], Bknext[VEC_COLS]; + + /* + * Note that in several places, we enforce an instruction + * sequence that we identified empirically by utilizing dummy + * asm statements. + */ + + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + k * COLS]); + asm(""); + + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS); + + for (; k < (bk - 2); k += 2) { + /* + * Load inputs for (k+1) into registers. + * Loading from B first is advantageous. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bknext[j] = vec_splats(B[j + (k + 1) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS + + (k + 1) * ROWS); + + /* + * To achieve better instruction-level parallelism, + * make sure to first load input data for (k+1) before + * initiating compute for k. We enforce that ordering + * with a pseudo asm statement. + * Note that we need to massage this particular "barrier" + * depending on the gcc version. 
+ */ +#if __GNUC__ > 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm("" \ + : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \ + "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + asm("" \ + : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \ + "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + } while (0) +#else // __GNUC__ <= 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm(""); \ + } while (0) #endif + BARRIER_READ_BEFORE_COMPUTE(knext); + + /* Compute for (k) */ + C_0_0 += Ak[0] * Bk[0]; + C_1_0 += Ak[1] * Bk[0]; + C_2_0 += Ak[2] * Bk[0]; + C_3_0 += Ak[3] * Bk[0]; + + C_0_1 += Ak[0] * Bk[1]; + C_1_1 += Ak[1] * Bk[1]; + C_2_1 += Ak[2] * Bk[1]; + C_3_1 += Ak[3] * Bk[1]; + + C_0_2 += Ak[0] * Bk[2]; + C_1_2 += Ak[1] * Bk[2]; + C_2_2 += Ak[2] * Bk[2]; + C_3_2 += Ak[3] * Bk[2]; + + C_0_3 += Ak[0] * Bk[3]; + C_1_3 += Ak[1] * Bk[3]; + C_2_3 += Ak[2] * Bk[3]; + C_3_3 += Ak[3] * Bk[3]; + + asm(""); + + /* + * Load inputs for (k+2) into registers. + * First load from B. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + (k + 2) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS); + + /* + * As above, make sure to first schedule the loads for (k+2) + * before compute for (k+1). 
+ */ + BARRIER_READ_BEFORE_COMPUTE(k); + + /* Compute on (k+1) */ + C_0_0 += Aknext[0] * Bknext[0]; + C_1_0 += Aknext[1] * Bknext[0]; + C_2_0 += Aknext[2] * Bknext[0]; + C_3_0 += Aknext[3] * Bknext[0]; + + C_0_1 += Aknext[0] * Bknext[1]; + C_1_1 += Aknext[1] * Bknext[1]; + C_2_1 += Aknext[2] * Bknext[1]; + C_3_1 += Aknext[3] * Bknext[1]; + + C_0_2 += Aknext[0] * Bknext[2]; + C_1_2 += Aknext[1] * Bknext[2]; + C_2_2 += Aknext[2] * Bknext[2]; + C_3_2 += Aknext[3] * Bknext[2]; + + C_0_3 += Aknext[0] * Bknext[3]; + C_1_3 += Aknext[1] * Bknext[3]; + C_2_3 += Aknext[2] * Bknext[3]; + C_3_3 += Aknext[3] * Bknext[3]; + } + + /* Wrapup remaining k's */ + for (; k < bk; k++) { + vector_float Ak; + +#define COMPUTE_WRAPUP_ROW(ROW) \ + Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \ + C_##ROW##_0 += Ak * B[0 + k * COLS]; \ + C_##ROW##_1 += Ak * B[1 + k * COLS]; \ + C_##ROW##_2 += Ak * B[2 + k * COLS]; \ + C_##ROW##_3 += Ak * B[3 + k * COLS]; + + COMPUTE_WRAPUP_ROW(0) + COMPUTE_WRAPUP_ROW(1) + COMPUTE_WRAPUP_ROW(2) + COMPUTE_WRAPUP_ROW(3) +#undef COMPUTE_WRAPUP_ROW + } + } + + /* + * Unpack row-block of C_aux into outer C_i, multiply by + * alpha and add up (or assign for TRMM). 
+ */ +#define WRITE_BACK_C(ROW, COL) \ + do { \ + vector_float *Cij = \ + (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \ + if (trmm) { \ + *Cij = alpha * C_##ROW##_##COL; \ + } else { \ + *Cij += alpha * C_##ROW##_##COL; \ + } \ + } while (0) + + WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3); + WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3); + WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3); + WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3); +#undef WRITE_BACK_C + +#undef ROWS +#undef VEC_ROWS +#undef COLS +#undef VEC_COLS +#undef BARRIER_READ_BEFORE_COMPUTE +} + /** * Handle calculation for row blocks in C_i of any size by dispatching into * macro-defined (inline) functions or by deferring to a simple generic From 07c334e7be2f30a07263f0f827cb92fd257704dc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:53 +0200 Subject: [PATCH 352/593] s390x: Factor out small block sizes for SGEMM/DGEMM on z14 For small register blockings that are too small to fill up vector registers with column vectors, we currently use a generic code block. Replace that with instantiations of the generic code as individual functions, so that the compiler can optimize each one separately. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 78 +++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index eae2e4d69..741c09431 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -265,12 +265,58 @@ VECTOR_BLOCK(4, 4) VECTOR_BLOCK(4, 2) VECTOR_BLOCK(4, 1) +/** + * Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations. + * Simple implementation for smaller block sizes + * + * @param[in] A Pointer current block of input matrix A. + * @param[in] k Number of columns in A. 
+ * @param[in] B Pointer current block of input matrix B. + * @param[inout] C Pointer current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +#define SCALAR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \ + \ + /* \ + * Peel off first iteration (i.e., column of A) for \ + * initializing Caux \ + */ \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \ + \ + for (BLASLONG kk = 1; kk < k; kk++) \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + if (trmm) { \ + C[i + j * ldc] = alpha * Caux[i][j]; \ + } else { \ + C[i + j * ldc] += alpha * Caux[i][j]; \ + } \ + } + #ifdef DOUBLE VECTOR_BLOCK(2, 4) VECTOR_BLOCK(2, 2) VECTOR_BLOCK(2, 1) +#else +SCALAR_BLOCK(2, 4) +SCALAR_BLOCK(2, 2) +SCALAR_BLOCK(2, 1) #endif +SCALAR_BLOCK(1, 4) +SCALAR_BLOCK(1, 2) +SCALAR_BLOCK(1, 1) + /** * Calculate a row-block that fits 4x4 vector registers using a loop @@ -526,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, } } + /* Dispatch into the implementation for each block size: */ + #define BLOCK(bm, bn) \ if (m == bm && n == bn) { \ GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ @@ -541,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); - #ifdef DOUBLE - BLOCK(2, 4); - BLOCK(2, 2); - #endif - -#undef BLOCK + BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1); - /* simple implementation for smaller block sizes: */ - FLOAT Caux[m][n] __attribute__ ((aligned (16))); + BLOCK(1, 4); 
BLOCK(1, 2); BLOCK(1, 1); - /* - * Peel off first iteration (i.e., column of A) for initializing Caux - */ - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - Caux[i][j] = A[i] * B[j]; - - for (BLASLONG kk = 1; kk < k; kk++) - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - Caux[i][j] += A[i + kk * m] * B[j + kk * n]; - - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - if (trmm) { - C[i + j * ldc] = alpha * Caux[i][j]; - } else { - C[i + j * ldc] += alpha * Caux[i][j]; - } +#undef BLOCK } /** From e115c97e05889fc2e8edf041cdfd92d00d63a884 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:59 +0200 Subject: [PATCH 353/593] s390x/SGEMM: adjust default P and Q to multiples of M We recently changed the register blocking for SGEMM on s390x to 16x4. However, we did not adjust Q to a multiple of 16 and thus fell back to the 8x4 kernel at each block's margin, without need. Adjust P and Q to multiples of 16 to employ the faster 16x4 kernel for complete full-sized blocks. 
Signed-off-by: Marius Hillenbrand --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 476f237a1..3e539a2b8 100644 --- a/param.h +++ b/param.h @@ -3092,12 +3092,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 456 +#define SGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 320 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 224 -#define SGEMM_DEFAULT_Q 488 +#define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 352 From fee361ae64f2d02552713ade0ee972e6efdb1ed4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Aug 2020 13:27:19 +0200 Subject: [PATCH 354/593] fix another source of NO_CBLAS=0 surprise --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index 44a9fdcf0..2dbd60073 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -367,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) endif -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) override CFLAGS += -I. 
From 619343278d6d6e8ec3989fb883da333ee087d351 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Aug 2020 13:40:40 +0200 Subject: [PATCH 355/593] Fix mishandling of NO_CBLAS=0 and NO_LAPACKE=0 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c1d943fac..7a03b08f0 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ ifndef NO_FBLAS $(MAKE) -C test all endif $(MAKE) -C utest all -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) $(MAKE) -C ctest all ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all @@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif -ifndef NO_LAPACKE +ifneq ($(NO_LAPACKE), 1) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif From efdd237a91646f0ce58815ef6507c04e393813a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Aug 2020 23:08:38 +0200 Subject: [PATCH 356/593] Add a dedicated POWER9 build to the Travis CI (#2774) * Add dedicated POWER9 build (using new syntax to ensure it runs as a P9-only containerized job rather than a VM that might end up on P8 hardware half of the time) * Bump gcc version for POWER9 build --- .travis.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.travis.yml b/.travis.yml index 101147353..307010e40 100644 --- a/.travis.yml +++ b/.travis.yml @@ -75,6 +75,23 @@ matrix: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" + - os: linux + arch: ppc64le + dist: bionic + compiler: gcc + before_script: + - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y + - sudo apt-get update + - sudo apt-get install gcc-9 gfortran-9 -y + script: + - make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux compiler: gcc 
addons: From e740c4873d5b66851580ca53d9dce427325b8b9b Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 13 Aug 2020 06:17:34 +0800 Subject: [PATCH 357/593] Enable COOPERLAKE build target Enable new build target platform -- COOPERLAKE. This target platform supports all the SKYLAKEX supported ISAs + avx512bf16. So all the SKYLAKEX specific kernels/drivers and related code are now extended to be also active on COOPERLAKE. Besides, new BF16 related kernels are active under this target. --- Makefile.system | 8 +- Makefile.x86_64 | 19 ++++ TargetList.txt | 1 + cmake/arch.cmake | 4 +- cmake/cc.cmake | 8 ++ cmake/system.cmake | 5 +- cpuid.h | 15 +-- cpuid_x86.c | 37 +++++++- driver/level3/level3.c | 2 +- driver/level3/level3_thread.c | 2 +- driver/level3/trmm_L.c | 8 +- driver/level3/trmm_R.c | 12 +-- driver/others/parameter.c | 11 ++- getarch.c | 30 ++++++ kernel/CMakeLists.txt | 2 +- kernel/Makefile | 12 ++- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 2 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.COOPERLAKE | 1 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemv_n_4.c | 2 +- 
kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 118 ++++++++++++++++++++++++ 62 files changed, 309 insertions(+), 76 deletions(-) create mode 100644 kernel/x86_64/KERNEL.COOPERLAKE diff --git a/Makefile.system b/Makefile.system index d7e71d00a..2286d14f2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -88,6 +88,9 @@ endif ifeq ($(TARGET), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -130,6 +133,9 @@ endif ifeq ($(TARGET_CORE), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -553,7 +559,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX +DYNAMIC_CORE += SKYLAKEX COOPERLAKE endif endif endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 2676bd258..96e9dbe44 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -27,6 +27,25 @@ endif endif endif +ifeq ($(CORE), COOPERLAKE) +ifndef DYNAMIC_ARCH +ifndef NO_AVX512 +CCOMMON_OPT += -march=cooperlake +FCOMMON_OPT += -march=cooperlake +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += 
-fno-asynchronous-unwind-tables +endif +endif +endif +endif +endif + ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 diff --git a/TargetList.txt b/TargetList.txt index 8ea2df9b7..5934f3012 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -22,6 +22,7 @@ SANDYBRIDGE HASWELL SKYLAKEX ATOM +COOPERLAKE b)AMD CPU: ATHLON diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5388156bc..c00f8fe71 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -76,9 +76,9 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - endif () + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d5551147c..88cf9f573 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -103,3 +103,11 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () endif () + +if (${CORE} STREQUAL "COOPERLAKE") + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") + endif () + endif () +endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index d8dcc3cf3..2838e279f 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") @@ -45,6 +45,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () if (DEFINED TARGET) + if (${TARGET} 
STREQUAL "COOPERLAKE" AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() diff --git a/cpuid.h b/cpuid.h index 697f43133..824e0bc70 100644 --- a/cpuid.h +++ b/cpuid.h @@ -118,6 +118,7 @@ #define CORE_ZEN 27 #define CORE_SKYLAKEX 28 #define CORE_DHYANA 29 +#define CORE_COOPERLAKE 30 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -137,11 +138,12 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) -#define HAVE_AVX (1 << 18) -#define HAVE_FMA4 (1 << 19) -#define HAVE_FMA3 (1 << 20) -#define HAVE_AVX512VL (1 << 21) -#define HAVE_AVX2 (1 << 22) +#define HAVE_AVX (1 << 18) +#define HAVE_FMA4 (1 << 19) +#define HAVE_FMA3 (1 << 20) +#define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) +#define HAVE_AVX512BF16 (1 << 23) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -218,7 +220,8 @@ typedef struct { #define CPUTYPE_ZEN 51 #define CPUTYPE_SKYLAKEX 52 #define CPUTYPE_DHYANA 53 +#define CPUTYPE_COOPERLAKE 54 -#define CPUTYPE_HYGON_UNKNOWN 54 +#define CPUTYPE_HYGON_UNKNOWN 99 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index ea846a392..728d459d1 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -249,6 +249,22 @@ int support_avx512(){ #endif } +int support_avx512_bf16(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); + if((eax & 32) == 32){ + ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not + } + return ret; +#else + return 0; +#endif +} int get_vendor(void){ int eax, ebx, ecx, edx; @@ -335,6 +351,7 @@ int get_cputype(int gettype){ if (support_avx()) feature |= HAVE_AVX; if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; + if 
(support_avx512_bf16()) feature |= HAVE_AVX512BF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1337,6 +1354,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 5: // Skylake X + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) @@ -1677,7 +1696,8 @@ static char *cpuname[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *lowercpuname[] = { @@ -1733,7 +1753,8 @@ static char *lowercpuname[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; static char *corename[] = { @@ -1766,7 +1787,8 @@ static char *corename[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *corename_lower[] = { @@ -1799,7 +1821,8 @@ static char *corename_lower[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; @@ -2007,7 +2030,9 @@ int get_coretype(void){ case 5: // Skylake X #ifndef NO_AVX512 - return CORE_SKYLAKEX; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; #else if(support_avx()) #ifndef NO_AVX2 @@ -2276,6 +2301,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); + if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2346,6 +2372,7 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); + if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) 
printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); diff --git a/driver/level3/level3.c b/driver/level3/level3.c index c6bbb9ca9..a38506585 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 5a8d497d2..6e1fd9e99 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 9117090b5..1027c0c73 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,7 +135,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -205,7 +205,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j 
+ js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -300,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -370,7 +370,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 62c6a2442..e8df7fb21 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; 
#else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/parameter.c b/driver/others/parameter.c index b1f3befae..5d312fa87 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -180,9 +180,10 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ - defined(CORE_PRESCOTT) 
|| defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) + defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -266,7 +267,9 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ + defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ + defined(SKYLAKEX) || defined(COOPERLAKE) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 51c9a84e5..83043bdf2 100644 --- a/getarch.c +++ b/getarch.c @@ -365,6 +365,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif +#ifdef FORCE_COOPERLAKE +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "COOPERLAKE" +#define ARCHCONFIG "-DCOOPERLAKE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" +#define LIBNAME "cooperlake" +#define CORENAME "COOPERLAKE" +#endif +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d1349c5f8..2f448e8f8 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -127,7 +127,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) set(USE_TRMM true) endif () if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) diff --git a/kernel/Makefile b/kernel/Makefile index db3282c05..0c883cd96 
100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -37,7 +37,17 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), SKYLAKEX) +ifeq ($(TARGET_CORE), COOPERLAKE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=cooperlake + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8df306d5f..bee8b216a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -39,6 +39,10 @@ ifeq ($(CORE), SKYLAKEX) USE_TRMM = 1 endif +ifeq ($(CORE), COOPERLAKE) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d3aa030c1..a0bdc7e14 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1166,7 +1166,7 @@ static void init_parameter(void) { #endif #endif -#ifdef SKYLAKEX +#if defined (SKYLAKEX) || defined (COOPERLAKE) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 34653d400..fde9eba8e 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 492f34344..fddf7560f 100644 --- 
a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 6840c54ad..33afd2a61 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index e2f731fca..b05bd6ee5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 11825429e..f960559a6 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH 
prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 4c054f399..cf842c9b5 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e67496736..63c44c27a 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 498057697..4cb01e50a 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index f3072983d..09d5d8e43 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || 
defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 879ae9c38..7d129e54c 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index 6c308197b..d33599317 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE new file mode 100644 index 000000000..0b2f3c0ed --- /dev/null +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.SKYLAKEX diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 586d05ac2..c19b98f02 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index f71d7b6b4..f2bf19dcd 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index d81766cd4..0ed02b8d8 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index f44fe7247..c2903b11f 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 72af99809..6d75358a6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index cde5bdaa6..d84c0c221 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 969357614..e4b6622e6 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d33641e9..da68db0cd 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ed672a757..a3bf28dc8 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index e2436f789..d1270d20b 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index a722cc9df..573377ee0 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 431e4bb3f..530ac8b1d 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index e1349da58..7b2845636 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 3536afc9e..e816c67e9 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 63697970f..3eec21774 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 86ecaf516..fe886f57f 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 73ae001ea..c9d698eb7 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index f37c251a1..4d8aac1ab 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8a5c44c9b..fea4fc746 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 0c40a3435..b853ef365 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 7a2eeace5..bad367e91 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git 
a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 0408b577c..147201751 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 53866cf95..25e9f6d42 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 423a6f23e..90fd86daf 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 0fedc496b..1f9d41859 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 6221471f7..34f28b224 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 2a6d0e4c7..09a702a81 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index e44bd7550..83ed41ba1 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index e9f330c36..7ed2faf0f 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 9f0dead18..5945f3f81 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S 
b/kernel/x86_64/zsymv_U_sse2.S index b6106a37d..484d74f14 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 3e539a2b8..1ab982dc5 100644 --- a/param.h +++ b/param.h @@ -1748,6 +1748,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef COOPERLAKE + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define 
DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif #ifdef ATOM From c62aad62e551cc238cee2e4f78169c62df88bc63 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Aug 2020 00:35:45 +0200 Subject: [PATCH 358/593] Fix incorrect calls to DLASET Reference-LAPACK issue 429 --- lapack-netlib/TESTING/EIG/cchkhb2stg.f | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f 
b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index 61537f44b..cd884febf 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -680,8 +680,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From d64f1ef26bc7c7f3ee6b54aaa2d394cf7842456d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Aug 2020 00:40:24 +0200 Subject: [PATCH 359/593] Fix incorrect argument to SLASET Reference-LAPACK issue 425 (and 318) --- lapack-netlib/TESTING/EIG/schksb2stg.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/schksb2stg.f b/lapack-netlib/TESTING/EIG/schksb2stg.f index 07b6fa95c..7308bb690 100644 --- a/lapack-netlib/TESTING/EIG/schksb2stg.f +++ b/lapack-netlib/TESTING/EIG/schksb2stg.f @@ -670,8 +670,8 @@ * the one from above. Compare it with D1 computed * using the SSBTRD. 
* - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL SLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From 597010a9688c9f5688dc459ba92ef8a28ea20769 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Aug 2020 00:41:56 +0200 Subject: [PATCH 360/593] Fix incorrect argument to SLASET Reference-LAPACK issue 425 (and 318) --- lapack-netlib/TESTING/EIG/schkst2stg.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index f386ab43c..83edb9dce 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -999,8 +999,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL SLACPY( "U", N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From f5fcc5baec1c5aea7dbd7a2a8fdd41ae8b422a6e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 13:30:29 +0200 Subject: [PATCH 361/593] Add trivial gemm test for multithread consistency --- cpp_thread_test/gemm64.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 cpp_thread_test/gemm64.cpp diff --git a/cpp_thread_test/gemm64.cpp b/cpp_thread_test/gemm64.cpp new file mode 100644 index 000000000..2c3442a2e --- /dev/null +++ b/cpp_thread_test/gemm64.cpp @@ -0,0 +1,20 @@ +#include +#include +int main ( int argc, char* argv[] ) { + const long n = ((long)1 << 31) - 1; + std::cout << n < Date: Sat, 15 Aug 2020 13:31:28 +0200 Subject: [PATCH 362/593] Update gemm64.cpp --- cpp_thread_test/gemm64.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/cpp_thread_test/gemm64.cpp b/cpp_thread_test/gemm64.cpp index 2c3442a2e..df38416fa 100644 --- a/cpp_thread_test/gemm64.cpp +++ b/cpp_thread_test/gemm64.cpp @@ -1,5 +1,6 @@ #include -#include +#include "common.h" +#include "cblas.h" int main ( int argc, char* argv[] ) { const long n = ((long)1 << 31) - 1; std::cout << n < Date: Sat, 15 Aug 2020 13:33:52 +0200 Subject: [PATCH 363/593] Add simple sgemm precision test --- cpp_thread_test/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 81e3470ef..0dc7229d7 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -10,5 +10,9 @@ dgemm_tester : dgemv_tester $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester ./dgemm_tester +gemm64 : gemm64 + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 + ./gemm64 + clean :: - rm -f dgemv_tester dgemm_tester + rm -f dgemv_tester dgemm_tester gemm64 From 37ac23e8a36049d875d01887b292ec11751fccc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 13:38:05 +0200 Subject: [PATCH 364/593] Add simple MT sgemm precision test and INTERFACE64 build --- .drone.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.drone.yml b/.drone.yml index b1c211d14..fb009d46e 100644 --- a/.drone.yml +++ b/.drone.yml @@ -190,3 +190,29 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 +--- +kind: pipeline +name: epyc_native_test_int64 + +platform: + os: linux + arch: amd64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC
--version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 From d57d503c150bb40e1478b88735818c1b76d64ed2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 14:46:26 +0200 Subject: [PATCH 365/593] Update Makefile --- cpp_thread_test/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 0dc7229d7..0d78990eb 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -11,7 +11,7 @@ dgemm_tester : dgemv_tester ./dgemm_tester gemm64 : gemm64 - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 + $(CXX) $(COMMON_OPT) -I.. -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 ./gemm64 clean :: From 82f8a0aebabab6e81386b75b6f172abb692dd31c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 15:46:18 +0200 Subject: [PATCH 366/593] Update .drone.yml --- .drone.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.drone.yml b/.drone.yml index fb009d46e..e8353eb5c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -166,6 +166,32 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 +--- +kind: pipeline +name: arm64_native_test_int64 + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C 
cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 --- kind: pipeline name: epyc_native_test From 5ec8f716cf181b70352fa15a7beb45fc886312de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Aug 2020 15:19:40 +0200 Subject: [PATCH 367/593] revert --- .drone.yml | 52 ---------------------------------------------------- 1 file changed, 52 deletions(-) diff --git a/.drone.yml b/.drone.yml index e8353eb5c..b1c211d14 100644 --- a/.drone.yml +++ b/.drone.yml @@ -166,32 +166,6 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 ---- -kind: pipeline -name: arm64_native_test_int64 - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl python g++ - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 --- kind: pipeline name: epyc_native_test @@ -216,29 +190,3 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 ---- -kind: pipeline -name: epyc_native_test_int64 - -platform: - os: linux - arch: amd64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl python g++ - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 From 
a8c6fb9e1ce4d6cb3d4e8a782f9c4c69469aae91 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Aug 2020 15:20:16 +0200 Subject: [PATCH 368/593] revert --- cpp_thread_test/Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 0d78990eb..81e3470ef 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -10,9 +10,5 @@ dgemm_tester : dgemv_tester $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester ./dgemm_tester -gemm64 : gemm64 - $(CXX) $(COMMON_OPT) -I.. -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 - ./gemm64 - clean :: - rm -f dgemv_tester dgemm_tester gemm64 + rm -f dgemv_tester dgemm_tester From 6bfc66663c4b3bbd2c5f7ac05a150d2c4bd94af4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Aug 2020 15:20:41 +0200 Subject: [PATCH 369/593] revert --- cpp_thread_test/gemm64.cpp | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 cpp_thread_test/gemm64.cpp diff --git a/cpp_thread_test/gemm64.cpp b/cpp_thread_test/gemm64.cpp deleted file mode 100644 index df38416fa..000000000 --- a/cpp_thread_test/gemm64.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include "common.h" -#include "cblas.h" -int main ( int argc, char* argv[] ) { - const long n = ((long)1 << 31) - 1; - std::cout << n < Date: Mon, 17 Aug 2020 15:32:14 +0200 Subject: [PATCH 370/593] Add typedef for bfloat16 if needed --- openblas_config_template.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openblas_config_template.h b/openblas_config_template.h index 49aea1cab..9955e5c73 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -34,6 +34,10 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif +#ifndef BFLOAT16 +typedef unsigned short bfloat16; +#endif + #ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; 
#else From 6b731d917f9049ba426a82dccf9b7bdbcfd1bab3 Mon Sep 17 00:00:00 2001 From: Albert Ziegenhagel Date: Tue, 18 Aug 2020 08:48:48 +0200 Subject: [PATCH 371/593] Do not require pkg-config to generate the *.pc file Generating the pkg-config file does not actually depend on pkg-config being available. --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c324e2241..4b82d7670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -389,11 +389,9 @@ if(NOT NO_LAPACKE) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() -include(FindPkgConfig QUIET) -if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) -endif() +# Install pkg-config files +configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) +install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". 
From 75eeb265d7c5715f05b63e8706593ef6d8485627 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 14:51:09 +0200 Subject: [PATCH 372/593] [WIP] Refactor the driver code for direct SGEMM (#2782) Move "direct SGEMM" functionality out of the SkylakeX SGEMM kernel and make it available (on x86_64 targets only for now) in DYNAMIC_ARCH builds * Add sgemm_direct targets in the kernel Makefile.L3 and CMakeLists.txt * Add direct_sgemm functions to the gotoblas struct in common_param.h * Move sgemm_direct_performant helper to separate file * Update gemm.c to macros for sgemm_direct to support dynamic_arch naming via common_s,h * (Conditionally) add sgemm_direct functions in setparam-ref.c --- common_level3.h | 4 +-- common_param.h | 5 ++++ common_s.h | 12 ++++++++ interface/gemm.c | 4 +-- kernel/CMakeLists.txt | 14 +++++++++ kernel/Makefile.L3 | 24 ++++++++++++++++ kernel/setparam-ref.c | 5 ++++ kernel/x86_64/sgemm_direct_performant.c | 30 ++++++++++++++++++++ kernel/x86_64/sgemm_direct_skylakex.c | 17 +++++++---- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 2 +- 10 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 kernel/x86_64/sgemm_direct_performant.c diff --git a/common_level3.h b/common_level3.h index 4e44a5e73..671a7a086 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif -extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, +void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, float * A, BLASLONG strideA, float * B, BLASLONG strideB, float * R, BLASLONG strideR); -extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, diff --git a/common_param.h b/common_param.h index c92609a76..0437482dc 100644 --- a/common_param.h +++ b/common_param.h @@ -175,6 +175,11 @@ 
BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#ifdef ARCH_X86_64 + void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); + int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); +#endif + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_s.h b/common_s.h index 23c432f7c..34903ec49 100644 --- a/common_s.h +++ b/common_s.h @@ -45,6 +45,10 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct + #define SGEMM_ONCOPY sgemm_oncopy #define SGEMM_OTCOPY sgemm_otcopy @@ -204,6 +208,14 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L +#ifdef ARCH_X86_64 +#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant +#define SGEMM_DIRECT gotoblas -> sgemm_direct +#else +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct +#endif + #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy #define SGEMM_OTCOPY gotoblas -> sgemm_otcopy #define SGEMM_INCOPY gotoblas -> sgemm_incopy diff --git a/interface/gemm.c b/interface/gemm.c index 99388e7d9..860e588fe 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -275,8 +275,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { - sgemm_kernel_direct(m, 
n, k, a, lda, b, ldb, c, ldc); + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d1349c5f8..d9fba6aca 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -134,6 +134,20 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () + set(USE_DIRECT_SGEMM false) + if (X86_64) + set(USE_DIRECT_SGEMM true) + endif() + + if (USE_DIRECT_SGEMM) + # if (NOT DEFINED SGEMMDIRECTKERNEL) + set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) + set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) + # endif() + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) + endif() + foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "HALF") diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8df306d5f..a176b47fe 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64) USE_GEMM3M = 1 endif +ifeq ($(ARCH), x86_64) +USE_DIRECT_SGEMM = 1 +endif + ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif @@ -65,6 +69,13 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +ifdef USE_DIRECT_SGEMM +ifndef SGEMMDIRECTKERNEL +SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c +endif +endif + ifeq ($(BUILD_HALF), 1) ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c @@ -90,6 +101,12 @@ SKERNELOBJS += \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) +ifdef USE_DIRECT_SGEMM +SKERNELOBJS += \ + sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif + DKERNELOBJS += \ 
dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ @@ -668,6 +685,13 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifdef USE_DIRECT_SGEMM +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d3aa030c1..d3845003a 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -135,6 +135,11 @@ gotoblas_t TABLE_NAME = { sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, +#ifdef ARCH_X86_64 + sgemm_directTS, + sgemm_direct_performantTS, +#endif + sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, diff --git a/kernel/x86_64/sgemm_direct_performant.c b/kernel/x86_64/sgemm_direct_performant.c new file mode 100644 index 000000000..5a20ce395 --- /dev/null +++ b/kernel/x86_64/sgemm_direct_performant.c @@ -0,0 +1,30 @@ +#include "common.h" +/* helper for the direct sgemm code written by Arjan van der Ven */ + + + + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) +{ + unsigned long long mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index 
0e8f1318f..a7cddbb3d 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,7 @@ - +#if defined(SKYLAKEX) || defined (COOPERLAKE) /* the direct sgemm code written by Arjan van der Ven */ -//#include - +#include +#include "common.h" /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, @@ -38,6 +38,7 @@ #define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; #define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; +#if 0 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { unsigned long long mnk = M * N * K; @@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) return 1; } +#endif - -void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) { int i, j, k; @@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict } } } +#else +#include "common.h" +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{} +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index 3b1af33c1..f3d614242 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -512,4 +512,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float 
alpha, float * __restrict__ A, f return 0; } #include -#include "sgemm_direct_skylakex.c" +//#include "sgemm_direct_skylakex.c" From bb9cf766f5cfd5112adebfeb30f916350854b05d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 15:06:30 +0200 Subject: [PATCH 373/593] make march=cooperlake option conditional on gcc >= 10.1 --- Makefile.x86_64 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 96e9dbe44..00975b25a 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -30,8 +30,15 @@ endif ifeq ($(CORE), COOPERLAKE) ifndef DYNAMIC_ARCH ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# cooperlake support was added in 10.1 +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) +ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake +endif +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables From 81fbe8d08858ae0f1dd4de1bc5dfad864d8358f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 16:10:15 +0200 Subject: [PATCH 374/593] -march=cooperlake only available in gcc >= 10 --- kernel/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 0c883cd96..d5078c5ba 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,6 +8,7 @@ include $(TOPDIR)/Makefile.system ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) endif ifeq ($(ARCH), power) @@ -38,7 +39,12 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), COOPERLAKE) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=cooperlake + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) 
+ ifeq ($(GCCVERSIONGTEQ10, 1) + override CFLAGS += -march=cooperlake + else + override CFLAGS += -march=skylake-avx512 + endif ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables endif From 6f4dc7445d220ffd38e0ceaa17f983e359713760 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 16:36:55 +0200 Subject: [PATCH 375/593] Fix typo --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index d5078c5ba..16211218f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10, 1) + ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake else override CFLAGS += -march=skylake-avx512 From 430f741b302d98e0dab3eab2675cb7b4f7d096ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 17:17:53 +0200 Subject: [PATCH 376/593] -march=cooperlake requires gcc10 --- cmake/cc.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 88cf9f573..d7608220c 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -107,7 +107,10 @@ endif () if (${CORE} STREQUAL "COOPERLAKE") if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + endif() endif () endif () endif () From 6a3c07478682770ad05a3046ac0523bdde7050b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 17:22:12 +0200 Subject: [PATCH 377/593] -march=cooperlake requires gcc10 --- cmake/system.cmake | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake 
b/cmake/system.cmake index 2838e279f..b4ffc1803 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -46,7 +46,14 @@ endif () if (DEFINED TARGET) if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 71d33c952da2ad57dbec3e4e48556db0f4f17610 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 17:44:23 +0200 Subject: [PATCH 378/593] Typo fix --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index b4ffc1803..1b4368589 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -48,7 +48,7 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL10.1) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 7c1986640b3be7ffd97f908cb6171f9b2b515c36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 20:48:39 +0200 Subject: [PATCH 379/593] fallback from cooperlake to skylake if gcc<10 --- cmake/cc.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake 
index d7608220c..c490dd9ab 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -110,6 +110,8 @@ if (${CORE} STREQUAL "COOPERLAKE") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() endif () endif () From b8ebfc933562cf2c55e6147a791d29aff0d4ef6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 22:30:19 +0200 Subject: [PATCH 380/593] Update system.cmake --- cmake/system.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1b4368589..827ff5adb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -46,13 +46,15 @@ endif () if (DEFINED TARGET) if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) From bd3207b4b437bf6927043b3bcdd135ac29f2a6a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 22:51:10 +0200 Subject: [PATCH 381/593] Update system.cmake --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 827ff5adb..e3617c4e2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -55,7 +55,7 @@ if (DEFINED TARGET) endif() # elseif (${CMAKE_C_COMPILER_ID} 
STREQUAL "CLANG") # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() +# endif() endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 7c0977c267b19179a847b8fbe74b5ecfdadbaa48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Aug 2020 13:53:44 +0200 Subject: [PATCH 382/593] Add OpenMP dependency to pkgconfig file if needed --- cmake/openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index df4b2ab06..0bd49f996 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -7,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} From 1840bc5b523ff5dc17eebdbff3c0784a4ae1f03f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Aug 2020 13:55:18 +0200 Subject: [PATCH 383/593] Add OpenMP dependency to pkgconfig file if needed --- Makefile.install | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 01c0b1226..7c1a3ca43 100644 --- a/Makefile.install +++ b/Makefile.install @@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig +PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(USE_OPENMP), 1) + ifeq ($(C_COMPILER), PGI) + PKG_EXTRALIB += -lomp + else + PKG_EXTRALIB += -lgomp + endif +endif .PHONY : install .NOTPARALLEL : install @@ -147,7 +155,7 @@ endif @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'openblas_config= 
USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" From b2053239fc36f9ca8c29286d8fc553d0200907b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Aug 2020 15:08:16 +0200 Subject: [PATCH 384/593] Fix missing dummy parameter (imag part of alpha) of zdot_thread_function --- kernel/x86_64/zdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 90fd86daf..1bc785ac1 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -168,7 +168,7 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO #if defined(SMP) static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, -BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG dummy1, FLOAT dummy2r, FLOAT dummy2i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { zdot_compute(n, x, inc_x, y, inc_y, (void *)result); From 0c1c903f1eb79719aa159b497cc2089d9fe61556 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Wed, 12 Aug 2020 03:28:25 +0800 Subject: [PATCH 385/593] Fix OMP num specify issue In current code, no matter what number of threads is specified, all available CPU count is used when invoking OMP, which leads to very bad performance if the workload is small while all available CPUs are big. A lot of time is wasted on inter-thread sync.
Fix this issue by really using the number specified by the variable 'num' from calling API. Signed-off-by: Chen, Guobing --- driver/others/blas_server_omp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index b4eb27c25..d9969b599 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(OMP_SCHED) +#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 From 48a1364e105fccc7162adeab0de22487d52d88d3 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Sun, 23 Aug 2020 18:50:19 +0000 Subject: [PATCH 386/593] Add aliases for armv6, armv7 FreeBSD uses those names for 32-bit ARM variants. --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 2286d14f2..e7d3dc4ce 100644 --- a/Makefile.system +++ b/Makefile.system @@ -25,6 +25,10 @@ else ifeq ($(ARCH), powerpc) override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 +else ifeq ($(ARCH), armv6) +override ARCH=arm +else ifeq ($(ARCH), armv7) +override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 else ifeq ($(ARCH), zarch) From 936966a42c1f2f0c63b49dc0a47e7e3039e520eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 10:59:08 +0200 Subject: [PATCH 387/593] Make ILAENV and xGETRF2 functions available --- relapack/src/lapack.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h index 776b0589f..9e9cdff7e 100644 --- a/relapack/src/lapack.h +++ b/relapack/src/lapack.h @@ -4,6 +4,13 @@ extern blasint LAPACK(lsame)(const char *, const char *); extern blasint LAPACK(xerbla)(const char *, const blasint *, int); +extern const blasint LAPACK(ilaenv)(const blasint *, const char*, const char*, const blasint* 
, int , int, int ); + +extern void LAPACK(sgetrf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dgetrf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(cgetrf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zgetrf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); + extern void LAPACK(slaswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); extern void LAPACK(dlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); extern void LAPACK(claswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); From 6797a3a1e0b3ad9f5df62e2b751c8d5ac50cbaf5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:15:12 +0200 Subject: [PATCH 388/593] Add early returns --- relapack/src/cgetrf.c | 9 +++++++-- relapack/src/chegst.c | 2 ++ relapack/src/chetrf_rook.c | 4 ++-- relapack/src/clauum.c | 2 ++ relapack/src/cpotrf.c | 3 +++ relapack/src/csytrf.c | 3 ++- relapack/src/csytrf_rook.c | 4 ++-- relapack/src/ctgsyl.c | 7 +++++++ relapack/src/ctrsyl.c | 5 +++++ relapack/src/ctrtri.c | 2 ++ 10 files changed, 34 insertions(+), 7 deletions(-) diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 878c9ec15..bf9ca53f4 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -30,6 +30,8 @@ void RELAPACK_cgetrf( return; } + if (*m == 0 || *n == 0) return; + const blasint sn = MIN(*m, *n); RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -62,9 +64,11 @@ static void RELAPACK_cgetrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_CGETRF, 1)) { + if (*m == 0 || *n == 0) return; + + if ( *n <= MAX(CROSSOVER_CGETRF, 1)) { // Unblocked - LAPACK(cgetf2)(m, n, A, ldA, ipiv, 
info); + LAPACK(cgetrf2)(m, n, A, ldA, ipiv, info); return; } @@ -96,6 +100,7 @@ static void RELAPACK_cgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + if (*info) return; // apply pivots to A_R LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c index fe77b03ea..8557c2952 100644 --- a/relapack/src/chegst.c +++ b/relapack/src/chegst.c @@ -40,6 +40,8 @@ void RELAPACK_chegst( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c index 3d2fa3216..9ed1261cf 100644 --- a/relapack/src/chetrf_rook.c +++ b/relapack/src/chetrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_chetrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_chetrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); + LAPACK(xerbla)("CHETRF_ROOK", &minfo, strlen("CHETRF_ROOK")); return; } diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c index 2bc93f182..58a14e7da 100644 --- a/relapack/src/clauum.c +++ b/relapack/src/clauum.c @@ -32,6 +32,8 @@ void RELAPACK_clauum( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c index 0f8e7ebb0..db06c6fef 100644 --- a/relapack/src/cpotrf.c +++ b/relapack/src/cpotrf.c @@ -32,6 +32,8 @@ void RELAPACK_cpotrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 
'L' : 'U'; @@ -46,6 +48,7 @@ static void RELAPACK_cpotrf_rec( float *A, const blasint *ldA, blasint *info ){ + if (*n == 0) return; if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { // Unblocked diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c index 2ebc31001..807c91ece 100644 --- a/relapack/src/csytrf.c +++ b/relapack/src/csytrf.c @@ -36,7 +36,7 @@ void RELAPACK_csytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -67,6 +67,7 @@ void RELAPACK_csytrf( blasint nout; // Recursive kernel +if (*n != 0) RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c index e8a9865cc..105c6b8b6 100644 --- a/relapack/src/csytrf_rook.c +++ b/relapack/src/csytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_csytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_csytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); + LAPACK(xerbla)("CSYTRF_ROOK", &minfo, strlen("CSYTRF_ROOK")); return; } diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c index 704f3ef23..632bbc14e 100644 --- a/relapack/src/ctgsyl.c +++ b/relapack/src/ctgsyl.c @@ -68,6 +68,13 @@ void RELAPACK_ctgsyl( return; } + if ( *m == 0 || *n == 0) { + *scale = 1.; + if (notran && (*ijob != 0)) + *dif = 0.; + return; + } + // Clean char * arguments const char cleantrans = notran ? 
'N' : 'C'; diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c index fed6e847e..f7b841cb0 100644 --- a/relapack/src/ctrsyl.c +++ b/relapack/src/ctrsyl.c @@ -47,6 +47,11 @@ void RELAPACK_ctrsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : 'C'; const char cleantranB = notransB ? 'N' : 'C'; diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c index 5201a24c7..8d736007b 100644 --- a/relapack/src/ctrtri.c +++ b/relapack/src/ctrtri.c @@ -36,6 +36,8 @@ void RELAPACK_ctrtri( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; const char cleandiag = nounit ? 'N' : 'U'; From c9b67141f0827ccdfefd0197b6f0daba50f35dc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:20:31 +0200 Subject: [PATCH 389/593] Add early returns --- relapack/src/dgetrf.c | 12 ++++++------ relapack/src/dsytrf.c | 3 ++- relapack/src/dsytrf_rook.c | 4 ++-- relapack/src/dtrsyl.c | 5 +++++ relapack/src/zgetrf.c | 9 +++++++-- relapack/src/zhetrf_rook.c | 4 ++-- relapack/src/zsytrf.c | 3 ++- relapack/src/zsytrf_rook.c | 5 +++-- relapack/src/ztrsyl.c | 5 +++++ relapack/src/ztrtri.c | 4 ++-- 10 files changed, 36 insertions(+), 18 deletions(-) diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index be960fde9..3ebfb18d2 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -29,15 +29,16 @@ void RELAPACK_dgetrf( return; } - const blasint sn = MIN(*m, *n); + if (*m == 0 || *n == 0) return; + const blasint sn = MIN(*m, *n); RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder if (*m < *n) { // Constants const double ONE[] = { 1. }; - const blasint iONE[] = { 1. 
}; + const blasint iONE[] = { 1 }; // Splitting const blasint rn = *n - *m; @@ -60,13 +61,11 @@ static void RELAPACK_dgetrf_rec( double *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - - if (*n <= MAX(CROSSOVER_DGETRF, 1)) { + if ( *n <= MAX(CROSSOVER_DGETRF, 1)) { // Unblocked - LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(dgetrf2)(m, n, A, ldA, ipiv, info); return; } - // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; @@ -95,6 +94,7 @@ static void RELAPACK_dgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + if (*info) return; // apply pivots to A_R LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c index 43d28f94e..ba869ad11 100644 --- a/relapack/src/dsytrf.c +++ b/relapack/src/dsytrf.c @@ -36,7 +36,7 @@ void RELAPACK_dsytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -67,6 +67,7 @@ void RELAPACK_dsytrf( blasint nout; // Recursive kernel +if (*n != 0) RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c index 78fa652ab..fcdc2809f 100644 --- a/relapack/src/dsytrf_rook.c +++ b/relapack/src/dsytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_dsytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork <1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_dsytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); + LAPACK(xerbla)("DSYTRF_ROOK", &minfo, strlen("DSYTRF_ROOK")); return; } diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c index 
766377300..4948c4977 100644 --- a/relapack/src/dtrsyl.c +++ b/relapack/src/dtrsyl.c @@ -49,6 +49,11 @@ void RELAPACK_dtrsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index b0d14ffb1..8c3e8a8e8 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -30,6 +30,7 @@ void RELAPACK_zgetrf( return; } + if (*m == 0 || *n == 0) return; const blasint sn = MIN(*m, *n); RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -62,9 +63,11 @@ static void RELAPACK_zgetrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { + if (*m == 0 || *n == 0) return; + + if ( *n <= MAX(CROSSOVER_ZGETRF, 1)) { // Unblocked - LAPACK(zgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(zgetrf2)(m, n, A, ldA, ipiv, info); return; } @@ -96,6 +99,8 @@ static void RELAPACK_zgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); +if (*info) return; + // apply pivots to A_R LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c index 285aea96e..605e3a77f 100644 --- a/relapack/src/zhetrf_rook.c +++ b/relapack/src/zhetrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_zhetrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_zhetrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); + LAPACK(xerbla)("ZHETRF_ROOK", &minfo, strlen("ZHETRF_ROOK")); return; } diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c index f3412ad8f..59daba02f 100644 --- a/relapack/src/zsytrf.c +++ b/relapack/src/zsytrf.c @@ -36,7 +36,7 
@@ void RELAPACK_zsytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -67,6 +67,7 @@ void RELAPACK_zsytrf( blasint nout; // Recursive kernel + if (*n != 0) RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c index fc6d73645..0fd8e7033 100644 --- a/relapack/src/zsytrf_rook.c +++ b/relapack/src/zsytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_zsytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_zsytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); + LAPACK(xerbla)("ZSYTRF_ROOK", &minfo, strlen("ZSYTRF_ROOK")); return; } @@ -67,6 +67,7 @@ void RELAPACK_zsytrf_rook( blasint nout; // Recursive kernel + if (*n != 0) RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c index 567ef115a..9d0107526 100644 --- a/relapack/src/ztrsyl.c +++ b/relapack/src/ztrsyl.c @@ -47,6 +47,11 @@ void RELAPACK_ztrsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : 'C'; const char cleantranB = notransB ? 'N' : 'C'; diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c index 3f6606d84..54854f525 100644 --- a/relapack/src/ztrtri.c +++ b/relapack/src/ztrtri.c @@ -69,8 +69,8 @@ static void RELAPACK_ztrtri_rec( } // Constants - const double ONE[] = { 1. }; - const double MONE[] = { -1. }; + const double ONE[] = { 1., 0. 
}; + const double MONE[] = { -1. , 0. }; // Splitting const blasint n1 = ZREC_SPLIT(*n); From d64cc2be8143225330bbc5b7877b155a1df3a90f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:22:50 +0200 Subject: [PATCH 390/593] Add early returns --- relapack/src/sgetrf.c | 15 +++++++++++---- relapack/src/ssytrf.c | 3 ++- relapack/src/ssytrf_rook.c | 5 +++-- relapack/src/strsyl.c | 5 +++++ 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 0231cc166..a0c7015fd 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -14,7 +14,6 @@ void RELAPACK_sgetrf( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - // Check arguments *info = 0; if (*m < 0) @@ -28,6 +27,9 @@ void RELAPACK_sgetrf( LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } + + if (*m == 0 || *n == 0) return; + const blasint sn = MIN(*m, *n); RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -35,7 +37,7 @@ void RELAPACK_sgetrf( if (*m < *n) { // Constants const float ONE[] = { 1. }; - const blasint iONE[] = { 1. 
}; + const blasint iONE[] = { 1 }; // Splitting const blasint rn = *n - *m; @@ -58,9 +60,12 @@ static void RELAPACK_sgetrf_rec( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { + + if (*m == 0 || *n == 0) return; + + if ( *n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked - LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(sgetrf2)(m, n, A, ldA, ipiv, info); return; } @@ -91,6 +96,8 @@ static void RELAPACK_sgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + if (*info) + return; // apply pivots to A_R LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c index 9fe7ce4a6..5f8e03391 100644 --- a/relapack/src/ssytrf.c +++ b/relapack/src/ssytrf.c @@ -35,7 +35,7 @@ void RELAPACK_ssytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork <1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -66,6 +66,7 @@ void RELAPACK_ssytrf( blasint nout; // Recursive kernel +if (*n != 0) RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c index abcf29d1c..b40f12271 100644 --- a/relapack/src/ssytrf_rook.c +++ b/relapack/src/ssytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_ssytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 ||*lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_ssytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); + LAPACK(xerbla)("SSYTRF_ROOK", &minfo, strlen("SSYTRF_ROOK")); return; } @@ -67,6 +67,7 @@ void RELAPACK_ssytrf_rook( blasint nout; // Recursive kernel +if (*n != 0) 
RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c index 012fb3548..d85963fcc 100644 --- a/relapack/src/strsyl.c +++ b/relapack/src/strsyl.c @@ -49,6 +49,11 @@ void RELAPACK_strsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); From de636757173680ba0a936588ca7b42cdf7ff6c9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:25:18 +0200 Subject: [PATCH 391/593] Add early returns and fix sign errors in workspace calculations --- relapack/src/cgbtrf.c | 11 ++++++----- relapack/src/cpbtrf.c | 10 ++++++---- relapack/src/dgbtrf.c | 5 ++++- relapack/src/dpbtrf.c | 10 ++++++---- relapack/src/sgbtrf.c | 16 ++++++++++++---- relapack/src/spbtrf.c | 13 +++++++++---- relapack/src/zgbtrf.c | 16 +++++++++++++++- relapack/src/zpbtrf.c | 11 +++++++---- 8 files changed, 65 insertions(+), 27 deletions(-) diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index 61332c6a6..e52f2e6c1 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -36,6 +36,7 @@ void RELAPACK_cgbtrf( return; } + if (*m == 0 || *n == 0) return; // Constant const float ZERO[] = { 0., 0. }; @@ -56,10 +57,10 @@ void RELAPACK_cgbtrf( // Allocate work space const blasint n1 = CREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ((*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ((*kl > n1) ? 
MAX(0, *n - *kl) : *kl); float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -82,7 +83,7 @@ static void RELAPACK_cgbtrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { + if (*n <= MAX(CROSSOVER_CGBTRF, 1)|| *n > *kl || *ldAb == 1) { // Unblocked LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c index 971e547c6..a0fa13850 100644 --- a/relapack/src/cpbtrf.c +++ b/relapack/src/cpbtrf.c @@ -35,6 +35,8 @@ void RELAPACK_cpbtrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,8 +45,8 @@ void RELAPACK_cpbtrf( // Allocate work space const blasint n1 = CREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs((*kd > n1) ? (lower ? *n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? n1 : *n - *kd) : *kd); float *Work = malloc(mWork * nWork * 2 * sizeof(float)); LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -64,7 +66,7 @@ static void RELAPACK_cpbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { + if (*n <= MAX(CROSSOVER_CPBTRF, 1) || *ldAb==1) { // Unblocked LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +150,7 @@ static void RELAPACK_cpbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index cdf06ad5b..aac10f251 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -36,6 +36,8 @@ void RELAPACK_dgbtrf( return; } + if (*m == 0 || *n == 0) return; + // Constant const double ZERO[] = { 0. 
}; @@ -83,7 +85,7 @@ static void RELAPACK_dgbtrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { + if (*n <= MAX(CROSSOVER_DGBTRF, 1) || *n > *kl || *ldAb == 1) { // Unblocked LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; @@ -195,6 +197,7 @@ static void RELAPACK_dgbtrf_rec( // Worku = A_TRr LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); // Worku = A_TL \ Worku + if (ldWorku <= 0) return; BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); // A_TRr = Worku LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c index 9380b28ad..94e9b80e2 100644 --- a/relapack/src/dpbtrf.c +++ b/relapack/src/dpbtrf.c @@ -35,6 +35,8 @@ void RELAPACK_dpbtrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,8 +45,8 @@ void RELAPACK_dpbtrf( // Allocate work space const blasint n1 = DREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs((*kd > n1) ? (lower ? *n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd); double *Work = malloc(mWork * nWork * sizeof(double)); LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -64,7 +66,7 @@ static void RELAPACK_dpbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { + if (*n <= MAX(CROSSOVER_DPBTRF, 1) || *ldAb == 1) { // Unblocked LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +150,7 @@ static void RELAPACK_dpbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3e3fdf455..76e84e671 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -35,6 +35,13 @@ void RELAPACK_sgbtrf( return; } + if (*m == 0 || *n == 0) return; + + if (*ldAb == 1) { + LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + // Constant const float ZERO[] = { 0. }; @@ -82,8 +89,9 @@ static void RELAPACK_sgbtrf_rec( blasint *info ) { + if (*m == 0 || *n == 0) return; - if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { + if ( *n <= MAX(CROSSOVER_SGBTRF, 1) || *n > *kl || *ldAb == 1) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; @@ -160,7 +168,7 @@ static void RELAPACK_sgbtrf_rec( // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); - + if (*info) return; // Workl = A_BLb LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); @@ -222,8 +230,8 @@ static void RELAPACK_sgbtrf_rec( // recursion(Ab_BR, ipiv_B) //cause of infinite recursion here ? 
-// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); - LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c index 26804dcc2..330276312 100644 --- a/relapack/src/spbtrf.c +++ b/relapack/src/spbtrf.c @@ -35,6 +35,9 @@ void RELAPACK_spbtrf( return; } + + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,8 +46,8 @@ void RELAPACK_spbtrf( // Allocate work space const blasint n1 = SREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs( (*kd > n1) ? (lower ? *n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? n1 : *n - *kd) : *kd); float *Work = malloc(mWork * nWork * sizeof(float)); LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -64,7 +67,9 @@ static void RELAPACK_spbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { + if (*n == 0 ) return; + + if ( *n <= MAX(CROSSOVER_SPBTRF, 1) || *ldAb == 1) { // Unblocked LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +153,7 @@ static void RELAPACK_spbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index d4ba41753..5d7dfd3c7 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -36,6 +36,8 @@ void RELAPACK_zgbtrf( return; } + if (*m == 0 || *n == 0) return; + // Constant const double ZERO[] = { 0., 0. 
}; @@ -82,7 +84,7 @@ static void RELAPACK_zgbtrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { + if (*n <= MAX(CROSSOVER_ZGBTRF, 1) || *n > *kl || *ldAb == 1) { // Unblocked LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; @@ -92,6 +94,7 @@ static void RELAPACK_zgbtrf_rec( const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; const blasint iONE[] = { 1 }; + const blasint min11 = -11; // Loop iterators blasint i, j; @@ -158,6 +161,7 @@ static void RELAPACK_zgbtrf_rec( // recursion(Ab_L, ipiv_T) RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); +if (*info) return; // Workl = A_BLb LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); @@ -193,11 +197,21 @@ static void RELAPACK_zgbtrf_rec( } // A_TRl = A_TL \ A_TRl + if (*ldA < MAX(1,m1)) { + LAPACK(xerbla)("ZGBTRF", &min11, strlen("ZGBTRF")); + return; + } else { BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + } // Worku = A_TRr LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); // Worku = A_TL \ Worku + if (*ldWorku < MAX(1,m1)) { + LAPACK(xerbla)("ZGBTRF", &min11, strlen("ZGBTRF")); + return; + } else { BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + } // A_TRr = Worku LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); // A_BRtl = A_BRtl - A_BLt * A_TRl diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c index fb0e1e97b..8b094380c 100644 --- a/relapack/src/zpbtrf.c +++ b/relapack/src/zpbtrf.c @@ -35,6 +35,8 @@ void RELAPACK_zpbtrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,9 +45,10 @@ void RELAPACK_zpbtrf( // Allocate work space const blasint n1 = ZREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs((*kd > n1) ? (lower ? 
*n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? n1 : *n - *kd) : *kd); double *Work = malloc(mWork * nWork * 2 * sizeof(double)); + LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); // Recursive kernel @@ -64,7 +67,7 @@ static void RELAPACK_zpbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { + if (*n <= MAX(CROSSOVER_ZPBTRF, 1) || *ldAb == 1) { // Unblocked LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +151,7 @@ static void RELAPACK_zpbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); From 085aae8bdb137ed2156f2bb4f005a17cd3106384 Mon Sep 17 00:00:00 2001 From: Kevin Adler Date: Thu, 27 Aug 2020 23:08:33 -0500 Subject: [PATCH 392/593] Fix compile error on AIX cpuid detection In 589c74a the cpuid detection was changed to use systemcfg, but a copy and paste error was introduced during some refactoring that caused POWER7 detection to reference CPUTYPE_POWER7 (which doesn't exist) instead of CPUTYPE_POWER6. 
--- cpuid_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index df3dc8668..b17493bc8 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -145,7 +145,7 @@ int detect(void){ if (implementation >= 0x40000u) return CPUTYPE_POWER10; else if (implementation & 0x20000) return CPUTYPE_POWER9; else if (implementation & 0x10000) return CPUTYPE_POWER8; - else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 + else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7 else if (implementation & 0x04000) return CPUTYPE_POWER6; else if (implementation & 0x02000) return CPUTYPE_POWER5; else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 From 317ff27cda58fbd06f195bea27cab2448b55a0ac Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 28 Aug 2020 10:42:54 -0500 Subject: [PATCH 393/593] POWER10: Avoid setting accumulators to zero in gemm kernels For the first iteration, it is better to use xvf*ger instead of xvf*gerpp builtins which helps to avoid setting accumulators to zero. This helps to reduce few instructions. 
--- kernel/power/dgemm_kernel_power10.c | 156 ++++++++++++--------- kernel/power/sgemm_kernel_power10.c | 204 +++++++++++++++++----------- 2 files changed, 222 insertions(+), 138 deletions(-) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index a0bc1a777..b2a29140e 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); rowC[0] += result[1] * alpha; #endif -#define SET_ACC_ZERO4() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); - -#define SET_ACC_ZERO8() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); - #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, PREFETCH1 (CO + ldc + ldc, 128); PREFETCH1 (CO + ldc + ldc + ldc, 128); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + for (l 
= 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, 
rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } @@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 4]; + rowA = (vec_t *) & AO[l << 4]; 
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 3]; + rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; 
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 1]; + rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE2x4_ACC (&acc0, 0); diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 81a5ec76b..9fbf84695 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -134,21 +134,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); -#define SET_ACC_ZERO4() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); - -#define SET_ACC_ZERO8() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz 
(&acc6); \ - __builtin_mma_xxsetaccz (&acc7); #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); @@ -249,8 +234,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; + vec_t *rowA1 = (vec_t *) & AO[0]; + vec_t *rowB1 = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]); + __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]); + __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]); + __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]); + __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]); + AO += 16; + BO += 8; + temp--; BLASLONG K = temp / 64; for (l = 0; l < K; l++) { @@ -454,12 +451,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); @@ -489,13 +491,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - 
__builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); } @@ -522,15 +526,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v2sf_t *rowC; v2sf_t result[8]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = AO[0], t[1] = AO[1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; - vec_t *rowA = (vec_t *) & t[0]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & t[0]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); } @@ -625,13 +632,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, FLOAT *A1; A1 = AO + (16 * k); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < k; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowA1 = (vec_t *) & A1[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); + 
__builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); + for (l = 1; l < k; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowA1 = (vec_t *) & A1[l << 4]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rowA1 = (vec_t *) & A1[l << 4]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -673,12 +690,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -710,13 +732,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; 
+ rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); } @@ -742,12 +766,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; __vector_quad acc0; v4sf_t result[4]; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE_ACC (&acc0, 0); @@ -771,14 +797,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v2sf_t *rowC; v2sf_t result[8]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = AO[0], t[1] = AO[1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; - vec_t *rowA = (vec_t *) & t[0]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & t[0]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE4x2_ACC (&acc0, 0); @@ -856,15 +885,26 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, FLOAT *A1; A1 = AO + (16 * k); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < k; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowA1 = (vec_t *) & A1[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger 
(&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); + for (l = 1; l < k; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowA1 = (vec_t *) & A1[l << 4]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 4]; + rowA1 = (vec_t *) & A1[l << 4]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -897,7 +937,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; #if defined(TRMMKERNEL) REFRESH_POINTERS (16, 2) @@ -905,12 +944,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BO = B; temp = k; #endif - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 4]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 4]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -934,8 +980,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t 
result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); #if defined(TRMMKERNEL) REFRESH_POINTERS (8, 2) #else @@ -943,12 +987,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, temp = k; #endif BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); } @@ -968,7 +1017,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); #if defined(TRMMKERNEL) REFRESH_POINTERS (4, 2) #else @@ -976,12 +1024,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, temp = k; #endif BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE2x4_ACC (&acc0, 0); From cb3c190a3a46057782fb518e81b51fc7909e01d8 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Fri, 21 Aug 2020 14:44:36 +0800 Subject: [PATCH 394/593] Implementation of dasum, sasum with AVX2 & AVX512 intrinsic ---
kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/dasum.c | 96 ++++++++++++++++++++++ kernel/x86_64/dasum_microk_haswell-2.c | 35 ++++++++ kernel/x86_64/dasum_microk_skylakex-2.c | 27 ++++++ kernel/x86_64/sasum.c | 104 ++++++++++++++++++++++++ kernel/x86_64/sasum_microk_haswell-2.c | 36 ++++++++ kernel/x86_64/sasum_microk_skylakex-2.c | 27 ++++++ 7 files changed, 327 insertions(+) create mode 100644 kernel/x86_64/dasum.c create mode 100644 kernel/x86_64/dasum_microk_haswell-2.c create mode 100644 kernel/x86_64/dasum_microk_skylakex-2.c create mode 100644 kernel/x86_64/sasum.c create mode 100644 kernel/x86_64/sasum_microk_haswell-2.c create mode 100644 kernel/x86_64/sasum_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ef8b36a57..b979fc0ae 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c new file mode 100644 index 000000000..31313416b --- /dev/null +++ b/kernel/x86_64/dasum.c @@ -0,0 +1,96 @@ +#include "common.h" +#include + +#define ABS fabs + +#if defined(SKYLAKEX) +#include "dasum_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "dasum_microk_haswell-2.c" +#endif + +#ifndef HAVE_KERNEL_16 +static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + 
sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + return sum0+sum1+sum2+sum3; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -16; + if ( n1 > 0 ) + { + + sumf = dasum_kernel_16(n1, x); + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} + diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c new file mode 100644 index 000000000..bf9d85e73 --- /dev/null +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -0,0 +1,35 @@ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); + for (; i < n; i += 16) { + accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + return half_accum0[0]; + +} +#endif diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c new file mode 100644 index 000000000..2c959b1ad --- 
/dev/null +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -0,0 +1,27 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#if defined(__AVX512CD__) +#define HAVE_KERNEL_16 1 + +#include + +static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + + __m512d accum_0, accum_1; + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + + for (; i < n; i += 16) { + accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0])); + accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8])); + } + + accum_0 += accum_1; + return _mm512_reduce_add_pd(accum_0); +} +#endif +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c new file mode 100644 index 000000000..601255546 --- /dev/null +++ b/kernel/x86_64/sasum.c @@ -0,0 +1,104 @@ +#include "common.h" +#include + +#if defined(DOUBLE) + +#error supports float only + +#else + +#define ABS fabsf + +#endif + +#if defined(SKYLAKEX) +#include "sasum_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "sasum_microk_haswell-2.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + return sum0+sum1+sum2+sum3; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if 
( n1 > 0 ) + { + + sumf = sasum_kernel_32(n1, x); + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c new file mode 100644 index 000000000..f46e76ebf --- /dev/null +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -0,0 +1,36 @@ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_32 1 + +#include + +static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); + + __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); + for (; i < n; i += 32) { + accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128 half_accum0; + half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); + + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + return half_accum0[0]; + +} +#endif diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c new file mode 100644 index 000000000..b1c49fd09 --- /dev/null +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -0,0 +1,27 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#if defined(__AVX512CD__) 
+#define HAVE_KERNEL_32 1 + +#include + +static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + + __m512 accum_0, accum_1; + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + + for (; i < n; i += 32) { + accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0])); + accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16])); + } + + accum_0 += accum_1; + return _mm512_reduce_add_ps(accum_0); +} +#endif +#endif From 448152cdd809c6ab16f1767660e2f4b5b3aa4ef6 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Mon, 31 Aug 2020 14:39:08 +0800 Subject: [PATCH 395/593] define __AVX2__ to ensure the haswell code compiled with avx2 --- kernel/x86_64/dasum_microk_haswell-2.c | 2 +- kernel/x86_64/sasum_microk_haswell-2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index bf9d85e73..7639dfd04 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,4 +1,4 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index f46e76ebf..b628729f5 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,4 +1,4 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) #define HAVE_KERNEL_32 1 From 5feb087c05beff18208c31b369d74dc3badeada3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Aug 2020 20:02:08 +0200 Subject: [PATCH 396/593] Handle Apple labeling armv8 as arm64 rather than aarch64 --- 
cmake/system_check.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 4382ffc4e..511a7c7d1 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -54,14 +54,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") - set(ARM 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(ARM64 1) else() set(ARM 1) endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(ARM 1) elseif (${CMAKE_CROSSCOMPILING}) if (${TARGET} STREQUAL "CORE2") if (NOT BINARY) From 3210a427345126112d3a1501d2ea8024aea861cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Aug 2020 20:03:21 +0200 Subject: [PATCH 397/593] Report cpu as ARMV8 instead of just giving up on non-Linux hosts --- cpuid_arm64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 6f41be604..1fd43148a 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -197,6 +197,8 @@ int detect(void) } +#else + return CPU_ARMV8; #endif return CPU_UNKNOWN; From f42e84d46c52f4ee1e05af8f365cd85de8a77b95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Sep 2020 10:44:48 +0200 Subject: [PATCH 398/593] Fix misnaming of LAPACK_?ggsvp function prototypes as LAPACKE_ (#2808) * Fix misnaming of LAPACK_?ggsvp and ?ggsvd function prototypes as LAPACKE_ * Drop the LAPACKE matrix_layout parameter from the argument lists, change ints to pointers and add missing work arguments. 
--- lapack-netlib/LAPACKE/include/lapack.h | 116 +++++++++++++------------ 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 36e53ec24..4f48b7c87 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3650,45 +3650,45 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACKE_sggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, float* a, - lapack_int lda, float* b, lapack_int ldb, - float* alpha, float* beta, float* u, lapack_int ldu, - float* v, lapack_int ldv, float* q, lapack_int ldq, - lapack_int* iwork ); + lapack_int* lda, float* b, lapack_int* ldb, + float* alpha, float* beta, float* u, lapack_int* ldu, + float* v, lapack_int* ldv, float* q, lapack_int* ldq, + float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACKE_dggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, double* a, - lapack_int lda, double* b, lapack_int ldb, + lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta, double* u, - lapack_int ldu, double* v, lapack_int ldv, double* q, - lapack_int ldq, lapack_int* iwork ); + lapack_int* ldu, double* v, lapack_int* ldv, double* q, + lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACKE_cggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, 
lapack_int p, +lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int lda, - lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* a, lapack_int* lda, + lapack_complex_float* b, lapack_int* ldb, float* alpha, float* beta, lapack_complex_float* u, - lapack_int ldu, lapack_complex_float* v, - lapack_int ldv, lapack_complex_float* q, - lapack_int ldq, lapack_int* iwork ); + lapack_int* ldu, lapack_complex_float* v, + lapack_int* ldv, lapack_complex_float* q, + lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACKE_zggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int lda, - lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* a, lapack_int* lda, + lapack_complex_double* b, lapack_int* ldb, double* alpha, double* beta, - lapack_complex_double* u, lapack_int ldu, - lapack_complex_double* v, lapack_int ldv, - lapack_complex_double* q, lapack_int ldq, - lapack_int* iwork ); + lapack_complex_double* u, lapack_int* ldu, + lapack_complex_double* v, lapack_int* ldv, + lapack_complex_double* q, lapack_int* ldq, + float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( @@ -3753,41 +3753,49 @@ void LAPACK_zggsvd3( lapack_int* info ); #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACKE_sggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, float* a, - lapack_int lda, float* b, lapack_int ldb, float tola, - float tolb, 
lapack_int* k, lapack_int* l, float* u, - lapack_int ldu, float* v, lapack_int ldv, float* q, - lapack_int ldq ); +lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, float* a, + lapack_int* lda, float* b, lapack_int* ldb, float* tola, + float* tolb, lapack_int* k, lapack_int* l, float* u, + lapack_int* ldu, float* v, lapack_int* ldv, float* q, + lapack_int* ldq, lapack_int* iwork, float* tau, + float* work, lapack_int* info); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACKE_dggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, double* a, - lapack_int lda, double* b, lapack_int ldb, - double tola, double tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int ldu, double* v, - lapack_int ldv, double* q, lapack_int ldq ); +lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, double* a, + lapack_int* lda, double* b, lapack_int* ldb, + double* tola, double* tolb, lapack_int* k, + lapack_int* l, double* u, lapack_int* ldu, double* v, + lapack_int* ldv, double* q, lapack_int* ldq, + lapack_int* iwork, double* tau, double* work, + lapack_int* info); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACKE_cggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, - lapack_complex_float* a, lapack_int lda, - lapack_complex_float* b, lapack_int ldb, float tola, - float tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int ldu, - lapack_complex_float* v, lapack_int ldv, - lapack_complex_float* q, lapack_int ldq ); +lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, + lapack_complex_float* a, lapack_int* lda, + lapack_complex_float* b, lapack_int* ldb, float* tola, + float* tolb, lapack_int* k, 
lapack_int* l, + lapack_complex_float* u, lapack_int* ldu, + lapack_complex_float* v, lapack_int* ldv, + lapack_complex_float* q, lapack_int* ldq, + lapack_int* iwork, lapack_int* rwork, + lapack_complex_float* tau, lapack_complex_float* work, + lapack_int* info); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACKE_zggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, - lapack_complex_double* a, lapack_int lda, - lapack_complex_double* b, lapack_int ldb, - double tola, double tolb, lapack_int* k, +lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, + lapack_complex_double* a, lapack_int* lda, + lapack_complex_double* b, lapack_int* ldb, + double* tola, double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, - lapack_int ldu, lapack_complex_double* v, - lapack_int ldv, lapack_complex_double* q, - lapack_int ldq ); + lapack_int* ldu, lapack_complex_double* v, + lapack_int* ldv, lapack_complex_double* q, + lapack_int* ldq, lapack_int* iwork, lapack_int* rwork, + lapack_complex_double* tau, lapack_complex_double* work, + lapack_int* info); #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) void LAPACK_cggsvp3( From 68b1713c300ac152d1efcb3c02f0c59fafcd39e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Sep 2020 17:19:14 +0200 Subject: [PATCH 399/593] Merge pull request #2811 from martin-frbg/issue2806 Make NO_AVX512 option override the AVX512 compile test in CMAKE builds as well --- cmake/system.cmake | 5 +++++ cmake/system_check.cmake | 2 ++ 2 files changed, 7 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index e3617c4e2..c0f3c6ed2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -110,6 +110,11 @@ if (NO_AVX2) set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2") endif () +if (NO_AVX512) + message(STATUS "Disabling Advanced Vector Extensions 512 (AVX512).") + 
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX512") +endif () + if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(GETARCH_FLAGS "${GETARCH_FLAGS} ${CMAKE_C_FLAGS_DEBUG}") endif () diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 511a7c7d1..d06f4779f 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -109,6 +109,7 @@ else() endif() if (X86_64 OR X86) +if (NOT NO_AVX512) file(WRITE ${PROJECT_BINARY_DIR}/avx512.c "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o ${PROJECT_BINARY_DIR}/avx512.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) @@ -116,6 +117,7 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.c" "avx512.o") endif() +endif() include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) From e4900caa1180e9b13766a97e708992f9df61b1a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Sep 2020 19:54:08 +0200 Subject: [PATCH 400/593] Fix c_check misinterpreting arm64 in uname output to mean armv7 additional fix for upcoming OSX on ARM64 related to #2804, as suggested by fxcoudert in #2805 --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 314c2b157..5ea93b75c 100644 --- a/c_check +++ b/c_check @@ -8,7 +8,7 @@ $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); $hostarch = `uname -p` if ($hostos eq "AIX"); $hostarch = "x86_64" if ($hostarch eq "amd64"); -$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); +$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "zarch" if ($hostarch eq "s390x"); From 60ef193258f580115640794e0c867ef45cb16974 Mon Sep 17 00:00:00 2001 From: Marius
Hillenbrand Date: Tue, 1 Sep 2020 13:59:06 +0200 Subject: [PATCH 401/593] s390x: use "lghi" for immediate values to fix build with clang Some of the kernels written in assembly utilize a "load address" instruction for loading an immediate value into a register. That is both unnecessarily complex and LLVM's assembler does not understand that specific syntax. Thus, replace with the appropriate "load immediate" instruction, which is also clearer to read. Signed-off-by: Marius Hillenbrand --- kernel/zarch/ctrmm4x4V.S | 18 +++++++++--------- kernel/zarch/gemm8x4V.S | 24 ++++++++++++------------ kernel/zarch/strmm8x4V.S | 24 ++++++++++++------------ kernel/zarch/ztrmm4x4V.S | 18 +++++++++--------- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/kernel/zarch/ctrmm4x4V.S b/kernel/zarch/ctrmm4x4V.S index c0e4df17d..123f2ead0 100644 --- a/kernel/zarch/ctrmm4x4V.S +++ b/kernel/zarch/ctrmm4x4V.S @@ -198,7 +198,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else - la LOCAL_VAR1,3(0,0) + lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store @@ -254,7 +254,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store @@ -305,7 +305,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store @@ -385,7 +385,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store @@ -442,7 +442,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store @@ -492,7 +492,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK 
/*refresh BK*/ #endif jz .L1x2_BK_Store @@ -568,7 +568,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store @@ -620,7 +620,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store @@ -670,7 +670,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store diff --git a/kernel/zarch/gemm8x4V.S b/kernel/zarch/gemm8x4V.S index 27fd5f57b..633e60ea6 100644 --- a/kernel/zarch/gemm8x4V.S +++ b/kernel/zarch/gemm8x4V.S @@ -147,7 +147,7 @@ brctg LOCAL_VAR1,.L8x4_4_BK ALIGN_4 .L8x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x4_BK_Store @@ -183,7 +183,7 @@ brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x4_BK_Store @@ -217,7 +217,7 @@ brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x4_BK_Store @@ -252,7 +252,7 @@ brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x4_BK_Store @@ -309,7 +309,7 @@ brctg LOCAL_VAR1,.L8x2_4_BK ALIGN_4 .L8x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x2_BK_Store @@ -346,7 +346,7 @@ brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x2_BK_Store @@ -380,7 +380,7 @@ brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x2_BK_Store @@ -415,7 +415,7 @@ brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK 
/*refresh BK*/ jz .L1x2_BK_Store @@ -471,7 +471,7 @@ brctg LOCAL_VAR1,.L8x1_4_BK ALIGN_4 .L8x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x1_BK_Store @@ -508,7 +508,7 @@ brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x1_BK_Store @@ -542,7 +542,7 @@ brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x1_BK_Store @@ -577,7 +577,7 @@ brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x1_BK_Store diff --git a/kernel/zarch/strmm8x4V.S b/kernel/zarch/strmm8x4V.S index f8e748167..e34a7a05a 100644 --- a/kernel/zarch/strmm8x4V.S +++ b/kernel/zarch/strmm8x4V.S @@ -186,7 +186,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x4_BK_Store @@ -239,7 +239,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else - la LOCAL_VAR1,3(0,0) + lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store @@ -290,7 +290,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store @@ -341,7 +341,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store @@ -423,7 +423,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x2_BK_Store @@ -475,7 +475,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store @@ -525,7 +525,7 @@ ALIGN_4 RefreshTempBk 
LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store @@ -575,7 +575,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store @@ -655,7 +655,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x1_BK_Store @@ -708,7 +708,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store @@ -757,7 +757,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store @@ -807,7 +807,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store diff --git a/kernel/zarch/ztrmm4x4V.S b/kernel/zarch/ztrmm4x4V.S index 52ee15f06..6fd7f2509 100644 --- a/kernel/zarch/ztrmm4x4V.S +++ b/kernel/zarch/ztrmm4x4V.S @@ -196,7 +196,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else - la LOCAL_VAR1,3(0,0) + lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store @@ -256,7 +256,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store @@ -307,7 +307,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store @@ -390,7 +390,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store @@ 
-447,7 +447,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store @@ -497,7 +497,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store @@ -573,7 +573,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store @@ -625,7 +625,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store @@ -675,7 +675,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store From a1616a0b8653fb06d607c5f8efafa01b0106dded Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 11:58:48 +0200 Subject: [PATCH 402/593] s390x: replace nop with "nop 0" in inline assembly ... as a bandaid for building with clang until LLVM's internal assembler supports nops without operand. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/dgemv_n_4.c | 6 +++--- kernel/zarch/dgemv_t_4.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/sgemv_n_4.c | 6 +++--- kernel/zarch/sgemv_t_4.c | 2 +- 16 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index 502ba837e..b2a3d1e8d 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -169,7 +169,7 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -274,7 +274,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -351,7 +351,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index de72a1798..30cec14f7 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -438,7 +438,7 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "agfi %%r1,32\n\t" "brctg 
%%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) dest) : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), [src] "a"(src),[n] "r"(n) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index a2546b812..2d5c48407 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -213,7 +213,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "ste %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 09654b742..1d51bb2c2 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -213,7 +213,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "ste %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b292c1d15..f9bfe3494 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -160,7 +160,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "std %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index f9a8119e1..b7ce70027 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -160,7 +160,7 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "std %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] 
"=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 8f283bc17..55471ce50 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -140,7 +140,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "std %%f0,%[max]\n\t" "vlgvg %[imax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index e4b7bb4fe..ec1c69822 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -140,7 +140,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "std %%f0,%[min]\n\t" "vlgvg %[imin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index ac86435d7..6ea46c716 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -204,7 +204,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { "ste %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 3f2d039eb..18cfa2a6e 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -204,7 +204,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { "ste %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : 
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 41172c1bd..be990b9d5 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -184,7 +184,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { "ste %%f0,%[max]\n\t" "vlgvg %[imax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index e2684df41..a27c8a743 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -184,7 +184,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { "ste %%f0,%[min]\n\t" "vlgvg %[imin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index daca1d6f7..cb299cb24 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -157,7 +157,7 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { "std %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9ababb91f..4dfa1a9db 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -157,7 +157,7 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { "std %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) 
x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index a1efef373..a0d522b83 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -159,7 +159,7 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -258,7 +258,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -331,7 +331,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 81d7c9fe7..81e600695 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -431,7 +431,7 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) dest) : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), [src] "a"(src),[n] "r"(n) From b9b3265ec8a78762263f54944e35c849013e0cab Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 12:04:28 +0200 Subject: [PATCH 403/593] s390x: avoid inline assembly for vector loads for clang ... 
since clang does not support the instruction format for inline assembly and also it is not required for current versions of clang. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index 741c09431..b7d7cc04b 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -172,7 +172,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { vector_float const *restrict addr = (vector_float const *restrict)a; vector_float y; -#if __GNUC__ < 9 +#if __GNUC__ < 9 && !defined(__clang__) // hex-encode vl %[out],%[addr],3 asm(".insn vrx,0xe70000003006,%[out],%[addr],3" : [ out ] "=v"(y) From 87e5bbd88795d09f4bec0691d33f91e8109eb424 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 12:08:05 +0200 Subject: [PATCH 404/593] s390x: avoid variable-length arrays in struct for asm operands ... since it is not required and clang does not support that gcc extension. Instead, use a variable-length array directly for these operands. Note that, while the actual inline assembly code does not directly use these memory operands, they serve to inform the compiler that it cannot reorder reads or writes to/from the input and output data across the inline asm statements. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/casum.c | 2 +- kernel/zarch/caxpy.c | 6 +++--- kernel/zarch/ccopy.c | 4 ++-- kernel/zarch/cdot.c | 6 +++--- kernel/zarch/cgemv_n_4.c | 30 +++++++++++++++--------------- kernel/zarch/cgemv_t_4.c | 32 ++++++++++++++++---------------- kernel/zarch/crot.c | 4 ++-- kernel/zarch/cscal.c | 14 +++++++------- kernel/zarch/csum.c | 2 +- kernel/zarch/cswap.c | 4 ++-- kernel/zarch/damax.c | 2 +- kernel/zarch/damax_z13.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/damin_z13.c | 2 +- kernel/zarch/dasum.c | 2 +- kernel/zarch/daxpy.c | 4 ++-- kernel/zarch/dcopy.c | 4 ++-- kernel/zarch/ddot.c | 4 ++-- kernel/zarch/dgemv_n_4.c | 24 ++++++++++++------------ kernel/zarch/dgemv_t_4.c | 28 ++++++++++++++-------------- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmax_z13.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/dmin_z13.c | 2 +- kernel/zarch/drot.c | 2 +- kernel/zarch/dscal.c | 4 ++-- kernel/zarch/dsdot.c | 4 ++-- kernel/zarch/dsum.c | 2 +- kernel/zarch/dswap.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/sasum.c | 2 +- kernel/zarch/saxpy.c | 4 ++-- kernel/zarch/scopy.c | 4 ++-- kernel/zarch/sdot.c | 4 ++-- kernel/zarch/sgemv_n_4.c | 24 ++++++++++++------------ kernel/zarch/sgemv_t_4.c | 28 ++++++++++++++-------------- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- kernel/zarch/srot.c | 2 +- kernel/zarch/sscal.c | 4 ++-- kernel/zarch/ssum.c | 2 +- kernel/zarch/sswap.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamax_z13.c | 2 +- kernel/zarch/zamin.c | 2 +- kernel/zarch/zamin_z13.c | 2 +- 
kernel/zarch/zasum.c | 2 +- kernel/zarch/zaxpy.c | 6 +++--- kernel/zarch/zcopy.c | 4 ++-- kernel/zarch/zdot.c | 6 +++--- kernel/zarch/zgemv_n_4.c | 30 +++++++++++++++--------------- kernel/zarch/zgemv_t_4.c | 32 ++++++++++++++++---------------- kernel/zarch/zrot.c | 4 ++-- kernel/zarch/zscal.c | 14 +++++++------- kernel/zarch/zsum.c | 2 +- kernel/zarch/zswap.c | 4 ++-- 71 files changed, 212 insertions(+), 212 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index b10ca4752..018a9a9c0 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -136,7 +136,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { "wfmaxsb %%v0,%%v0,%%v16,0\n\t" "ler %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 40945fae8..7b3b36630 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -136,7 +136,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { "wfminsb %%v0,%%v0,%%v16,0\n\t" "ler %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index e28f2018c..f3b9ed628 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -108,7 +108,7 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index 14a124ae2..c0a7a71f4 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -99,9 +99,9 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index 0a5e03992..9e08edc3b 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -36,9 +36,9 @@ static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y), [n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index d90f9c871..0d6dfbeb1 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -97,9 +97,9 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vstef %%v24,4(%[d]),1\n\t" "vstef %%v25,8(%[d]),1\n\t" "vstef %%v25,12(%[d]),0" - : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + 
: [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 5c36bc338..5fdf7717e 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -146,12 +146,12 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v0,0(%%r1,%[y])\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -238,10 +238,10 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v0,0(%%r1,%[y])\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] 
"a"(ap1), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -307,9 +307,9 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vst %%v0,0(%%r1,%[y])\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } @@ -350,8 +350,8 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vst %%v23,16(%%r1,%[dest])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src), [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index e10edfab0..2bdac9ea1 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -159,13 +159,13 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v19,%%v21,%%v23\n\t" "vst %%v22,0(%[y])\n\t" "vst %%v23,16(%[y])" - : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT 
x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -271,11 +271,11 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v20,%%v16,%%v18,%%v20\n\t" "vfmasb %%v20,%%v17,%%v19,%%v20\n\t" "vst %%v20,0(%[y])" - : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -361,10 +361,10 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v0,%%v16,%%v18,%%v0\n\t" "vfmasb %%v0,%%v17,%%v19,%%v0\n\t" "vsteg %%v0,0(%[y]),0" - : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[n * 
2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c index aab155f8b..5a0990f3d 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -169,8 +169,8 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 9fc54cf29..f9e89a452 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -80,8 +80,8 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", @@ -132,8 +132,8 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -171,8 +171,8 @@ static void 
cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -194,7 +194,7 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c index e9413da8e..b076501aa 100644 --- a/kernel/zarch/csum.c +++ b/kernel/zarch/csum.c @@ -90,7 +90,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 198994e18..f3ab77ab5 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -99,8 +99,8 @@ static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index caacb50dc..d19181cbe 100644 --- a/kernel/zarch/damax.c 
+++ b/kernel/zarch/damax.c @@ -76,7 +76,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "wfmaxdb %%v0,%%v0,%%v16,8\n\t" "lpdr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index f3db4c108..5bc0d1721 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -110,7 +110,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 0163a144b..4e0558af4 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -76,7 +76,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "wfmindb %%v0,%%v0,%%v16,8\n\t" "lpdr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 4196b2e15..a7efd4b26 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -110,7 +110,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", 
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index aa1382b10..9703cd3be 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -106,7 +106,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 5b0208c20..4e59ef7c6 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -100,8 +100,8 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), [alpha] "Q"(*alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index 691b90c64..3c546568f 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -36,8 +36,8 @@ static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index 9cad68f4b..c0ed8b72e 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -80,8 +80,8 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT 
*x, FLOAT *y) { "adbr %%f0,%%f1\n\t" "ldr %[dot],%%f0" : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index b2a3d1e8d..e1c5c4472 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -170,12 +170,12 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", @@ -275,10 +275,10 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] 
"a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", @@ -352,8 +352,8 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 30cec14f7..513cffe5a 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -173,12 +173,12 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v4,%%v3,1\n\t" "adbr %%f3,%%f4\n\t" "std %%f3,24(%[y])" - : "=m"(*(struct { FLOAT x[4]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -280,10 +280,10 @@ static void 
dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v2,%%v1,1\n\t" "adbr %%f1,%%f2\n\t" "std %%f1,8(%[y])" - : "=m"(*(struct { FLOAT x[2]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -360,8 +360,8 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "adbr %%f0,%%f1\n\t" "std %%f0,0(%[y])" : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -439,8 +439,8 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) dest) - : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const FLOAT (*)[n]) src), [src] "a"(src),[n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index cdc8d5d08..4b76e0dd6 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -73,7 +73,7 
@@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "wfmaxdb %%v0,%%v0,%%v16,0\n\t" "ldr %[max],%%f0" : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index c4e8d91f8..93acee2db 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -90,7 +90,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[max],%%f0" : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index f9b129cbd..21d55f323 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -73,7 +73,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "wfmindb %%v0,%%v0,%%v16,0\n\t" "ldr %[min],%%f0" : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 77f021c1d..7d2dae3fb 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -90,7 +90,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[min],%%f0" : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git 
a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 11fbe15b6..9d6d1a80d 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -169,7 +169,7 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index 2961eff20..a5a5e3468 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -59,7 +59,7 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x),[da] "Q"(da) : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -81,7 +81,7 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 5fa88c3b9..2952bcf42 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -112,8 +112,8 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "adbr %%f0,%%f1\n\t" "ldr %[dot],%%f0" : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff 
--git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c index 8d44873c0..69b9f9b41 100644 --- a/kernel/zarch/dsum.c +++ b/kernel/zarch/dsum.c @@ -88,7 +88,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index f0c9ded51..46cbbba23 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -99,7 +99,7 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 2d5c48407..459196d00 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -215,7 +215,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 1d51bb2c2..9bcf3646b 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -215,7 +215,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 
2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index f9bfe3494..0f53488d3 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -162,7 +162,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index b7ce70027..f48bde894 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -162,7 +162,7 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 55471ce50..1fdf1fa02 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -142,7 +142,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "2:\n\t" "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", 
"v31"); diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ec1c69822..282f26bbd 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -142,7 +142,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "2:\n\t" "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 6ea46c716..a30a96412 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -206,7 +206,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT(*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 18cfa2a6e..b29027ff4 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -206,7 +206,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index be990b9d5..3d751ff6b 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -186,7 +186,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { "2:\n\t" "nop 
0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index a27c8a743..e57c0bfa6 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -186,7 +186,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { "2:\n\t" "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index cb299cb24..fda76f471 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -159,7 +159,7 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 4dfa1a9db..412ab15ca 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -159,7 +159,7 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 
"v26", "v27"); diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index fdda6dd32..20da4406a 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -78,7 +78,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { "wfmaxsb %%v0,%%v0,%%v16,8\n\t" "lper %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index f05e851f9..e7e4fd9b7 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -78,7 +78,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { "wfminsb %%v0,%%v0,%%v16,8\n\t" "lper %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index d56f2697b..4cf74f351 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -108,7 +108,7 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index ca34a47ff..8bcb1a61b 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -100,8 +100,8 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) - : [y] "a"(y), 
"m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), [alpha] "Q"(*alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 5c453cfbb..631c9f929 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -36,8 +36,8 @@ static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index d870b30f0..d27c17162 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -84,8 +84,8 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "aebr %%f0,%%f3\n\t" "ler %[dot],%%f0" : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index a0d522b83..b4cfb61de 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -160,12 +160,12 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } 
*) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", @@ -259,10 +259,10 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", @@ -332,8 +332,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 81e600695..3c708200c 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -172,12 +172,12 @@ 
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v4,%%v3,1\n\t" "aebr %%f3,%%f4\n\t" "ste %%f3,12(%[y])" - : "=m"(*(struct { FLOAT x[4]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -278,10 +278,10 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v2,%%v1,1\n\t" "aebr %%f1,%%f2\n\t" "ste %%f1,4(%[y])" - : "=m"(*(struct { FLOAT x[2]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -357,8 +357,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "aebr %%f0,%%f1\n\t" "ste %%f0,0(%[y])" : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const 
struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -432,8 +432,8 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) dest) - : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const FLOAT (*)[n]) src), [src] "a"(src),[n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 7015aaa1d..0c7433cbc 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -75,7 +75,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { "wfmaxsb %%v0,%%v0,%%v16,0\n\t" "ler %[max],%%f0" : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT(*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index b6875c5c6..5e0f3860d 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -75,7 +75,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { "wfminsb %%v0,%%v0,%%v16,0\n\t" "ler %[min],%%f0" : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 4f471d866..c235adcbe 100644 --- 
a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -169,7 +169,7 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index 9b9930dc8..da2f49eaf 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -59,7 +59,7 @@ static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x),[da] "Q"(da) : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -81,7 +81,7 @@ static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c index 3f3f46a85..02aabdff6 100644 --- a/kernel/zarch/ssum.c +++ b/kernel/zarch/ssum.c @@ -91,7 +91,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index 0c62f189d..ec860765a 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -99,7 +99,7 @@ static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 
240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index aa04ab91f..98e40d073 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -114,7 +114,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { "wfmaxdb %%v0,%%v0,%%v16,0\n\t" "ldr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index 37278d6db..f727ad67a 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -123,7 +123,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 0b5402853..2e43fefd9 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -114,7 +114,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { "wfmindb %%v0,%%v0,%%v16,0\n\t" "ldr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index 
e37bb2236..e52802595 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -123,7 +123,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index aeef8d77e..0003f38a5 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -106,7 +106,7 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 9363ec32d..f2c115597 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -95,9 +95,9 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 5a46aec1c..d91d9f367 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c 
@@ -36,9 +36,9 @@ static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y), [n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index ac6e69c23..6b7144101 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -93,9 +93,9 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vsteg %%v24,8(%[d]),1\n\t" "vsteg %%v25,16(%[d]),1\n\t" "vsteg %%v25,24(%[d]),0" - : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 13045a359..2ef9b4de8 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -112,12 +112,12 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + 
"m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -172,10 +172,10 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); } @@ -210,9 +210,9 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } @@ -261,8 +261,8 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vst %%v31,48(%%r1,%[dest])\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] 
"a"(dest), "m"(*(const FLOAT (*)[n * 2]) src), [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 031c31e29..c10769266 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -141,13 +141,13 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vst %%v27,16(%[y])\n\t" "vst %%v28,32(%[y])\n\t" "vst %%v29,48(%[y])" - : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -229,11 +229,11 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" "vst %%v22,0(%[y])\n\t" "vst %%v23,16(%[y])\n\t" - : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } 
*) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -294,10 +294,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" "vst %%v0,0(%[y])\n\t" - : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); } diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 6284d5a47..3b87e356a 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -169,8 +169,8 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index e497a6d7b..a5a8f694d 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -78,8 +78,8 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst 
%%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", @@ -128,8 +128,8 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -167,8 +167,8 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -190,7 +190,7 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c index e0f978d87..b35832af8 100644 --- a/kernel/zarch/zsum.c +++ b/kernel/zarch/zsum.c @@ -89,7 +89,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) 
+ : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index bc466866c..7a2d1f882 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -99,8 +99,8 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", From 095f4e6964ba150b1293747d842a60294836be45 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 15:09:32 +0200 Subject: [PATCH 405/593] s390x: allow clang to emit fused multiply-adds (replicates gcc's default behavior) gcc's default setting for floating-point expression contraction is "fast", which allows the compiler to emit fused multiply adds instead of separate multiplies and adds (amongst others). Fused multiply-adds, which assembly kernels typically apply, also bring a significant performance advantage to the C implementation for matrix-matrix multiplication on s390x. To enable that performance advantage for builds with clang, add -ffp-contract=fast to the compiler options. 
Signed-off-by: Marius Hillenbrand --- Makefile.zarch | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.zarch b/Makefile.zarch index be1e34f6d..b841d9b4d 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -8,3 +8,9 @@ ifeq ($(CORE), Z14) CCOMMON_OPT += -march=z14 -mzvector -O3 FCOMMON_OPT += -march=z14 -mzvector endif + +# Enable floating-point expression contraction for clang, since it is the +# default for gcc +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += -ffp-contract=fast +endif From 2ee5b899ce9777c63710de1ede75c362db5bcd47 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 16:16:53 +0200 Subject: [PATCH 406/593] s390x: enable S/DGEMM block with explicit loop unrolling + interleaving with clang The code for SGEMM 16x4 and DGEMM 8x4 blocks on z14 and z15 uses explicit unrolling and interleaving to improve performance. The code employs an empty inline asm statement with operands that constrain the compiler's instruction scheduling and thereby enforce proper overlapping of load and compute phases. Fix an ifdef to apply that for clang builds, as well. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index b7d7cc04b..ef0b1d1e3 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -393,7 +393,7 @@ static inline void GEBP_block_16_4( * Note that we need to massage this particular "barrier" * depending on the gcc version. 
*/ -#if __GNUC__ > 7 +#if __GNUC__ > 7 || defined(__clang__) #define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ do { \ asm("" \ From 029fd01cfbcc0b18475faee8353585313c88a95b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:47:38 +0200 Subject: [PATCH 407/593] Detect AppleSilicon cpu on OSX --- cpuid_arm64.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 1fd43148a..df1be85ba 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -26,6 +26,11 @@ *****************************************************************************/ #include +#ifdef OS_DARWIN +#include +int32_t value; +size_t length=sizeof(value); +#endif #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 @@ -45,6 +50,8 @@ #define CPU_TSV110 9 // Ampere #define CPU_EMAG8180 10 +// Apple +#define CPU_SILICON 11 static char *cpuname[] = { "UNKNOWN", @@ -59,7 +66,8 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", - "THUNDERX3T110" + "THUNDERX3T110", + "SILICON" }; static char *cpuname_lower[] = { @@ -75,7 +83,8 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", - "thunderx3t110" + "thunderx3t110", + "silicon" }; int get_feature(char *search) @@ -198,6 +207,10 @@ int detect(void) } #else +#ifdef DARWIN + sysctlbyname("hw.cpufamily",&value,&length,NULL,0); + if (value ==131287967) return CPU_SILICON; +#endif return CPU_ARMV8; #endif @@ -247,7 +260,10 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif - +#ifdef DARWIN + sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); + printf("#define NUM_CORES %d\n",value); +#endif } @@ -398,6 +414,19 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; +#ifdef DARWIN + case CPU_SILICON: + printf("#define SILICON \n"); + sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); + printf("#define L1_CODE_SIZE %d \n",value); + 
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); + printf("#define L1_CODE_LINESIZE %d \n",value); + sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); + printf("#define L1_DATA_SIZE %d \n",value); + sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); + printf("#define L2_DATA_SIZE %d \n",value); + break; +#endif } get_cpucount(); } From b37d17382a092905bb7c2a263ad0ca269e53f541 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:48:49 +0200 Subject: [PATCH 408/593] Add Apple Silicon --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 5934f3012..de907bdb3 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -98,6 +98,7 @@ THUNDERX THUNDERX2T99 TSV110 THUNDERX3T110 +SILICON 9.System Z: ZARCH_GENERIC From 4a4d1ca6e025de7c88b8d18794775c4114168359 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:52:12 +0200 Subject: [PATCH 409/593] Add AppleSilicon cpu --- Makefile.arm64 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index 1091edfe5..78ba79aa0 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -66,6 +66,11 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +ifeq ($(CORE), SILICON) +CCOMMON_OPT += -march=armv8.3-a +FCOMMON_OPT += -march=armv8.3-a +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 From 80794fe8fd2a877cd0387ffc64b21a786ae449f6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:56:58 +0200 Subject: [PATCH 410/593] Create KERNEL.SILICON --- kernel/arm64/KERNEL.SILICON | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/arm64/KERNEL.SILICON diff --git a/kernel/arm64/KERNEL.SILICON b/kernel/arm64/KERNEL.SILICON new file mode 100644 index 000000000..e3efef1f5 --- /dev/null +++ b/kernel/arm64/KERNEL.SILICON @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8 From 0ce2aa3163fd2225c746cb5b8b1d82dc1a6fbceb Mon Sep 17
00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 23:41:51 +0200 Subject: [PATCH 411/593] Fix data type of rwork array --- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 91458136c..c5eca535e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -47,8 +47,8 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_complex_float* cwork = NULL; lapack_complex_float cwork_query; lapack_int lrwork = -1; - double* rwork = NULL; - double rwork_query; + float* rwork = NULL; + float rwork_query; lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); @@ -84,7 +84,7 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } - rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork ); + rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork ); if( rwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; From c31b72965ecf2b745c3a515f8abf889f9dd24473 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 23:44:44 +0200 Subject: [PATCH 412/593] Fix data type of work array in zgesvdq prototype --- lapack-netlib/LAPACKE/include/lapack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 4f48b7c87..c045892df 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -2513,7 +2513,7 @@ void LAPACK_zgesvdq( lapack_complex_double* U, lapack_int const* ldu, lapack_complex_double* V, lapack_int const* ldv, lapack_int* numrank, lapack_int* iwork, lapack_int const* liwork, - lapack_complex_float* cwork, lapack_int* lcwork, + 
lapack_complex_double* cwork, lapack_int* lcwork, double* rwork, lapack_int const* lrwork, lapack_int* info ); From 1b0f17eeed840d8e9642afd7d801259279d587cf Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Tue, 1 Sep 2020 15:41:48 +0800 Subject: [PATCH 413/593] align to 64, using SSE when input size is small --- kernel/x86_64/dasum.c | 140 ++++++++++------------- kernel/x86_64/dasum_microk_haswell-2.c | 91 +++++++++++---- kernel/x86_64/dasum_microk_skylakex-2.c | 79 ++++++++++--- kernel/x86_64/sasum.c | 146 +++++++++++------------- kernel/x86_64/sasum_microk_haswell-2.c | 88 ++++++++++---- kernel/x86_64/sasum_microk_skylakex-2.c | 72 +++++++++--- 6 files changed, 392 insertions(+), 224 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 31313416b..8a40ea4b9 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -1,7 +1,8 @@ #include "common.h" -#include -#define ABS fabs +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif #if defined(SKYLAKEX) #include "dasum_microk_skylakex-2.c" @@ -9,88 +10,73 @@ #include "dasum_microk_haswell-2.c" #endif -#ifndef HAVE_KERNEL_16 -static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +#ifndef HAVE_DASUM_KERNEL +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) { - BLASLONG i=0; - FLOAT *x = x1; - FLOAT temp0, temp1, temp2, temp3; - FLOAT temp4, temp5, temp6, temp7; - FLOAT sum0 = 0.0; - FLOAT sum1 = 0.0; - FLOAT sum2 = 0.0; - FLOAT sum3 = 0.0; - - while ( i< n ) - { - - temp0 = ABS(x[0]); - temp1 = ABS(x[1]); - temp2 = ABS(x[2]); - temp3 = ABS(x[3]); - temp4 = ABS(x[4]); - temp5 = ABS(x[5]); - temp6 = ABS(x[6]); - temp7 = ABS(x[7]); - - sum0 += temp0; - sum1 += temp1; - sum2 += temp2; - sum3 += temp3; - - sum0 += temp4; - sum1 += temp5; - sum2 += temp6; - sum3 += temp7; - - x+=8; - i+=8; - - } - - return sum0+sum1+sum2+sum3; + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT 
sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + } + + while (i < n) { + sum4 += ABS_K(x1[i]); + i++; + } + + return sum0+sum1+sum2+sum3+sum4; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) - { - - n1 = n & -16; - if ( n1 > 0 ) - { - - sumf = dasum_kernel_16(n1, x); - i=n1; - } - - while(i < n) - { - sumf += ABS(x[i]); - i++; - } - - } - else - { - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - - } - return(sumf); + BLASLONG i=0; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) { + sumf = dasum_kernel(n, x); + } + else { + n *= inc_x; + + while(i < n) { + sumf += ABS_K(x[i]); + i += inc_x; + } + } + return(sumf); } diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 7639dfd04..4fc73ddd4 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,35 +1,86 @@ #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) -#define HAVE_KERNEL_16 1 +#define HAVE_DASUM_KERNEL #include +#include -static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; - __m256d accum_0, accum_1, accum_2, accum_3; - - accum_0 = _mm256_setzero_pd(); - accum_1 = _mm256_setzero_pd(); - accum_2 = _mm256_setzero_pd(); - accum_3 = _mm256_setzero_pd(); - - __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); - for (; i < n; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask); + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3; + + for (i = 0; i < align_256; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_256; + x1 += align_256; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX2 = n&(~255); + + if (n >= 256) { + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); + for (i = 0; i < tail_index_AVX2; i += 16) { + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + sumf += half_accum0[0]; } + + if (n >= 8) { + __m128d accum_20, accum_21, accum_22, accum_23; + accum_20 = _mm_setzero_pd(); + accum_21 = 
_mm_setzero_pd(); + accum_22 = _mm_setzero_pd(); + accum_23 = _mm_setzero_pd(); - accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); + for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + } - __m128d half_accum0; - half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + accum_20 = accum_20 + accum_21 + accum_22 + accum_23; + __m128d half_accum20; + half_accum20 = _mm_hadd_pd(accum_20, accum_20); - half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + sumf += half_accum20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } - return half_accum0[0]; + return sumf; } #endif diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index 2c959b1ad..aea8c02d9 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -1,27 +1,80 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) -#if defined(__AVX512CD__) -#define HAVE_KERNEL_16 1 +#define HAVE_DASUM_KERNEL 1 #include -static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +#include + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; - __m512d accum_0, accum_1; + for (i = 0; i < align_512; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_512; + x1 += align_512; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX512 = n&(~255); - accum_0 = _mm512_setzero_pd(); - accum_1 = _mm512_setzero_pd(); + // + if ( n >= 256 ) { - for (; i < n; i += 16) { - accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0])); - accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8])); + __m512d accum_0, accum_1, accum_2, accum_3; + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + for (i = 0; i < tail_index_AVX512; i += 32) { + accum_0 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 0])); + accum_1 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 8])); + accum_2 += _mm512_abs_pd(_mm512_load_pd(&x1[i +16])); + accum_3 += _mm512_abs_pd(_mm512_load_pd(&x1[i +24])); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + sumf += _mm512_reduce_add_pd(accum_0); } - accum_0 += accum_1; - return _mm512_reduce_add_pd(accum_0); + if (n >= 8) { + __m128d accum_20, accum_21, accum_22, accum_23; + accum_20 = _mm_setzero_pd(); + accum_21 = _mm_setzero_pd(); + accum_22 = _mm_setzero_pd(); + accum_23 = _mm_setzero_pd(); + + __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); + for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + } + + accum_20 = accum_20 + accum_21 + accum_22 + accum_23; + __m128d half_accum20; + half_accum20 = 
_mm_hadd_pd(accum_20, accum_20); + + sumf += half_accum20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + + return sumf; } #endif -#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 601255546..36ec4a737 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -1,13 +1,11 @@ #include "common.h" -#include #if defined(DOUBLE) - #error supports float only - #else - -#define ABS fabsf +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif #endif @@ -17,88 +15,76 @@ #include "sasum_microk_haswell-2.c" #endif -#ifndef HAVE_KERNEL_32 +#ifndef HAVE_SASUM_KERNEL -static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) { - BLASLONG i=0; - FLOAT *x = x1; - FLOAT temp0, temp1, temp2, temp3; - FLOAT temp4, temp5, temp6, temp7; - FLOAT sum0 = 0.0; - FLOAT sum1 = 0.0; - FLOAT sum2 = 0.0; - FLOAT sum3 = 0.0; - - while ( i< n ) - { - - temp0 = ABS(x[0]); - temp1 = ABS(x[1]); - temp2 = ABS(x[2]); - temp3 = ABS(x[3]); - temp4 = ABS(x[4]); - temp5 = ABS(x[5]); - temp6 = ABS(x[6]); - temp7 = ABS(x[7]); - - sum0 += temp0; - sum1 += temp1; - sum2 += temp2; - sum3 += temp3; - - sum0 += temp4; - sum1 += temp5; - sum2 += temp6; - sum3 += temp7; - - x+=8; - i+=8; - - } - - return sum0+sum1+sum2+sum3; + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + while (i < n) { + sum4 += ABS_K(x1[i]); + i++; + } + + return 
sum0+sum1+sum2+sum3+sum4; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) - { - - n1 = n & -32; - if ( n1 > 0 ) - { - - sumf = sasum_kernel_32(n1, x); - i=n1; - } - - while(i < n) - { - sumf += ABS(x[i]); - i++; - } - - } - else - { - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - - } - return(sumf); + BLASLONG i=0; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) { + sumf = sasum_kernel(n, x); + } + else { + + n *= inc_x; + while(i < n) { + sumf += ABS_K(x[i]); + i += inc_x; + } + + } + return(sumf); } diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index b628729f5..8e6cb9a47 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,36 +1,82 @@ #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) -#define HAVE_KERNEL_32 1 +#define HAVE_SASUM_KERNEL 1 #include +#include -static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; - __m256 accum_0, accum_1, accum_2, accum_3; - - accum_0 = _mm256_setzero_ps(); - accum_1 = _mm256_setzero_ps(); - accum_2 = _mm256_setzero_ps(); - accum_3 = _mm256_setzero_ps(); - - __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); - for (; i < n; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask); + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7; + + for (i = 0; i < align_256; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_256; + x1 += align_256; } - accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX2 = n&(~255); + + if (n >= 256) { + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); - __m128 half_accum0; - half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); + __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); + for (i = 0; i < tail_index_AVX2; i += 32) { + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + } - half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); - half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + __m128 half_accum0; + half_accum0 = 
_mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); - return half_accum0[0]; + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + sumf += half_accum0[0]; + + } + + if (n >= 8) { + __m128 accum_20, accum_21; + accum_20 = _mm_setzero_ps(); + accum_21 = _mm_setzero_ps(); + + __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); + for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + } + + accum_20 += accum_21; + accum_20 = _mm_hadd_ps(accum_20, accum_20); + accum_20 = _mm_hadd_ps(accum_20, accum_20); + + sumf += accum_20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + return sumf; } #endif diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index b1c49fd09..c8c69d1e0 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -1,27 +1,73 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) -#if defined(__AVX512CD__) -#define HAVE_KERNEL_32 1 +#define HAVE_SASUM_KERNEL 1 + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif #include +#include -static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + for (i = 0; i < align_512; i++) { + sumf += ABS_K(x1[i]); + } + n -= align_512; + x1 += align_512; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX512 = n&(~255); - __m512 accum_0, accum_1; + if (n >= 256) { + __m512 accum_0, accum_1, accum_2, accum_3; + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); - accum_0 = _mm512_setzero_ps(); - accum_1 = _mm512_setzero_ps(); + for (i = 0; i < tail_index_AVX512; i += 64) { + accum_0 += _mm512_abs_ps(_mm512_load_ps(&x1[i + 0])); + accum_1 += _mm512_abs_ps(_mm512_load_ps(&x1[i +16])); + accum_2 += _mm512_abs_ps(_mm512_load_ps(&x1[i +32])); + accum_3 += _mm512_abs_ps(_mm512_load_ps(&x1[i +48])); + } - for (; i < n; i += 32) { - accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0])); - accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16])); + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + sumf += _mm512_reduce_add_ps(accum_0); } - accum_0 += accum_1; - return _mm512_reduce_add_ps(accum_0); + if (n >= 8) { + __m128 accum_20, accum_21; + accum_20 = _mm_setzero_ps(); + accum_21 = _mm_setzero_ps(); + + __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); + for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + } + + accum_20 += accum_21; + accum_20 = _mm_hadd_ps(accum_20, accum_20); + accum_20 = _mm_hadd_ps(accum_20, accum_20); + + sumf += accum_20[0]; + } + + for (i = tail_index_SSE; i < n; i++) { + sumf += ABS_K(x1[i]); + } + + return sumf; } #endif -#endif From 
17dca035de526d69d9639f16622e9aeda7cd7ffd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:38:08 +0200 Subject: [PATCH 414/593] rename SILICON to VORTEX --- Makefile.arm64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 78ba79aa0..62a877fff 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -66,7 +66,7 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif -ifeq ($(CORE), SILICON) +ifeq ($(CORE), VORTEX) CCOMMON_OPT += -march=armv8.3-a FCOMMON_OPT += -march=armv8.3-a endif From ea3a58c8442c9327b43e5cd2109865782759e6e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:38:53 +0200 Subject: [PATCH 415/593] Rename SILICON to VORTEX --- TargetList.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index de907bdb3..66eca4506 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -98,7 +98,7 @@ THUNDERX THUNDERX2T99 TSV110 THUNDERX3T110 -SILICON +VORTEX 9.System Z: ZARCH_GENERIC From af5bc955035ba1a590c3b8a72a403c414bee45ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:43:26 +0200 Subject: [PATCH 416/593] Rename SILICON to VORTEX and fix duplicate numbering --- cpuid_arm64.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index df1be85ba..a0d3e15b9 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -51,7 +51,7 @@ size_t length=sizeof(value); // Ampere #define CPU_EMAG8180 10 // Apple -#define CPU_SILICON 11 +#define CPU_VORTEX 13 static char *cpuname[] = { "UNKNOWN", @@ -67,7 +67,7 @@ static char *cpuname[] = { "EMAG8180", "NEOVERSEN1", "THUNDERX3T110", - "SILICON" + "VORTEX" }; static char *cpuname_lower[] = { @@ -84,7 +84,7 @@ static char *cpuname_lower[] = { "emag8180", "neoversen1", "thunderx3t110", - "silicon" + "vortex" }; int get_feature(char *search) @@ -209,7 +209,7 @@ int detect(void) #else #ifdef DARWIN 
sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_SILICON; + if (value ==131287967) return CPU_VORTEX; #endif return CPU_ARMV8; #endif @@ -415,8 +415,8 @@ void get_cpuconfig(void) printf("#define DTB_SIZE 4096 \n"); break; #ifdef DARWIN - case CPU_SILICON: - printf("#define SILICON \n"); + case CPU_VORTEX: + printf("#define VORTEX \n"); sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); printf("#define L1_CODE_SIZE %d \n",value); sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); From 775a87242d374e140fa784931a04bf01d4738e1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:44:20 +0200 Subject: [PATCH 417/593] Rename KERNEL.SILICON to KERNEL.VORTEX --- kernel/arm64/{KERNEL.SILICON => KERNEL.VORTEX} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/arm64/{KERNEL.SILICON => KERNEL.VORTEX} (100%) diff --git a/kernel/arm64/KERNEL.SILICON b/kernel/arm64/KERNEL.VORTEX similarity index 100% rename from kernel/arm64/KERNEL.SILICON rename to kernel/arm64/KERNEL.VORTEX From deaeb6c5b89f64bbe9d5ba0126690ae5d57ae0ce Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 27 Aug 2020 06:42:28 +0800 Subject: [PATCH 418/593] Add bfloat16 based dot and conversion with single/double 1. Added bfloat16 based dot as new API: shdot 2. Implemented generic kernel and cooperlake-specific (AVX512-BF16) kernel for shdot 3. Added 4 conversion APIs for bfloat16 data type <=> single/double: shstobf16 shdtobf16 sbf16tos dbf16tod shstobf16 -- convert single float array to bfloat16 array shdtobf16 -- convert double float array to bfloat16 array sbf16tos -- convert bfloat16 array to single float array dbf16tod -- convert bfloat16 array to double float array 4. Implemented generic kernels for all 4 conversion APIs, and cooperlake-specific kernel for shstobf16 and shdtobf16 5. Update level1 thread facilitate functions and macros to support multi-threading for these new APIs 6. 
Fix Cooperlake platform detection/specify issue when under dynamic-arch building 7. Change the typedef of bfloat16 from unsigned short to more strict uint16_t Signed-off-by: Chen, Guobing --- Makefile.tail | 7 +- cblas.h | 11 ++ cmake/kernel.cmake | 4 +- common.h | 3 +- common_interface.h | 5 + common_level1.h | 6 + common_macro.h | 6 + common_param.h | 7 +- common_sh.h | 12 ++ common_thread.h | 19 ++- common_x86_64.h | 23 +++ driver/others/blas_l1_thread.c | 74 ++++++++-- driver/others/blas_server.c | 93 ++++++++---- driver/others/blas_server_omp.c | 71 +++++++-- driver/others/blas_server_win32.c | 69 +++++++-- driver/others/dynamic.c | 44 +++++- exports/gensymbol | 4 +- interface/Makefile | 38 ++++- interface/bf16dot.c | 52 +++++++ interface/bf16to.c | 62 ++++++++ interface/tobf16.c | 61 ++++++++ kernel/Makefile.L1 | 36 +++++ kernel/setparam-ref.c | 4 +- kernel/x86_64/KERNEL | 12 ++ kernel/x86_64/bf16to.c | 114 +++++++++++++++ kernel/x86_64/dtobf16_microk_cooperlake.c | 104 +++++++++++++ kernel/x86_64/shdot.c | 115 +++++++++++++++ kernel/x86_64/shdot_microk_cooperlake.c | 159 ++++++++++++++++++++ kernel/x86_64/stobf16_microk_cooperlake.c | 86 +++++++++++ kernel/x86_64/tobf16.c | 170 ++++++++++++++++++++++ openblas_config_template.h | 3 +- 31 files changed, 1392 insertions(+), 82 deletions(-) create mode 100644 interface/bf16dot.c create mode 100644 interface/bf16to.c create mode 100644 interface/tobf16.c create mode 100644 kernel/x86_64/bf16to.c create mode 100644 kernel/x86_64/dtobf16_microk_cooperlake.c create mode 100644 kernel/x86_64/shdot.c create mode 100644 kernel/x86_64/shdot_microk_cooperlake.c create mode 100644 kernel/x86_64/stobf16_microk_cooperlake.c create mode 100644 kernel/x86_64/tobf16.c diff --git a/Makefile.tail b/Makefile.tail index 39902982b..cfc4a36fc 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -5,13 +5,14 @@ QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) ZBLASOBJS_P = 
$(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SHEXTOBJS_P = $(SHEXTOBJS:.$(SUFFIX)=.$(PSUFFIX)) COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -30,6 +31,7 @@ $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX +$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) @@ -38,6 +40,7 @@ $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SHEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ diff --git a/cblas.h b/cblas.h index 4bc5588d8..21f3958f2 100644 --- a/cblas.h +++ b/cblas.h @@ -382,6 +382,17 @@ void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, double *c, OPENBLAS_CONST blasint 
cldc); +/*** BFLOAT16 and INT8 extensions ***/ +/* convert float array to BFLOAT16 array by rounding */ +void cblas_shstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +/* convert double array to BFLOAT16 array by rounding */ +void cblas_shdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +/* convert BFLOAT16 array to float array */ +void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout); +/* convert BFLOAT16 array to double array */ +void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); +/* dot production of BFLOAT16 input arrays, and output as float */ +float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 4b505a102..79eeaae6f 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -126,12 +126,14 @@ if (BUILD_HALF) set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL ../arm/copy.c) - set(SHDOTKERNEL ../arm/dot.c) + set(SHDOTKERNEL ../x86_64/shdot.c) set(SHROTKERNEL ../arm/rot.c) set(SHSCALKERNEL ../arm/scal.c) set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL ../arm/sum.c) set(SHSWAPKERNEL ../arm/swap.c) + set(TOBF16KERNEL ../x86_64/tobf16.c) + set(BF16TOKERNEL ../x86_64/bf16to.c) endif () endmacro () diff --git a/common.h b/common.h index d6637abe4..adc162536 100644 --- a/common.h +++ b/common.h @@ -258,7 +258,8 @@ typedef unsigned long BLASULONG; #endif #ifndef BFLOAT16 -typedef unsigned short bfloat16; +#include +typedef uint16_t bfloat16; #define HALFCONVERSION 1 #endif diff --git 
a/common_interface.h b/common_interface.h index 78f5be6b0..35a957aa1 100644 --- a/common_interface.h +++ b/common_interface.h @@ -54,6 +54,11 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); +void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *); #ifdef RETURN_BY_STRUCT typedef struct { diff --git a/common_level1.h b/common_level1.h index 74cafb6db..88aa275a5 100644 --- a/common_level1.h +++ b/common_level1.h @@ -46,6 +46,12 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); + +void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); +void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); +void sbf16tos_k (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); +void dbf16tod_k (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8fe1f156f..3d6bcd9e8 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,6 +646,11 @@ #elif defined(HALF) +#define D_TO_BF16_K SHDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define S_TO_BF16_K 
SHSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K + #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K #define MAX_K SMAX_K @@ -657,6 +662,7 @@ #define ASUM_K SASUM_K #define DOTU_K SDOTU_K #define DOTC_K SDOTC_K +#define BF16_DOT_K SHDOT_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K diff --git a/common_param.h b/common_param.h index 0437482dc..a52de98ab 100644 --- a/common_param.h +++ b/common_param.h @@ -51,6 +51,11 @@ typedef struct { int shgemm_p, shgemm_q, shgemm_r; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); + void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); + void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); + float (*shamax_k) (BLASLONG, float *, BLASLONG); float (*shamin_k) (BLASLONG, float *, BLASLONG); float (*shmax_k) (BLASLONG, float *, BLASLONG); @@ -64,7 +69,7 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*shsum_k) (BLASLONG, float *, BLASLONG); int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); diff --git a/common_sh.h b/common_sh.h index 7a0045762..5dc99b3bd 100644 --- a/common_sh.h +++ b/common_sh.h @@ -3,6 +3,12 @@ #ifndef DYNAMIC_ARCH +#define SHDOT_K shdot_k +#define SHSTOBF16_K shstobf16_k +#define SHDTOBF16_K shdtobf16_k +#define SBF16TOS_K sbf16tos_k +#define DBF16TOD_K dbf16tod_k + #define SHGEMM_ONCOPY shgemm_oncopy #define SHGEMM_OTCOPY shgemm_otcopy @@ -18,6 +24,12 @@ #else +#define SHDOT_K gotoblas -> shdot_k +#define SHSTOBF16_K gotoblas -> 
shstobf16_k +#define SHDTOBF16_K gotoblas -> shdtobf16_k +#define SBF16TOS_K gotoblas -> sbf16tos_k +#define DBF16TOD_K gotoblas -> dbf16tod_k + #define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy #define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy #define SHGEMM_INCOPY gotoblas -> shgemm_incopy diff --git a/common_thread.h b/common_thread.h index ec0c65b22..a18df0d78 100644 --- a/common_thread.h +++ b/common_thread.h @@ -59,12 +59,19 @@ extern int blas_omp_linked; #define BLAS_PTHREAD 0x4000U #define BLAS_NODE 0x2000U -#define BLAS_PREC 0x0003U -#define BLAS_SINGLE 0x0000U -#define BLAS_DOUBLE 0x0001U -#define BLAS_XDOUBLE 0x0002U -#define BLAS_REAL 0x0000U -#define BLAS_COMPLEX 0x0004U +#define BLAS_PREC 0x000FU +#define BLAS_INT8 0x0000U +#define BLAS_BFLOAT16 0x0001U +#define BLAS_SINGLE 0x0002U +#define BLAS_DOUBLE 0x0003U +#define BLAS_XDOUBLE 0x0004U +#define BLAS_STOBF16 0x0008U +#define BLAS_DTOBF16 0x0009U +#define BLAS_BF16TOS 0x000AU +#define BLAS_BF16TOD 0x000BU + +#define BLAS_REAL 0x0000U +#define BLAS_COMPLEX 0x1000U #define BLAS_TRANSA 0x0030U /* 2bit */ #define BLAS_TRANSA_N 0x0000U diff --git a/common_x86_64.h b/common_x86_64.h index bee7e8cdb..b813336c6 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -142,6 +142,29 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ #endif } +static __inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, int *edx) +{ +#ifdef C_MSVC + int cpuInfo[4] = {-1}; + __cpuidex(cpuInfo, op, count); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +#else +#if defined(__i386__) && defined(__PIC__) + __asm__ __volatile__ + ("mov %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc"); +#else + __asm__ __volatile__ + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc"); +#endif +#endif +} + /* #define WHEREAMI */ diff --git 
a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index e405c7465..04acbcc5f 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -49,9 +49,36 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha blas_arg_t args [MAX_CPU_NUMBER]; BLASLONG i, width, astride, bstride; - int num_cpu, calc_type; - - calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; + int num_cpu, calc_type_a, calc_type_b; + + switch (mode & BLAS_PREC) { + case BLAS_INT8 : + case BLAS_BFLOAT16: + case BLAS_SINGLE : + case BLAS_DOUBLE : + case BLAS_XDOUBLE : + calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_STOBF16 : + calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_DTOBF16 : + calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOS : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOD : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0); + break; + default: + calc_type_a = calc_type_b = 0; + break; + } mode |= BLAS_LEGACY; @@ -77,8 +104,8 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha bstride = width; } - astride <<= calc_type; - bstride <<= calc_type; + astride <<= calc_type_a; + bstride <<= calc_type_b; args[num_cpu].m = width; args[num_cpu].n = n; @@ -120,9 +147,36 @@ int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASL blas_arg_t args [MAX_CPU_NUMBER]; BLASLONG i, width, astride, bstride; - int num_cpu, calc_type; - - calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; + int num_cpu, calc_type_a, calc_type_b; + + switch (mode & BLAS_PREC) { + case BLAS_INT8 : + case BLAS_BFLOAT16: + case BLAS_SINGLE : + case 
BLAS_DOUBLE : + case BLAS_XDOUBLE : + calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_STOBF16 : + calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_DTOBF16 : + calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOS : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOD : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0); + break; + default: + calc_type_a = calc_type_b = 0; + break; + } mode |= BLAS_LEGACY; @@ -148,8 +202,8 @@ int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASL bstride = width; } - astride <<= calc_type; - bstride <<= calc_type; + astride <<= calc_type_a; + bstride <<= calc_type_b; args[num_cpu].m = width; args[num_cpu].n = n; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 756e51b5d..8d3dda3bf 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -205,7 +205,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -216,21 +216,58 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, 
sb); - } else { - /* REAL / Single */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, - float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((float *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#ifdef BUILD_HALF + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ } } else { #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* COMPLEX / Extended 
Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -244,7 +281,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -256,7 +293,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, @@ -268,7 +305,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } + } else { + /* COMPLEX / Other types in future */ + } } } @@ -414,33 +453,37 @@ blas_queue_t *tscq; if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } + } else { + /* Other types in future */ + } } else { #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + 
GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } + } else { + /* Other types in future */ + } } queue->sb=sb; } diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index d9969b599..d126955e4 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -142,7 +142,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -155,7 +155,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -166,7 +166,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -177,10 +177,47 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); +#ifdef BUILD_HALF + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + void 
(*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ } } else { #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -194,7 +231,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -206,7 +243,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, 
BLASLONG, @@ -218,8 +255,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } - } + } else { + /* COMPLEX / Other types in future */ + } + } } static void exec_threads(blas_queue_t *queue, int buf_index){ @@ -255,32 +294,36 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE){ sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } else { #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } queue->sb=sb; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 5ecc4428b..d2cc91757 100644 --- a/driver/others/blas_server_win32.c +++ 
b/driver/others/blas_server_win32.c @@ -77,7 +77,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -90,7 +90,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -101,7 +101,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -112,10 +112,47 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); +#ifdef BUILD_HALF + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) 
== BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ } } else { #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -129,7 +166,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -141,7 +178,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, @@ -153,7 +190,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } + } else { + /* COMPLEX / Other types in future */ + } } } @@ -233,32 +272,36 @@ static DWORD WINAPI blas_thread_server(void *arg){ if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == 
BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } else { #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } queue->sb=sb; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5d71b1b2c..21d2c7948 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -207,6 +207,19 @@ extern gotoblas_t gotoblas_SKYLAKEX; #else #define gotoblas_SKYLAKEX gotoblas_PRESCOTT #endif +#ifdef DYN_COOPERLAKE +extern gotoblas_t gotoblas_COOPERLAKE; +#elif defined(DYN_SKYLAKEX) +#define gotoblas_COOPERLAKE gotoblas_SKYLAKEX +#elif defined(DYN_HASWELL) +#define gotoblas_COOPERLAKE gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_COOPERLAKE gotoblas_NEHALEM +#else +#define gotoblas_COOPERLAKE gotoblas_PRESCOTT +#endif #else // not DYNAMIC_LIST @@ -247,14 +260,17 @@ extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#define gotoblas_COOPERLAKE 
gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; +extern gotoblas_t gotoblas_COOPERLAKE; #else #define gotoblas_SKYLAKEX gotoblas_HASWELL +#define gotoblas_COOPERLAKE gotoblas_HASWELL #endif #endif #else @@ -262,6 +278,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM +#define gotoblas_COOPERLAKE gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA @@ -343,6 +360,23 @@ int support_avx512(){ #endif } +int support_avx512_bf16(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); + if((eax & 32) == 32){ + ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" @@ -524,7 +558,10 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } - if (model == 5) { + if (model == 5) { + // Intel Cooperlake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; // Intel Skylake X if (support_avx512()) return &gotoblas_SKYLAKEX; @@ -774,7 +811,8 @@ static char *corename[] = { "Steamroller", "Excavator", "Zen", - "SkylakeX" + "SkylakeX", + "Cooperlake" }; char *gotoblas_corename(void) { @@ -838,6 +876,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; + if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; return corename[0]; } @@ -868,6 +907,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 25: return (&gotoblas_COOPERLAKE); case 24: return (&gotoblas_SKYLAKEX); case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); diff --git a/exports/gensymbol b/exports/gensymbol index 73b4be248..ce4d9bb64 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -46,7 +46,7 @@ ssum, dsum, scsum, dzsum ); -@halfblasobjs = (shgemm); +@halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -84,7 +84,7 @@ cblas_xerbla ); -@halfcblasobjs = (cblas_shgemm); +@halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/interface/Makefile b/interface/Makefile index 2dbd60073..fde6227bc 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -47,7 +47,9 @@ SBLAS3OBJS = \ sgeadd.$(SUFFIX) ifeq ($(BUILD_HALF),1) +SHBLAS1OBJS = shdot.$(SUFFIX) SHBLAS3OBJS = shgemm.$(SUFFIX) +SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif 
DBLAS1OBJS = \ @@ -281,7 +283,9 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) ifeq ($(BUILD_HALF),1) +CSHBLAS1OBJS = cblas_shdot.$(SUFFIX) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -374,6 +378,7 @@ override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) +SHBLAS1OBJS += $(CSHBLAS1OBJS) SHBLAS3OBJS += $(CSHBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -385,10 +390,11 @@ ZBLAS1OBJS += $(CZBLAS1OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) +SHEXTOBJS += $(CSHEXTOBJS) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SHBLASOBJS = $(SHBLAS3OBJS) +SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -463,7 +469,7 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -491,7 +497,7 @@ endif clean :: @rm -f functable.h -level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) +level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) @@ -725,6 +731,19 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) +ifeq ($(BUILD_HALF),1) +shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) +shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) 
-DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +shdtobf16.$(SUFFIX) shdtobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +dbf16tod.$(SUFFIX) dbf16tod.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +endif + sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -1463,6 +1482,19 @@ cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +ifeq ($(BUILD_HALF),1) +cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +cblas_shdtobf16.$(SUFFIX) cblas_shdtobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +cblas_dbf16tod.$(SUFFIX) cblas_dbf16tod.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +endif + cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) diff --git a/interface/bf16dot.c b/interface/bf16dot.c new file mode 100644 index 000000000..33717e374 --- /dev/null +++ b/interface/bf16dot.c @@ -0,0 +1,52 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS +float NAME(blasint *N, bfloat16 *x, blasint *INCX, bfloat16 *y, blasint *INCY){ + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + float ret; + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + ret = 
BF16_DOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; + + return ret; + } + +#else + +float CNAME(blasint n, bfloat16 *x, blasint incx, bfloat16 *y, blasint incy){ + + float ret; + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + ret = BF16_DOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/bf16to.c b/interface/bf16to.c new file mode 100644 index 000000000..036c0b142 --- /dev/null +++ b/interface/bf16to.c @@ -0,0 +1,62 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#if defined(DOUBLE_PREC) +#define FLOAT_TYPE double +#elif defined(SINGLE_PREC) +#define FLOAT_TYPE float +#else +#endif + +#ifndef CBLAS +void NAME(blasint *N, bfloat16 *in, blasint *INC_IN, FLOAT_TYPE *out, blasint *INC_OUT){ + BLASLONG n = *N; + BLASLONG inc_in = *INC_IN; + BLASLONG inc_out = *INC_OUT; + + PRINT_DEBUG_NAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_BF16_TO_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_BF16_TO_K(n, in, inc_in, out, inc_out); +#else +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#else +void CNAME(blasint n, bfloat16 * in, blasint inc_in, FLOAT_TYPE * out, blasint inc_out){ + PRINT_DEBUG_CNAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_BF16_TO_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_BF16_TO_K(n, in, inc_in, out, inc_out); +#else +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#endif diff --git 
a/interface/tobf16.c b/interface/tobf16.c new file mode 100644 index 000000000..787d9d689 --- /dev/null +++ b/interface/tobf16.c @@ -0,0 +1,61 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#if defined(DOUBLE_PREC) +#define FLOAT_TYPE double +#elif defined(SINGLE_PREC) +#define FLOAT_TYPE float +#else +#endif + +#ifndef CBLAS +void NAME(blasint *N, FLOAT_TYPE *in, blasint *INC_IN, bfloat16 *out, blasint *INC_OUT){ + BLASLONG n = *N; + BLASLONG inc_in = *INC_IN; + BLASLONG inc_out = *INC_OUT; + + PRINT_DEBUG_NAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_TO_BF16_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_TO_BF16_K(n, in, inc_in, out, inc_out); +#else +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#else +void CNAME(blasint n, FLOAT_TYPE *in, blasint inc_in, bfloat16 *out, blasint inc_out){ + PRINT_DEBUG_CNAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_TO_BF16_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_TO_BF16_K(n, in, inc_in, out, inc_out); +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#endif diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 970703230..c6576ee07 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -262,6 +262,20 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif +ifeq ($(BUILD_HALF),1) +ifndef SHDOTKERNEL +SHDOTKERNEL = ../x86_64/shdot.c +endif + +ifndef TOBF16KERNEL +TOBF16KERNEL = ../x86_64/tobf16.c +endif + +ifndef BF16TOKERNEL +BF16TOKERNEL = ../x86_64/bf16to.c +endif +endif + ### NRM2 ### ifndef SNRM2KERNEL @@ -516,6 +530,15 @@ XBLASOBJS += \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) 
xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_HALF),1) +SHBLASOBJS += \ + shdot_k$(TSUFFIX).$(SUFFIX) +SHEXTOBJS += \ + shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX) +SHEXTOBJS += \ + sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) +endif + ### AMAX ### @@ -734,6 +757,19 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ +ifeq ($(BUILD_HALF),1) +$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) + $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ +$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) + $(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@ +$(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) + $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ +$(KDIR)dbf16tod_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) + $(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@ +endif + $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 582a1dc01..c43520310 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -62,9 +62,11 @@ gotoblas_t TABLE_NAME = { MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), #endif + shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, + samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, 
sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bb..4a2e13bed 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -146,6 +146,18 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif +ifndef SHDOTKERNEL +SHDOTKERNEL = shdot.c +endif + +ifndef TOBF16KERNEL +TOBF16KERNEL = tobf16.c +endif + +ifndef BF16TOKERNEL +BF16TOKERNEL = bf16to.c +endif + ifndef ISAMAXKERNEL ISAMAXKERNEL = iamax_sse.S endif diff --git a/kernel/x86_64/bf16to.c b/kernel/x86_64/bf16to.c new file mode 100644 index 000000000..fc6b5a529 --- /dev/null +++ b/kernel/x86_64/bf16to.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if defined(DOUBLE) +#define FLOAT_TYPE double +#elif defined(SINGLE) +#define FLOAT_TYPE float +#else +#endif + +/* Notes for algorithm: + * - Input denormal treated as zero + * - Force to be QNAN + */ +static void bf16to_kernel_1(BLASLONG n, const bfloat16 * in, BLASLONG inc_in, FLOAT_TYPE * out, BLASLONG inc_out) +{ + BLASLONG register index_in = 0; + BLASLONG register index_out = 0; + BLASLONG register index = 0; + uint16_t * tmp = NULL; +#if defined(DOUBLE) + float float_out = 0.0; +#endif + + while(index= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_TOBF16_ACCL_KERNEL 1 +#include "common.h" +#include + +static void tobf16_accl_kernel(BLASLONG n, const double * in, bfloat16 * out) +{ + /* Get the 64-bytes unaligned header number targeting for avx512 + * processing (Assume input float array is natural aligned) */ + int align_header = ((64 - ((uintptr_t)in & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (n < align_header) {align_header = n;} + + if (align_header != 0) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8-align_header)); + __m512d a = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &in[0]); + _mm_mask_storeu_epi16(&out[0], *((__mmask8*) &align_mask8), (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(a))); + } + + if (n == align_header) { + return; + } else { + n -= 
align_header; + in += align_header; + out += align_header; + } + + int tail_index_8 = n&(~7); + int tail_index_32 = n&(~31); + int tail_index_128 = n&(~127); + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n&7))); + + /* Processing the main chunk with 128-elements per round */ + for (int i = 0; i < tail_index_128; i += 128) { + // Fold 1 + __m512 data1_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+ 0]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+ 8])), 1); + __m512 data1_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+16]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+24])), 1); + _mm512_storeu_si512(&out[i+ 0], (__m512i) _mm512_cvtne2ps_pbh(data1_512_high, data1_512_low)); + + // Fold 2 + __m512 data2_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+32]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+40])), 1); + __m512 data2_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+48]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+56])), 1); + _mm512_storeu_si512(&out[i+32], (__m512i) _mm512_cvtne2ps_pbh(data2_512_high, data2_512_low)); + + // Fold 3 + __m512 data3_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+64]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+72])), 1); + __m512 data3_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+80]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+88])), 1); + _mm512_storeu_si512(&out[i+64], (__m512i) _mm512_cvtne2ps_pbh(data3_512_high, data3_512_low)); + + // Fold 4 + __m512 data4_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+96]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+104])), 1); + __m512 data4_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+112]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+120])), 1); + 
_mm512_storeu_si512(&out[i+96], (__m512i) _mm512_cvtne2ps_pbh(data4_512_high, data4_512_low)); + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (int j = tail_index_128; j < tail_index_32; j += 32) { + __m512 data1_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[j+ 0]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[j+ 8])), 1); + __m512 data1_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[j+16]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[j+24])), 1); + _mm512_storeu_si512(&out[j], (__m512i) _mm512_cvtne2ps_pbh(data1_512_high, data1_512_low)); + } + + /* Processing the remaining <32 chunk with 8-elements per round */ + for (int j = tail_index_32; j < tail_index_8; j += 8) { + _mm_storeu_si128((__m128i *)&out[j], (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(_mm512_load_pd(&in[j])))); + } + + /* Processing the remaining <8 chunk with masked processing */ + if ((n&7) > 0) { + __m512d data_512 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &in[tail_index_8]); + _mm_mask_storeu_epi16(&out[tail_index_8], *((__mmask8*) &tail_mask8), (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(data_512))); + } +} + +#endif diff --git a/kernel/x86_64/shdot.c b/kernel/x86_64/shdot.c new file mode 100644 index 000000000..5073fda2a --- /dev/null +++ b/kernel/x86_64/shdot.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(COOPERLAKE)
+#include "shdot_microk_cooperlake.c"
+#endif
+
+/* Number of elements converted per pass by shdot_compute_chunked(). */
+#define SHDOT_FALLBACK_CHUNK 256
+
+/*
+ * Allocation-free fallback for shdot_compute().
+ *
+ * Converts x and y to fp32 one fixed-size chunk at a time into stack buffers
+ * and adds up the SDOTU_K() result of every chunk.  Only used when the
+ * full-size scratch buffers cannot be allocated.
+ *
+ * NOTE(review): assumes SBF16TOS_K reads element i of its input at
+ * in[i * inc], so that chunk k starts at x + k * CHUNK * inc_x -- confirm
+ * against the bf16to kernel.
+ */
+static float shdot_compute_chunked(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
+{
+    float x_fp32[SHDOT_FALLBACK_CHUNK];
+    float y_fp32[SHDOT_FALLBACK_CHUNK];
+    float d = 0.0;
+
+    for (BLASLONG done = 0; done < n; done += SHDOT_FALLBACK_CHUNK) {
+        BLASLONG len = n - done;
+        if (len > SHDOT_FALLBACK_CHUNK) len = SHDOT_FALLBACK_CHUNK;
+
+        SBF16TOS_K(len, x + done * inc_x, inc_x, x_fp32, 1);
+        SBF16TOS_K(len, y + done * inc_y, inc_y, y_fp32, 1);
+
+        d += SDOTU_K(len, x_fp32, 1, y_fp32, 1);
+    }
+
+    return d;
+}
+
+/*
+ * Dot product of two bfloat16 vectors, returned as fp32.
+ *
+ * n            number of elements
+ * x, inc_x     first vector and its stride
+ * y, inc_y     second vector and its stride
+ *
+ * When an accelerated kernel was compiled in (HAVE_SHDOT_ACCL_KERNEL) and both
+ * strides are 1, the work is handed to shdot_accl_kernel().  Otherwise both
+ * inputs are widened to fp32 with SBF16TOS_K() and reduced with SDOTU_K().
+ */
+static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
+{
+    float d = 0.0;
+
+#ifdef HAVE_SHDOT_ACCL_KERNEL
+    if ((inc_x == 1) && (inc_y == 1)) {
+        return shdot_accl_kernel(n, x, y);
+    }
+#endif
+
+    float * x_fp32 = malloc(sizeof(float)*n);
+    float * y_fp32 = malloc(sizeof(float)*n);
+
+    /* Never hand a NULL scratch buffer to the conversion kernels: release
+     * whatever was obtained (free(NULL) is a no-op) and use the fixed-size
+     * stack-buffer path instead. */
+    if (x_fp32 == NULL || y_fp32 == NULL) {
+        free(x_fp32);
+        free(y_fp32);
+        return shdot_compute_chunked(n, x, inc_x, y, inc_y);
+    }
+
+    SBF16TOS_K(n, x, inc_x, x_fp32, 1);
+    SBF16TOS_K(n, y, inc_y, y_fp32, 1);
+
+    d = SDOTU_K(n, x_fp32, 1, y_fp32, 1);
+
+    free(x_fp32);
+    free(y_fp32);
+
+    return d;
+}
+
+#if defined(SMP)
+/* Worker entry point: computes one partial dot product and stores it through
+ * `result`.  The dummy parameters only pad out the callback signature. */
+static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2,
+                             bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y,
+                             float *result, BLASLONG dummy3)
+{
+    *(float *)result = shdot_compute(n, x, inc_x, y, inc_y);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
+                                                void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc,
+                                                int (*function)(), int nthreads);
+#endif
+
+/*
+ * Public entry point: dot product of two bfloat16 vectors.
+ *
+ * Returns 0.0 for n <= 0.  In SMP builds the vectors are split across threads
+ * when both strides are non-zero and n exceeds thread_thres; the thread count
+ * is additionally limited to roughly n / thread_thres.
+ */
+float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
+{
+    float dot_result = 0.0;
+
+    if (n <= 0) return 0.0;
+
+#if defined(SMP)
+    int nthreads;
+    int thread_thres = 40960;
+    bfloat16 dummy_alpha;
+
+    if (inc_x == 0 || inc_y == 0 || n <= thread_thres)
+        nthreads = 1;
+    else
+        nthreads = num_cpu_avail(1);
+
+    int best_threads = (int) (n/(float)thread_thres + 0.5);
+
+    if (best_threads < nthreads) {
+        nthreads = best_threads;
+    }
+
+    if (nthreads <= 1) {
+        dot_result = shdot_compute(n, x, inc_x, y, inc_y);
+    } else {
+        /* One slot of 2 * sizeof(double) bytes per thread, the same spacing the
+         * loop below walks.  Declared as double so the float reads are
+         * suitably aligned, and zero-filled so a slot that no worker wrote
+         * contributes nothing to the sum. */
+        double thread_result[MAX_CPU_NUMBER * 2] = {0};
+        int mode = BLAS_BFLOAT16 | BLAS_REAL;
+        blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
+                                             x, inc_x, y, inc_y, thread_result, 0,
+                                             (void *)shdot_thread_func, nthreads);
+        for (int i = 0; i < nthreads; i++) {
+            dot_result += *(float *)(thread_result + 2 * i);
+        }
+    }
+#else
+    dot_result = shdot_compute(n, x, inc_x, y, inc_y);
+#endif
+
+    return dot_result;
+}
diff --git a/kernel/x86_64/shdot_microk_cooperlake.c b/kernel/x86_64/shdot_microk_cooperlake.c
new file mode 100644
index 000000000..e645296f1
--- /dev/null
+++ b/kernel/x86_64/shdot_microk_cooperlake.c
@@ -0,0 +1,159 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SHDOT_ACCL_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+
+/*
+ * Dot product of two contiguous bfloat16 vectors using the AVX512-BF16
+ * dot-product instructions.
+ *
+ * n      number of elements (callers in this patch only pass n >= 1)
+ * x, y   unit-stride inputs; every full-width load is an unaligned load
+ *
+ * Returns the sum of x[i] * y[i] accumulated in fp32.
+ *
+ * n >= 128 runs four independent 512-bit accumulators; shorter inputs go
+ * through a 256-bit -> 128-bit cascade.  In both paths the final partial
+ * vector is read with a masked load, so nothing past x[n-1] / y[n-1] is
+ * touched.
+ */
+static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
+{
+    __m128 accum128 = _mm_setzero_ps();
+
+    if (n > 127) { /* n range from 128 to inf. */
+        /* BLASLONG rather than long: long is only 32 bits wide on LLP64 targets. */
+        BLASLONG tail_index_32  = n & (~31);
+        BLASLONG tail_index_128 = n & (~127);
+
+        __m512 accum512_0 = _mm512_setzero_ps();
+        __m512 accum512_1 = _mm512_setzero_ps();
+        __m512 accum512_2 = _mm512_setzero_ps();
+        __m512 accum512_3 = _mm512_setzero_ps();
+
+        /* Processing the main chunk with 128-elements per round */
+        for (BLASLONG i = 0; i < tail_index_128; i += 128) {
+            accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) _mm512_loadu_si512(&x[i+ 0]), (__m512bh) _mm512_loadu_si512(&y[i+ 0]));
+            accum512_1 = _mm512_dpbf16_ps(accum512_1, (__m512bh) _mm512_loadu_si512(&x[i+32]), (__m512bh) _mm512_loadu_si512(&y[i+32]));
+            accum512_2 = _mm512_dpbf16_ps(accum512_2, (__m512bh) _mm512_loadu_si512(&x[i+64]), (__m512bh) _mm512_loadu_si512(&y[i+64]));
+            accum512_3 = _mm512_dpbf16_ps(accum512_3, (__m512bh) _mm512_loadu_si512(&x[i+96]), (__m512bh) _mm512_loadu_si512(&y[i+96]));
+        }
+
+        /* Processing the remaining <128 chunk with 32-elements per round */
+        for (BLASLONG j = tail_index_128; j < tail_index_32; j += 32) {
+            accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) _mm512_loadu_si512(&x[j]), (__m512bh) _mm512_loadu_si512(&y[j]));
+        }
+
+        /* Processing the remaining <32 chunk with masked 32-elements processing.
+         * The mask is built only here, where (n & 31) is 1..31: shifting a
+         * 32-bit value by 32 (the n % 32 == 0 case) is undefined in C. */
+        if ((n & 31) != 0) {
+            __mmask32 tail_mask = (__mmask32) (0xffffffffu >> (32 - (n & 31)));
+            accum512_2 = _mm512_dpbf16_ps(accum512_2,
+                                          (__m512bh) _mm512_maskz_loadu_epi16(tail_mask, &x[tail_index_32]),
+                                          (__m512bh) _mm512_maskz_loadu_epi16(tail_mask, &y[tail_index_32]));
+        }
+
+        /* Accumulate the 4 registers into 1 register */
+        accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+        accum512_2 = _mm512_add_ps(accum512_2, accum512_3);
+        accum512_0 = _mm512_add_ps(accum512_0, accum512_2);
+
+        __m256 accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+        accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
+    } else { /* n range from 1 to 127 */
+        /* One cascade covers every short length: for n < 32 the loop runs zero
+         * times, and each later step is selected by the matching bit of n. */
+        BLASLONG tail_index_32 = n & (~31);
+        __m256 accum256   = _mm256_setzero_ps();
+        __m256 accum256_1 = _mm256_setzero_ps();
+
+        /* Processing <128 chunk with 32-elements per round */
+        for (BLASLONG j = 0; j < tail_index_32; j += 32) {
+            accum256   = _mm256_dpbf16_ps(accum256,   (__m256bh) _mm256_loadu_si256((const __m256i *) &x[j+ 0]), (__m256bh) _mm256_loadu_si256((const __m256i *) &y[j+ 0]));
+            accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((const __m256i *) &x[j+16]), (__m256bh) _mm256_loadu_si256((const __m256i *) &y[j+16]));
+        }
+        accum256 = _mm256_add_ps(accum256, accum256_1);
+
+        /* Processing the remaining <32 chunk with 16-elements processing */
+        if ((n & 16) != 0) {
+            accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((const __m256i *) &x[tail_index_32]), (__m256bh) _mm256_loadu_si256((const __m256i *) &y[tail_index_32]));
+        }
+        accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
+
+        /* Processing the remaining <16 chunk with 8-elements processing */
+        if ((n & 8) != 0) {
+            BLASLONG tail_index_16 = n & (~15);
+            accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((const __m128i *) &x[tail_index_16]), (__m128bh) _mm_loadu_si128((const __m128i *) &y[tail_index_16]));
+        }
+
+        /* Processing the remaining <8 chunk with masked 8-elements processing */
+        if ((n & 7) != 0) {
+            __mmask8 tail_mask = (__mmask8) (0xffu >> (8 - (n & 7)));
+            BLASLONG tail_index_8 = n & (~7);
+            accum128 = _mm_dpbf16_ps(accum128,
+                                     (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]),
+                                     (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8]));
+        }
+    }
+
+    /* Add up the 4 elements into lowest entry */
+    __m128 accum128_1 = _mm_shuffle_ps(accum128, accum128, 14);
+    accum128 = _mm_add_ps(accum128, accum128_1);
+    accum128_1 = _mm_shuffle_ps(accum128, accum128, 1);
+    accum128 = _mm_add_ps(accum128, accum128_1);
+
+    return _mm_cvtss_f32(accum128);
+}
+
+#endif
diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c
new file mode 100644
index 000000000..2756a6934
--- /dev/null
+++ b/kernel/x86_64/stobf16_microk_cooperlake.c
@@ -0,0 +1,86 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_TOBF16_ACCL_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+
+/*
+ * Convert n contiguous fp32 values to bfloat16 with the AVX512-BF16 convert
+ * instructions.
+ *
+ * n     number of elements
+ * in    fp32 input, unit stride; assumed to be naturally (4-byte) aligned,
+ *       because the bulk loops use aligned 64-byte loads once the unaligned
+ *       head has been peeled off
+ * out   bfloat16 output, unit stride; always written with unaligned stores
+ */
+static void tobf16_accl_kernel(BLASLONG n, const float * in, bfloat16 * out)
+{
+    /* Get the 64-bytes unaligned header number targeting for avx512
+     * processing (Assume input float array is natural aligned) */
+    int align_header = ((64 - ((uintptr_t)in & (uintptr_t)0x3f)) >> 2) & 0xf;
+
+    if (n < align_header) {align_header = n;}
+
+    if (align_header != 0) {
+        /* align_header is 1..15 here, so the shift count stays in range. */
+        __mmask16 align_mask16 = (__mmask16) (0xffffu >> (16 - align_header));
+        __m512 a = _mm512_maskz_loadu_ps(align_mask16, &in[0]);
+        _mm256_mask_storeu_epi16(&out[0], align_mask16, (__m256i) _mm512_cvtneps_pbh(a));
+    }
+
+    if (n == align_header) {
+        return;
+    }
+
+    n   -= align_header;
+    in  += align_header;
+    out += align_header;
+
+    /* BLASLONG indices: n may exceed what an int can hold. */
+    BLASLONG tail_index_32  = n & (~31);
+    BLASLONG tail_index_128 = n & (~127);
+    int tail = (int) (n & 31);   /* elements left after the last full group of 32 */
+    /* Covers the last (tail % 16) floats; 0 when tail is 0 or 16. */
+    __mmask16 tail_mask16 = (__mmask16) (0xffffu >> (16 - (tail & 15)));
+
+    /* Processing the main chunk with 128-elements per round */
+    for (BLASLONG i = 0; i < tail_index_128; i += 128) {
+        _mm512_storeu_si512(&out[i+ 0], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 16]), _mm512_load_ps(&in[i+ 0])));
+        _mm512_storeu_si512(&out[i+32], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 48]), _mm512_load_ps(&in[i+32])));
+        _mm512_storeu_si512(&out[i+64], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 80]), _mm512_load_ps(&in[i+64])));
+        _mm512_storeu_si512(&out[i+96], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+112]), _mm512_load_ps(&in[i+96])));
+    }
+
+    /* Processing the remaining <128 chunk with 32-elements per round */
+    for (BLASLONG j = tail_index_128; j < tail_index_32; j += 32) {
+        _mm512_storeu_si512(&out[j], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[j+ 16]), _mm512_load_ps(&in[j])));
+    }
+
+    /* Processing the remaining <32 chunk with masked processing */
+    if (tail > 15) {
+        /* The 32-lane store mask is built only here, where tail is 16..31:
+         * shifting a 32-bit value by 32 (the tail == 0 case) is undefined. */
+        __mmask32 tail_mask32 = (__mmask32) (0xffffffffu >> (32 - tail));
+        __m512 b = _mm512_load_ps(&in[tail_index_32]);
+        __m512 a = _mm512_maskz_load_ps(tail_mask16, &in[tail_index_32+16]);
+        _mm512_mask_storeu_epi16(&out[tail_index_32], tail_mask32, (__m512i) _mm512_cvtne2ps_pbh(a, b));
+    } else if (tail > 0) {
+        __m512 a = _mm512_maskz_load_ps(tail_mask16, &in[tail_index_32]);
+        _mm256_mask_storeu_epi16(&out[tail_index_32], tail_mask16, (__m256i) _mm512_cvtneps_pbh(a));
+    }
+}
+
+#endif
diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c
new file mode 100644
index 000000000..3d1796621
--- /dev/null
+++ b/kernel/x86_64/tobf16.c
@@ -0,0 +1,170 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#if defined(DOUBLE) +#define FLOAT_TYPE double +#elif defined(SINGLE) +#define FLOAT_TYPE float +#else +#endif + +#if defined(COOPERLAKE) +#if defined(DOUBLE) +#include "dtobf16_microk_cooperlake.c" +#elif defined(SINGLE) +#include "stobf16_microk_cooperlake.c" +#endif +#endif + +/* Notes for algorithm: + * - Round to Nearest Even used generally + * - QNAN for NAN case + * - Input denormals are treated as zero + */ +static void tobf16_generic_kernel(BLASLONG n, const FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out) +{ + BLASLONG register index_in = 0; + BLASLONG register index_out = 0; + BLASLONG register index = 0; + float float_in = 0.0; + uint32_t * uint32_in = (uint32_t *)(&float_in); + uint16_t * uint16_in = (uint16_t *)(&float_in); + + while(index> 16) & 0x1u) + 0x7fffu); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *(out+index_out) = uint16_in[1]; +#else + *(out+index_out) = uint16_in[0]; +#endif + break; + } + + index_in += inc_in; + index_out += inc_out; + index++; + } +} + +#ifndef HAVE_TOBF16_ACCL_KERNEL +static void tobf16_accl_kernel(BLASLONG n, const FLOAT_TYPE * in, bfloat16 * out) +{ + tobf16_generic_kernel(n, in, 1, out, 1); +} +#endif + +static void tobf16_compute(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out) +{ + if ((inc_in == 1) && (inc_out == 1)) { + tobf16_accl_kernel(n, in, out); + } else { + tobf16_generic_kernel(n, in, inc_in, out, inc_out); + } +} + +#if defined(SMP) +static int tobf16_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT_TYPE dummy2, + FLOAT_TYPE *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, + FLOAT_TYPE *dummy3, BLASLONG dummy4) +{ + tobf16_compute(n, x, inc_x, y, inc_y); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, 
BLASLONG ldc,
+                              int (*function)(), int nthreads);
+#endif
+
+/*
+ * Public entry point: convert n elements of `in` (stride inc_in) to bfloat16
+ * in `out` (stride inc_out).  Does nothing when n <= 0.
+ *
+ * In SMP builds, inputs longer than 100000 elements with non-zero strides are
+ * split across worker threads through blas_level1_thread(); everything else
+ * is converted on the calling thread.
+ */
+void CNAME(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out)
+{
+    if (n <= 0) return;
+
+#if defined(SMP)
+    int nthreads;
+    FLOAT_TYPE dummy_alpha;
+    FLOAT_TYPE dummy_c;
+
+    if (inc_in == 0 || inc_out == 0 || n <= 100000) {
+        nthreads = 1;
+    } else {
+        /* Preferred worker count for the problem size ... */
+        nthreads = (n/100000 < 100) ? 4 : 16;
+
+        /* ... but never ask for more workers than num_cpu_avail() reports,
+         * the same limit the bfloat16 dot-product entry point applies. */
+        int avail = num_cpu_avail(1);
+        if (nthreads > avail) {
+            nthreads = avail;
+        }
+    }
+
+    if (nthreads <= 1) {
+        tobf16_compute(n, in, inc_in, out, inc_out);
+    } else {
+#if defined(DOUBLE)
+        int mode = BLAS_REAL | BLAS_DTOBF16;
+#elif defined(SINGLE)
+        int mode = BLAS_REAL | BLAS_STOBF16;
+#endif
+        blas_level1_thread(mode, n, 0, 0, &dummy_alpha,
+                           in, inc_in, out, inc_out, &dummy_c, 0,
+                           (void *)tobf16_thread_func, nthreads);
+    }
+#else
+    tobf16_compute(n, in, inc_in, out, inc_out);
+#endif
+
+}
diff --git a/openblas_config_template.h b/openblas_config_template.h index 9955e5c73..858b8c5cb 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -35,7 +35,8 @@ typedef unsigned long BLASULONG; #endif #ifndef BFLOAT16 -typedef unsigned short bfloat16; +#include <stdint.h> +typedef uint16_t bfloat16; #endif #ifdef OPENBLAS_USE64BITINT
From 860247b5da58debb2082353a730f64049018bf35 Mon Sep 17 00:00:00 2001
From: "H.
Vetinari" Date: Wed, 2 Sep 2020 22:38:56 +0200 Subject: [PATCH 419/593] Follow-up to lapack#434 & lapack#409: fix signature mismatches --- lapack-netlib/LAPACKE/include/lapack.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index c045892df..9a8e1a218 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3665,7 +3665,7 @@ lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta, double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q, - lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info ); + lapack_int* ldq, double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, @@ -3676,7 +3676,7 @@ lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, float* alpha, float* beta, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, - lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info ); + lapack_int* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, @@ -3688,7 +3688,7 @@ lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq, - float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info ); + lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( 
@@ -3780,7 +3780,7 @@ lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq, - lapack_int* iwork, lapack_int* rwork, + lapack_int* iwork, float* rwork, lapack_complex_float* tau, lapack_complex_float* work, lapack_int* info); @@ -3793,7 +3793,7 @@ lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, lapack_int* l, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, - lapack_int* ldq, lapack_int* iwork, lapack_int* rwork, + lapack_int* ldq, lapack_int* iwork, double* rwork, lapack_complex_double* tau, lapack_complex_double* work, lapack_int* info); From 1c6c71fa853226073779aba4cc5c08a2ba22300c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 2 Sep 2020 22:41:50 +0200 Subject: [PATCH 420/593] Follow-up to lapack#434 & lapack#409: add missing 'const' in signatures Based on how the surrounding functions in lapack.h are handling the parameters, particularly the ?ggsv?3-variants of the affected functions --- lapack-netlib/LAPACKE/include/lapack.h | 80 +++++++++++++------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 9a8e1a218..f0af3795d 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3651,43 +3651,43 @@ void LAPACK_zggrqf( #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, float* a, - lapack_int* lda, float* b, lapack_int* ldb, - float* alpha, float* beta, float* u, lapack_int* ldu, - float* v, lapack_int* ldv, float* q, lapack_int* ldq, + lapack_int 
const* lda, float* b, lapack_int const* ldb, + float* alpha, float* beta, float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, float* q, lapack_int const* ldq, float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, double* a, - lapack_int* lda, double* b, lapack_int* ldb, + lapack_int const* lda, double* b, lapack_int const* ldb, double* alpha, double* beta, double* u, - lapack_int* ldu, double* v, lapack_int* ldv, double* q, - lapack_int* ldq, double* work, lapack_int* iwork, lapack_int* info ); + lapack_int const* ldu, double* v, lapack_int const* ldv, double* q, + lapack_int const* ldq, double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, float* alpha, float* beta, lapack_complex_float* u, - lapack_int* ldu, lapack_complex_float* v, - lapack_int* ldv, lapack_complex_float* q, - lapack_int* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); + lapack_int const* ldu, lapack_complex_float* v, + lapack_int const* ldv, lapack_complex_float* q, + lapack_int const* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, 
lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, double* alpha, double* beta, - lapack_complex_double* u, lapack_int* ldu, - lapack_complex_double* v, lapack_int* ldv, - lapack_complex_double* q, lapack_int* ldq, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) @@ -3754,46 +3754,46 @@ void LAPACK_zggsvd3( #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, float* a, - lapack_int* lda, float* b, lapack_int* ldb, float* tola, + lapack_int const* m, lapack_int const* p, lapack_int const* n, float* a, + lapack_int const* lda, float* b, lapack_int const* ldb, float* tola, float* tolb, lapack_int* k, lapack_int* l, float* u, - lapack_int* ldu, float* v, lapack_int* ldv, float* q, - lapack_int* ldq, lapack_int* iwork, float* tau, + lapack_int const* ldu, float* v, lapack_int const* ldv, float* q, + lapack_int const* ldq, lapack_int* iwork, float* tau, float* work, lapack_int* info); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, double* a, - lapack_int* lda, double* b, lapack_int* ldb, + lapack_int const* m, lapack_int const* p, lapack_int const* n, double* a, + lapack_int const* lda, double* b, lapack_int const* ldb, double* tola, double* tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int* ldu, double* v, - 
lapack_int* ldv, double* q, lapack_int* ldq, + lapack_int* l, double* u, lapack_int const* ldu, double* v, + lapack_int const* ldv, double* q, lapack_int const* ldq, lapack_int* iwork, double* tau, double* work, lapack_int* info); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, float* tola, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, float* tola, float* tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int* ldu, - lapack_complex_float* v, lapack_int* ldv, - lapack_complex_float* q, lapack_int* ldq, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, lapack_int* iwork, float* rwork, lapack_complex_float* tau, lapack_complex_float* work, lapack_int* info); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, double* tola, double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, - lapack_int* ldu, lapack_complex_double* v, - lapack_int* ldv, lapack_complex_double* q, - lapack_int* ldq, lapack_int* iwork, double* rwork, + lapack_int const* ldu, lapack_complex_double* v, + lapack_int const* ldv, lapack_complex_double* q, + lapack_int const* ldq, lapack_int* iwork, double* rwork, lapack_complex_double* tau, lapack_complex_double* work, 
lapack_int* info); From 3426519ae2e4210dc6088b484ce7b8f1abd1d38d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 2 Sep 2020 22:46:47 +0200 Subject: [PATCH 421/593] adapt ?ggsv?-functions to ambient code style in LAPACKE/include/lapack.h --- lapack-netlib/LAPACKE/include/lapack.h | 162 ++++++++++++++----------- 1 file changed, 92 insertions(+), 70 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index f0af3795d..aedaa308d 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3650,45 +3650,58 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, float* a, - lapack_int const* lda, float* b, lapack_int const* ldb, - float* alpha, float* beta, float* u, lapack_int const* ldu, - float* v, lapack_int const* ldv, float* q, lapack_int const* ldq, - float* work, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_sggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + float* a, lapack_int const* lda, + float* b, lapack_int const* ldb, + float* alpha, float* beta, + float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, + float* q, lapack_int const* ldq, + float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, double* a, - lapack_int const* lda, double* b, lapack_int const* ldb, - double* alpha, double* beta, double* u, - lapack_int const* ldu, double* v, lapack_int const* ldv, double* q, - lapack_int 
const* ldq, double* work, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_dggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + double* a, lapack_int const* lda, + double* b, lapack_int const* ldb, + double* alpha, double* beta, + double* u, lapack_int const* ldu, + double* v, lapack_int const* ldv, + double* q, lapack_int const* ldq, + double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int const* lda, - lapack_complex_float* b, lapack_int const* ldb, - float* alpha, float* beta, lapack_complex_float* u, - lapack_int const* ldu, lapack_complex_float* v, - lapack_int const* ldv, lapack_complex_float* q, - lapack_int const* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_cggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, + float* alpha, float* beta, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, + lapack_complex_float* work, float* rwork, + lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int const* lda, - lapack_complex_double* b, lapack_int const* ldb, - double* alpha, double* 
beta, - lapack_complex_double* u, lapack_int const* ldu, - lapack_complex_double* v, lapack_int const* ldv, - lapack_complex_double* q, lapack_int const* ldq, - lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_zggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, + double* alpha, double* beta, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, + lapack_complex_double* work, double* rwork, + lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( @@ -3753,49 +3766,58 @@ void LAPACK_zggsvd3( lapack_int* info ); #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, float* a, - lapack_int const* lda, float* b, lapack_int const* ldb, float* tola, - float* tolb, lapack_int* k, lapack_int* l, float* u, - lapack_int const* ldu, float* v, lapack_int const* ldv, float* q, - lapack_int const* ldq, lapack_int* iwork, float* tau, - float* work, lapack_int* info); +lapack_int LAPACK_sggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + float* a, lapack_int const* lda, + float* b, lapack_int const* ldb, + float* tola, float* tolb, + lapack_int* k, lapack_int* l, + float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, + float* q, lapack_int const* ldq, + lapack_int* iwork, float* tau, + float* work, lapack_int* info ); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char 
const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, double* a, - lapack_int const* lda, double* b, lapack_int const* ldb, - double* tola, double* tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int const* ldu, double* v, - lapack_int const* ldv, double* q, lapack_int const* ldq, - lapack_int* iwork, double* tau, double* work, - lapack_int* info); +lapack_int LAPACK_dggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + double* a, lapack_int const* lda, + double* b, lapack_int const* ldb, + double* tola, double* tolb, + lapack_int* k, lapack_int* l, + double* u, lapack_int const* ldu, + double* v, lapack_int const* ldv, + double* q, lapack_int const* ldq, + lapack_int* iwork, double* tau, + double* work, lapack_int* info ); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, - lapack_complex_float* a, lapack_int const* lda, - lapack_complex_float* b, lapack_int const* ldb, float* tola, - float* tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int const* ldu, - lapack_complex_float* v, lapack_int const* ldv, - lapack_complex_float* q, lapack_int const* ldq, - lapack_int* iwork, float* rwork, - lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* info); +lapack_int LAPACK_cggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, + float* tola, float* tolb, lapack_int* k, lapack_int* l, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, + lapack_int* iwork, float* rwork, lapack_complex_float* tau, + 
lapack_complex_float* work, lapack_int* info ); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, - lapack_complex_double* a, lapack_int const* lda, - lapack_complex_double* b, lapack_int const* ldb, - double* tola, double* tolb, lapack_int* k, - lapack_int* l, lapack_complex_double* u, - lapack_int const* ldu, lapack_complex_double* v, - lapack_int const* ldv, lapack_complex_double* q, - lapack_int const* ldq, lapack_int* iwork, double* rwork, - lapack_complex_double* tau, lapack_complex_double* work, - lapack_int* info); +lapack_int LAPACK_zggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, + double* tola, double* tolb, lapack_int* k, lapack_int* l, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, + lapack_int* iwork, double* rwork, lapack_complex_double* tau, + lapack_complex_double* work, lapack_int* info ); #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) void LAPACK_cggsvp3( From 718f67421aaf83fb33722e4267a2be40185f63de Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 4 Sep 2020 10:36:19 -0500 Subject: [PATCH 422/593] POWER9: Fix mcpu option with clang Adding check for compiler type before checking GCC version in Makefile. This allows clang to use power9 instead of power8 when CORE is POWER9. 
--- Makefile.power | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.power b/Makefile.power index 37a02d692..e766f8499 100644 --- a/Makefile.power +++ b/Makefile.power @@ -17,6 +17,7 @@ endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) CCOMMON_OPT += -mcpu=power8 -mtune=power8 @@ -24,10 +25,14 @@ else CCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else +CCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) FCOMMON_OPT += -mcpu=power8 -mtune=power8 @@ -35,6 +40,9 @@ else FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else +FCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else FCOMMON_OPT += -O2 -Mrecursive endif endif From 330044d82147a9a08fd10d503fec7f406cde2861 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Sep 2020 09:44:33 +0200 Subject: [PATCH 423/593] Fix potential domain error in sqrt --- driver/level3/level3_syrk_threaded.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index a041abac3..d7dcd68a3 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -526,7 +526,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG width, i, j, k; BLASLONG n, n_from, n_to; int mode, mask; - double dnum; + double dnum, di, dinum; if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); @@ -601,9 +601,14 @@ 
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (nthreads - num_cpu > 1) { - double di = (double)i; + di = (double)i; - width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) ); + dinum = di * di + dnum; + + if (dinum > 0) + width = (((BLASLONG)((sqrt(dinum) - di) + mask)/(mask+1)) * (mask+1) ); + else + width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1) ); if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) ); @@ -643,10 +648,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (nthreads - num_cpu > 1) { - double di = (double)i; + di = (double)i; - width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + dinum = di * di +dnum; + if (dinum > 0) + width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + else + width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1)); + if ((width > n - i) || (width < mask)) width = n - i; } else { From 8a2a137a9e4e4ec657c5befe361061607489aaa2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Sep 2020 13:06:31 +0200 Subject: [PATCH 424/593] Correct argument to SLASET (Improves fix from PR2778) as explained by serguei-patchkovskii in Reference-LAPACK/lapack#438 (comment) , passing in an index of 1 instead of N leads to a standards violation accessing matrix A in SLASET, i.e. undefined behavior --- lapack-netlib/TESTING/EIG/cchkhb2stg.f | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index cd884febf..100f133ab 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -680,8 +680,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. 
* - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From 6f8fad87c5d272f3e01853906be0269d9b96b30a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Sep 2020 19:44:01 +0200 Subject: [PATCH 425/593] Use POSIX2001 clock_gettime for higher resolution --- benchmark/asum.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/benchmark/asum.c b/benchmark/asum.c index 78ccdf47b..e3d16acfd 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -128,8 +128,13 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; double time1,timeg; +#else + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; +#endif argc--;argv++; @@ -160,26 +165,30 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); - for (l=0; l1) timeg /= loops; #ifdef COMPLEX From 7d9c77f421fd662f8e103f6fae8adefc49e42078 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Sep 2020 22:03:46 +0200 Subject: [PATCH 426/593] Correct dimension argument to xLASET from Reference-LAPACK PR 438 --- lapack-netlib/TESTING/EIG/cchkst2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/dchksb2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/dchkst2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/zchkhb2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/zchkst2stg.f | 8 
++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f index 5c478577f..8c7f962b7 100644 --- a/lapack-netlib/TESTING/EIG/cchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f @@ -1014,8 +1014,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( 'U', N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -1048,8 +1048,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( 'L', N, N, A, LDA, V, LDU ) CALL CHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, $ WORK, LH, WORK( LH+1 ), LW, IINFO ) diff --git a/lapack-netlib/TESTING/EIG/dchksb2stg.f b/lapack-netlib/TESTING/EIG/dchksb2stg.f index ee66f7ebb..88f6e18d3 100644 --- a/lapack-netlib/TESTING/EIG/dchksb2stg.f +++ b/lapack-netlib/TESTING/EIG/dchksb2stg.f @@ -670,8 +670,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -743,8 +743,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. 
* - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f index ca31c9d1f..7115175c2 100644 --- a/lapack-netlib/TESTING/EIG/dchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f @@ -999,8 +999,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( "U", N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -1032,8 +1032,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( "L", N, N, A, LDA, V, LDU ) CALL DSYTRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, $ WORK, LH, WORK( LH+1 ), LW, IINFO ) diff --git a/lapack-netlib/TESTING/EIG/zchkhb2stg.f b/lapack-netlib/TESTING/EIG/zchkhb2stg.f index dbbb84348..05434e4e3 100644 --- a/lapack-netlib/TESTING/EIG/zchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkhb2stg.f @@ -680,8 +680,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ * the one from above. Compare it with D1 computed * using the DSBTRD. 
* - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f index 167e5f359..4eadca4f3 100644 --- a/lapack-netlib/TESTING/EIG/zchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f @@ -1014,8 +1014,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( 'U', N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -1048,8 +1048,8 @@ * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( 'L', N, N, A, LDA, V, LDU ) CALL ZHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, $ WORK, LH, WORK( LH+1 ), LW, IINFO ) From 0629d8ebdb98995b995ac4593c98f7721703c8fc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Fri, 4 Sep 2020 16:32:45 +0200 Subject: [PATCH 427/593] s390x/DYNAMIC_ARCH: generalize detecting supported archs for clang Simplify detection of which kernels we can compile on s390x. Instead of decoding the gcc version in a complicated manner, just check if CC supports a given -march=archXY flag. Together with the next patch, we thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1. 
Signed-off-by: Marius Hillenbrand --- Makefile.system | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/Makefile.system b/Makefile.system index e7d3dc4ce..f4a42f729 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,7 +295,6 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -594,34 +593,34 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -ifeq ($(GCCVERSIONGT5), 1) - ZARCH_SUPPORT_Z13 := 1 -else ifeq ($(GCCVERSIONEQ5), 1) -ifeq ($(GCCMINORVERSIONGTEQ2), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(ZARCH_SUPPORT_Z13), 1) +# if the compiler accepts -march=arch11 or -march=z13 and can compile a file +# with z13-specific inline assembly, then we can include support for Z13. +# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases +# only support one or the other. +# note: LLVM version 6.x supported -march=z13 yet could not handle vector +# registers in inline assembly, so the check for supporting the -march flag is +# not enough. 
+ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null +ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) + +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 else -$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif -ifeq ($(GCCVERSIONGTEQ7), 1) +# as above for z13, check for -march=arch12 and z14 support in the compiler. +ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 else -$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) -endif +$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif +endif # ARCH zarch + ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 From 4f34bcfb5e2da40ffe02c9f0765b9f4e18e8f6f5 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 7 Sep 2020 17:04:03 +0200 Subject: [PATCH 428/593] s390x/DYNAMIC_ARCH: pass supported arch levels from Makefile to run-time code ... instead of duplicating the (old) mechanism from the Makefile that aimed to derive supported architecture generations from the gcc version. To enable builds with DYNAMIC_ARCH with older compiler releases, the Makefile and driver/others/dynamic_zarch.c need a common view of the architecture support built into the library. We follow the notation from x86 when used with DYNAMIC_LIST, where defines DYN_<ARCH> denote support for a given generation to be built in. 
Since there are far fewer architecture generations in OpenBLAS for s390x, that does not bloat command lines too much. Signed-off-by: Marius Hillenbrand --- Makefile.system | 2 ++ driver/others/dynamic_zarch.c | 48 ++++++++++++++++------------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/Makefile.system b/Makefile.system index f4a42f729..1b832ba41 100644 --- a/Makefile.system +++ b/Makefile.system @@ -606,6 +606,7 @@ ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 +CCOMMON_OPT += -DDYN_Z13 else $(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif @@ -615,6 +616,7 @@ ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && ec ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 +CCOMMON_OPT += -DDYN_Z14 else $(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 403b34111..dac8909fb 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,18 +1,6 @@ #include "common.h" #include -// Gate kernels for z13 and z14 on gcc version -#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ - /* RHEL 7 since 7.3: */ \ - (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ - __GNUC_RH_RELEASE__ >= 11) -#define HAVE_Z13_SUPPORT -#endif - -#if __GNUC__ >= 7 -#define HAVE_Z14_SUPPORT -#endif - // Guard the use of getauxval() on glibc version >= 2.16 #ifdef __GLIBC__ #include @@ -47,10 +35,10 @@ static unsigned long get_hwcap(void) { #endif // __GLIBC extern gotoblas_t gotoblas_ZARCH_GENERIC; -#ifdef HAVE_Z13_SUPPORT +#ifdef DYN_Z13 extern gotoblas_t gotoblas_Z13; #endif -#ifdef 
HAVE_Z14_SUPPORT +#ifdef DYN_Z14 extern gotoblas_t gotoblas_Z14; #endif @@ -66,10 +54,10 @@ static char* corename[] = { }; char* gotoblas_corename(void) { -#ifdef HAVE_Z13_SUPPORT +#ifdef DYN_Z13 if (gotoblas == &gotoblas_Z13) return corename[1]; #endif -#ifdef HAVE_Z14_SUPPORT +#ifdef DYN_Z14 if (gotoblas == &gotoblas_Z14) return corename[2]; #endif if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; @@ -89,15 +77,15 @@ static gotoblas_t* get_coretype(void) { unsigned long hwcap __attribute__((unused)) = get_hwcap(); +#ifdef DYN_Z14 // z14 and z15 systems: exploit Vector Facility (SIMD) and // Vector-Enhancements Facility 1 (float SIMD instructions), if present. -#ifdef HAVE_Z14_SUPPORT if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) return &gotoblas_Z14; #endif +#ifdef DYN_Z13 // z13: Vector Facility (SIMD for double) -#ifdef HAVE_Z13_SUPPORT if (hwcap & HWCAP_S390_VX) return &gotoblas_Z13; #endif @@ -123,19 +111,27 @@ static gotoblas_t* force_coretype(char* coretype) { } } - switch (found) - { -#ifdef HAVE_Z13_SUPPORT - case 1: return (&gotoblas_Z13); + if (found == 1) { +#ifdef DYN_Z13 + return &gotoblas_Z13; +#else + openblas_warning(1, "Z13 support not compiled in"); + return NULL; #endif -#ifdef HAVE_Z14_SUPPORT - case 2: return (&gotoblas_Z14); + } else if (found == 2) { +#ifdef DYN_Z14 + return &gotoblas_Z14; +#else + openblas_warning(1, "Z14 support not compiled in"); + return NULL; #endif - case 3: return (&gotoblas_ZARCH_GENERIC); - default: return NULL; + } else if (found == 3) { + return &gotoblas_ZARCH_GENERIC; } + snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); + return NULL; } void gotoblas_dynamic_init(void) { From a55fe06f251ff6269f4a126dec27f59bf3ea67f0 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 7 Sep 2020 17:13:03 +0200 Subject: [PATCH 429/593] s390x/DYNAMIC_ARCH: define a HW_CAP flag to support slightly older glibc versions Enable building DYNAMIC_ARCH support with 
older versions of glibc that do not know about the hwcap flag HWCAP_S390_VXE yet. Signed-off-by: Marius Hillenbrand --- driver/others/dynamic_zarch.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index dac8909fb..bf5eab9b2 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -65,6 +65,10 @@ char* gotoblas_corename(void) { return corename[0]; } +#ifndef HWCAP_S390_VXE +#define HWCAP_S390_VXE 8192 +#endif + /** * Detect the fitting set of kernels by retrieving the CPU features supported by * OS from the auxiliary value AT_HWCAP and choosing the set of kernels From f7731a358af7871a72dad3ada5d35963bb454ed7 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 8 Sep 2020 15:15:15 +0200 Subject: [PATCH 430/593] Update CONTRIBUTORS.md - clang build fixes for IBM z Signed-off-by: Marius Hillenbrand --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index aba39e56f..7b994885a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -187,6 +187,7 @@ In chronological order: * Marius Hillenbrand * [2020-05-12] Revise dynamic architecture detection for IBM z * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 + * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support * Danfeng Zhang * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file From 047b8d7aff79d31c25c8c6a46fd917fafe4ca8c8 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 8 Sep 2020 19:30:37 +0200 Subject: [PATCH 431/593] Add an s390 build with clang to the Travis configuration Since clang builds have been fixed on s390x, including support for DYNAMIC_ARCH, cover that build type in Travis. Explicitly request Ubuntu 20.04 (codename focal) to get a recent LLVM/clang version 10.x and thereby cover all s390x architecture generations supported in OpenBLAS.
Ubuntu 18.10's LLVM/clang 6.x cannot build the inline assembly in some of the Z13 and Z14 kernels. LLVM/clang currently does not support OpenMP on s390x, so disable that in the build. Signed-off-by: Marius Hillenbrand --- .travis.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index 307010e40..3f8f766fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,6 +43,18 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu + os: linux + dist: focal + arch: s390x + compiler: clang + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=IBMZ_LINUX + - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 746ad3bd190493a7219bc02547a050772d4a4e01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 18:40:59 +0200 Subject: [PATCH 432/593] Fix vendor match for GCC gfortran --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index dd4d3475c..f894aa9ac 100644 --- a/f_check +++ b/f_check @@ -69,7 +69,7 @@ if ($compiler eq "") { $bu = "_"; } - if ($data =~ /GNU/) { + if ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; From 26792d2096ce0736a53bef6b8bf4ff0206ac3efa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:47:55 +0200 Subject: [PATCH 433/593] Copy BUILD_* directives to the compiler options to allow ifdef in tests --- cmake/system.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index c0f3c6ed2..aa342c3d2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -393,6 +393,18 @@ set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}") +if (BUILD_SINGLE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") +endif() +if (BUILD_DOUBLE) + 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") +endif() +if (BUILD_COMPLEX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") +endif() +if (BUILD_COMPLEX16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() From 74e358bcd514cff2e9b32c13571c09176b56a3d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:49:01 +0200 Subject: [PATCH 434/593] Remove spurious complex16 tests --- ctest/c_dblas1.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c index e49ae6007..8e13afcaa 100644 --- a/ctest/c_dblas1.c +++ b/ctest/c_dblas1.c @@ -74,16 +74,6 @@ void F77_dswap( const int *N, double *X, const int *incX, return; } -double F77_dzasum(const int *N, void *X, const int *incX) -{ - return cblas_dzasum(*N, X, *incX); -} - -double F77_dznrm2(const int *N, OPENBLAS_CONST void *X, const int *incX) -{ - return cblas_dznrm2(*N, X, *incX); -} - int F77_idamax(const int *N, OPENBLAS_CONST double *X, const int *incX) { if (*N < 1 || *incX < 1) return(0); From 593ce9e23786796a483f44436e4aca57d042f05d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:50:12 +0200 Subject: [PATCH 435/593] Make building individual tests depend on BUILD_SINGLE etc defines --- test/CMakeLists.txt | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index adeee3452..f1f773cba 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -3,11 +3,18 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) -set(OpenBLAS_Tests - sblat1 sblat2 sblat3 - dblat1 dblat2 dblat3 - cblat1 cblat2 cblat3 - zblat1 zblat2 zblat3) +if (BUILD_SINGLE) + list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) +endif() +if (BUILD_DOUBLE) + list (APPEND OpenBLAS_Tests dblat1 dblat2 dblat3) +endif() +if (BUILD_COMPLEX) + list (APPEND OpenBLAS_Tests cblat1 cblat2 
cblat3) +endif() +if (BUILD_COMPLEX16) + list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) +endif() foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) From ce8939863626d3a194890e87edc9b7280f73b660 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:52:18 +0200 Subject: [PATCH 436/593] Make tests for individual variable types conditional on the respective BUILD_ option --- utest/test_amax.c | 6 +++++- utest/test_axpy.c | 9 +++++++++ utest/test_dotu.c | 3 +++ utest/test_ismin.c | 2 ++ utest/test_min.c | 13 +++++++++++-- utest/test_potrs.c | 39 ++++++++++++++++++++++++++++++--------- utest/test_rot.c | 9 +++++++++ utest/test_swap.c | 9 +++++++++ 8 files changed, 78 insertions(+), 12 deletions(-) diff --git a/utest/test_amax.c b/utest/test_amax.c index 831804027..a9e5a1c85 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "openblas_utest.h" +#ifdef BUILD_SINGLE CTEST(amax, samax){ blasint N=3, inc=1; float te_max=0.0, tr_max=0.0; @@ -43,7 +44,8 @@ CTEST(amax, samax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } - +#endif +#ifdef BUILD_DOUBLE CTEST(amax, damax){ blasint N=3, inc=1; double te_max=0.0, tr_max=0.0; @@ -54,3 +56,5 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } +#endif + diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 603043073..5fd7c1b04 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST(axpy,daxpy_inc_0) { blasint i; @@ -52,7 +53,9 @@ CTEST(axpy,daxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX16 CTEST(axpy,zaxpy_inc_0) { blasint i; @@ -71,7 +74,9 @@ CTEST(axpy,zaxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_SINGLE CTEST(axpy,saxpy_inc_0) { blasint i; @@ -90,7 +95,9 @@ CTEST(axpy,saxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX CTEST(axpy,caxpy_inc_0) { blasint i; @@ -109,3 +116,5 @@ CTEST(axpy,caxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif + diff --git a/utest/test_dotu.c b/utest/test_dotu.c index 918541848..542286403 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "openblas_utest.h" +#ifdef BUILD_COMPLEX16 CTEST( zdotu,zdotu_n_1) { blasint N=1,incX=1,incY=1; @@ -80,3 +81,5 @@ CTEST(zdotu, zdotu_offset_1) #endif } +#endif + diff --git a/utest/test_ismin.c b/utest/test_ismin.c index f23d6b545..af597807f 100644 --- a/utest/test_ismin.c +++ b/utest/test_ismin.c @@ -36,6 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ELEMENTS 50 #define INCREMENT 2 +#ifdef BUILD_SINGLE CTEST(ismin, positive_step_2){ blasint i; blasint N = ELEMENTS, inc = INCREMENT; @@ -87,3 +88,4 @@ CTEST(ismax, negative_step_2){ blasint index = BLASFUNC(ismax)(&N, x, &inc); ASSERT_EQUAL(9, index); } +#endif diff --git a/utest/test_min.c b/utest/test_min.c index fd31b5982..a627674ae 100644 --- a/utest/test_min.c +++ b/utest/test_min.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include "openblas_utest.h" - +#ifdef BUILD_SINGLE CTEST(min, smin_negative){ blasint N=3, inc=1; float te_min=0.0, tr_min=0.0; @@ -43,7 +43,9 @@ CTEST(min, smin_negative){ ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); } +#endif +#ifdef BUILD_DOUBLE CTEST(min, dmin_positive){ blasint N=3, inc=1; double te_min=0.0, tr_min=0.0; @@ -54,7 +56,9 @@ CTEST(min, dmin_positive){ ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); } +#endif +#ifdef BUILD_SINGLE CTEST(min, smin_zero){ blasint N=3, inc=1; float te_min=0.0, tr_min=0.0; @@ -76,7 +80,9 @@ CTEST(max, smax_negative){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } +#endif +#ifdef BUILD_DOUBLE CTEST(max, dmax_positive){ blasint N=3, inc=1; double te_max=0.0, tr_max=0.0; @@ -87,7 +93,8 @@ CTEST(max, dmax_positive){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } - +#endif +#ifdef BUILD_SINGLE CTEST(max, smax_zero){ blasint N=3, inc=1; float te_max=0.0, tr_max=0.0; @@ -98,3 +105,5 @@ CTEST(max, smax_zero){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } +#endif + diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 7afeb4c9d..05ce3037b 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -39,10 +39,10 @@ void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, BLASINT*, complex double*, BLASINT*, BLASINT*); */ - //https://github.com/xianyi/OpenBLAS/issues/695 CTEST(potrf, bug_695){ +#ifdef BUILD_COMPLEX openblas_complex_float A1[100] = { openblas_make_complex_float(5.8525753, +0.0), @@ -153,7 +153,9 @@ CTEST(potrf, bug_695){ blasint info[1]; BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); +#endif +#ifdef BUILD_COMPLEX16 openblas_complex_double A2[100] = { openblas_make_complex_double(3.0607147216796875, +0.0), @@ -283,7 +285,8 @@ CTEST(potrf, bug_695){ 
char lo = 'L'; blasint nrhs = 2; BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); - +#endif +#ifdef BUILD_COMPLEX // note that this is exactly equal to A1 openblas_complex_float A3[100] = { @@ -393,9 +396,9 @@ CTEST(potrf, bug_695){ if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__); } +#endif } - // Check potrf factorizes a small problem correctly CTEST(potrf, smoketest_trivial){ float A1s[4] = {2, 0.3, 0.3, 3}; @@ -439,31 +442,43 @@ CTEST(potrf, smoketest_trivial){ uplo = 'U'; } +#ifdef BUILD_SINGLE BLASFUNC(scopy)(&nv, A1s, &inc, As, &inc); +#endif +#ifdef BUILD_DOUBLE BLASFUNC(dcopy)(&nv, A1d, &inc, Ad, &inc); +#endif +#ifdef BUILD_COMPLEX BLASFUNC(ccopy)(&nv, (float *)A1c, &inc, (float *)Ac, &inc); +#endif +#ifdef BUILD_COMPLEX16 BLASFUNC(zcopy)(&nv, (double *)A1z, &inc, (double *)Az, &inc); +#endif +#ifdef BUILD_SINGLE BLASFUNC(spotrf)(&uplo, &n, As, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif +#ifdef BUILD_DOUBLE BLASFUNC(dpotrf)(&uplo, &n, Ad, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif +#ifdef BUILD_COMPLEX BLASFUNC(cpotrf)(&uplo, &n, (float *)Ac, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif +#ifdef BUILD_COMPLEX16 BLASFUNC(zpotrf)(&uplo, &n, (double *)Az, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif /* Fill the other triangle */ if (uplo == 'L') { for (i = 0; i < n; ++i) { @@ -495,14 +510,20 @@ CTEST(potrf, smoketest_trivial){ trans1 = 'C'; trans2 = 'N'; } - +#ifdef BUILD_SINGLE BLASFUNC(sgemm)(&trans1, &trans2, &n, &n, &n, &ones, As, &n, As, &n, &zeros, Bs, &n); +#endif +#ifdef BUILD_DOUBLE BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, Ad, &n, Ad, &n, &zerod, Bd, &n); +#endif +#ifdef BUILD_COMPLEX BLASFUNC(cgemm)(&trans1, &trans2, &n, &n, &n, (float *)&onec, (float *)Ac, &n, (float *)Ac, &n, 
(float *)&zeroc, (float *)Bc, &n); +#endif +#ifdef BUILD_COMPLEX16 BLASFUNC(zgemm)(&trans1, &trans2, &n, &n, &n, (double *)&onez, (double *)Az, &n, (double *)Az, &n, (double *)&zeroz, (double *)Bz, &n); - +#endif /* Check result is close to original */ for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { diff --git a/utest/test_rot.c b/utest/test_rot.c index cf72ad22d..0e74ecbb3 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST(rot,drot_inc_0) { blasint i=0; @@ -52,7 +53,9 @@ CTEST(rot,drot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX16 CTEST(rot,zdrot_inc_0) { blasint i=0; @@ -72,7 +75,9 @@ CTEST(rot,zdrot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_SINGLE CTEST(rot,srot_inc_0) { blasint i=0; @@ -91,7 +96,9 @@ CTEST(rot,srot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX CTEST(rot, csrot_inc_0) { blasint i=0; @@ -110,3 +117,5 @@ CTEST(rot, csrot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif + diff --git a/utest/test_swap.c b/utest/test_swap.c index 259c83a5c..6d8ae8056 100644 --- a/utest/test_swap.c +++ b/utest/test_swap.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST(swap,dswap_inc_0) { blasint i=0; @@ -50,7 +51,9 @@ CTEST(swap,dswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX16 CTEST(swap,zswap_inc_0) { blasint i=0; @@ -68,7 +71,9 @@ CTEST(swap,zswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_SINGLE CTEST(swap,sswap_inc_0) { blasint i=0; @@ -86,7 +91,9 @@ CTEST(swap,sswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX CTEST(swap,cswap_inc_0) { blasint i=0; @@ -104,3 +111,5 @@ CTEST(swap,cswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif + From ec2948f14784c3559b11f9aed07646396c3527cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 22:17:46 +0200 Subject: [PATCH 437/593] Make tests conditional on BUILD_DOUBLE --- utest/test_kernel_regress.c | 2 ++ utest/test_rotmg.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c index 93a30b30c..5b131bb2c 100644 --- a/utest/test_kernel_regress.c +++ b/utest/test_kernel_regress.c @@ -22,6 +22,7 @@ double m[DATASIZE*DATASIZE]; CTEST(kernel_regress,skx_avx) { +#ifdef BUILD_DOUBLE double norm; int i, j, info; srand(0); @@ -47,4 +48,5 @@ CTEST(kernel_regress,skx_avx) norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); +#endif } diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index e5ec78983..ad435f6b0 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST (drotmg,rotmg) { double te_d1, tr_d1; @@ -204,3 +205,4 @@ CTEST(drotmg, drotmg_D1_big_D2_big_flag_zero) ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } +#endif From de139337b8bcb1c76cd157afd4d5fd035a76efdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 22:20:41 +0200 Subject: [PATCH 438/593] Remove spurious tests for complex ASUM and NRM2 --- ctest/c_sblas1.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ctest/c_sblas1.c b/ctest/c_sblas1.c index 1a433b287..a562014a4 100644 --- a/ctest/c_sblas1.c +++ b/ctest/c_sblas1.c @@ -21,16 +21,6 @@ void F77_saxpy(blasint *N, const float *alpha, OPENBLAS_CONST float *X, return; } -float F77_scasum(blasint *N, float *X, blasint *incX) -{ - return cblas_scasum(*N, X, *incX); -} - -float F77_scnrm2(blasint *N, OPENBLAS_CONST float *X, blasint *incX) -{ - return cblas_scnrm2(*N, X, *incX); -} - void F77_scopy(blasint *N, OPENBLAS_CONST float *X, blasint *incX, float *Y, blasint *incY) { From 4d250d0cdf9f0d234aa9c3eeff246bbe1b9edd3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 23:29:01 +0200 Subject: [PATCH 439/593] Rearrange ifdefs --- utest/test_potrs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 05ce3037b..2681615f4 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -42,7 +42,6 @@ void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, //https://github.com/xianyi/OpenBLAS/issues/695 CTEST(potrf, bug_695){ -#ifdef BUILD_COMPLEX openblas_complex_float A1[100] = { openblas_make_complex_float(5.8525753, +0.0), @@ -151,11 +150,11 @@ CTEST(potrf, bug_695){ blasint n=10; blasint info[1]; +#ifdef BUILD_COMPLEX BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); #endif -#ifdef BUILD_COMPLEX16 openblas_complex_double A2[100] = { 
openblas_make_complex_double(3.0607147216796875, +0.0), @@ -284,9 +283,9 @@ CTEST(potrf, bug_695){ }; char lo = 'L'; blasint nrhs = 2; +#ifdef BUILD_COMPLEX16 BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); #endif -#ifdef BUILD_COMPLEX // note that this is exactly equal to A1 openblas_complex_float A3[100] = { @@ -391,6 +390,7 @@ CTEST(potrf, bug_695){ openblas_make_complex_float(-0.9617417, -1.2486815), openblas_make_complex_float(3.4629636, +0.0) }; +#ifdef BUILD_COMPLEX BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info); // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91])); if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { From 9e11c2d62f23ef2483d206aaf3952e0bd09d30cb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 23:55:11 +0200 Subject: [PATCH 440/593] Add BUILD_SINGLE etc --- Makefile.rule | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 2c12177ee..40bd1a854 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -277,5 +277,10 @@ COMMON_PROF = -pg # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 # +# the below is not yet configurable, use cmake if you need to build only select types +BUILD_SINGLE = 1 +BUILD_DOUBLE = 1 +BUILD_COMPLEX = 1 +BUILD_COMPLEX16 = 1 # End of user configuration # From ba644378dce720f6bb946aa2b585c9e71f257e1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Sep 2020 00:03:33 +0200 Subject: [PATCH 441/593] Copy BUILD_ options available to the compiler flags --- Makefile.system | 55 +++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1b832ba41..0ccf9eaed 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,6 +295,7 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 := 
$(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -593,35 +594,33 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# if the compiler accepts -march=arch11 or -march=z13 and can compile a file -# with z13-specific inline assembly, then we can include support for Z13. -# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases -# only support one or the other. -# note: LLVM version 6.x supported -march=z13 yet could not handle vector -# registers in inline assembly, so the check for supporting the -march flag is -# not enough. -ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null -ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) -ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) - -ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) +# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(ZARCH_SUPPORT_Z13), 1) DYNAMIC_CORE += Z13 -CCOMMON_OPT += -DDYN_Z13 else -$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) +$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) endif -# as above for z13, check for -march=arch12 and z14 support in the compiler. 
-ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) -ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) -ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 -CCOMMON_OPT += -DDYN_Z14 else -$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) +$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) +endif endif - -endif # ARCH zarch ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 @@ -1223,6 +1222,18 @@ endif ifeq ($(BUILD_HALF), 1) CCOMMON_OPT += -DBUILD_HALF endif +ifeq ($(BUILD_SINGLE), 1) +CCOMMON_OPT += -DBUILD_SINGLE +endif +ifeq ($(BUILD_DOUBLE), 1) +CCOMMON_OPT += -DBUILD_DOUBLE +endif +ifeq ($(BUILD_COMPLEX), 1) +CCOMMON_OPT += -DBUILD_COMPLEX +endif +ifeq ($(BUILD_COMPLEX16), 1) +CCOMMON_OPT += -DBUILD_COMPLEX16 +endif CCOMMON_OPT += -DVERSION=\"$(VERSION)\" From 274d6e015b56a9f0ccad928232ed3bd88a063754 Mon Sep 17 00:00:00 2001 From: fossum Date: Mon, 14 Sep 2020 13:10:48 -0500 Subject: [PATCH 442/593] Fixing a performance bug in trsm_[LR].c. 
--- driver/level3/trsm_L.c | 4 ++-- driver/level3/trsm_R.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c index d8130ee7e..d842efa93 100644 --- a/driver/level3/trsm_L.c +++ b/driver/level3/trsm_L.c @@ -131,7 +131,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -197,7 +197,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c index f6a57f93f..f76a8f7f3 100644 --- a/driver/level3/trsm_R.c +++ b/driver/level3/trsm_R.c @@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -182,7 +182,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ min_jj = min_j - min_l - ls + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -243,7 +243,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += 
min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ min_jj = min_j - js + ls - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; From dfeca46098ff7b3cc47aa053195fe1c82bce87e9 Mon Sep 17 00:00:00 2001 From: fossum Date: Tue, 15 Sep 2020 08:59:50 -0500 Subject: [PATCH 443/593] Adding performance patch for trmm, just like #2836 --- driver/level3/trmm_L.c | 8 ++++---- driver/level3/trmm_R.c | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 1027c0c73..ae8435d03 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -139,7 +139,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -209,7 +209,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, 
BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -374,7 +374,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index e8df7fb21..3be43edde 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -150,7 +150,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -207,7 +207,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT 
*sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -262,7 +262,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -348,7 +348,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif From c4aeeeb9f4d59a28ca91382bc77e55d9abbaa6e7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Sep 2020 23:15:34 +0200 Subject: [PATCH 444/593] Activate all BUILD_ options if none was specified --- cmake/system.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/cmake/system.cmake b/cmake/system.cmake index aa342c3d2..8908a1890 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -393,6 +393,13 @@ set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}") + +if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_COMPLEX16) + set (BUILD_SINGLE ON) + set (BUILD_DOUBLE ON) + set (BUILD_COMPLEX ON) + set (BUILD_COMPLEX16 ON) +endif() if (BUILD_SINGLE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") endif() From 2e3b15d68bc108c112abdc0ea3dc8074134b3815 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:43:55 +0200 Subject: [PATCH 445/593] Add CMakeLists.txt --- cpp_thread_test/CMakeLists.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 cpp_thread_test/CMakeLists.txt diff --git a/cpp_thread_test/CMakeLists.txt b/cpp_thread_test/CMakeLists.txt new file mode 100644 index 000000000..5eccb12ce --- /dev/null +++ b/cpp_thread_test/CMakeLists.txt @@ -0,0 +1,23 @@ +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) + +enable_language(CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +# Both thread safety tests below are only configured when USE_OPENMP is set. +if (USE_OPENMP) +if (CPP_THREAD_SAFETY_TEST) + message(STATUS "building thread safety test") + add_executable(dgemm_thread_safety dgemm_thread_safety.cpp) + target_link_libraries(dgemm_thread_safety ${OpenBLAS_LIBNAME}) + add_test( dgemm_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemm_thread_safety) +endif() + +# The DGEMV test is built when either of the two options is enabled. +if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_executable(dgemv_thread_safety dgemv_thread_safety.cpp) + target_link_libraries(dgemv_thread_safety ${OpenBLAS_LIBNAME}) + add_test(dgemv_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemv_thread_safety) +endif() + +endif() From 8c5c991bd7e4eb89fc46d6c5ac41bd5ab9363836 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:45:40 +0200 Subject: [PATCH
446/593] Add cpp_thread_test options --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b82d7670..954c053e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() +option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -234,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES From 84c00c3c6e3f8f1344d632a559610d03a861f9fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:46:41 +0200 Subject: [PATCH 447/593] Support running just the GEMV version of the thread safety test --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 7a03b08f0..93e8af2eb 100644 --- a/Makefile +++ b/Makefile @@ -146,6 +146,9 @@ ifneq ($(NO_CBLAS), 1) ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all endif +ifeq ($(CPP_THREAD_SAFETY_GEMV), 1) + $(MAKE) -C cpp_thread_test dgemv_tester +endif endif endif From 6abca76c4e0171a598ffc7f3bef8279c13d71546 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:49:24 +0200 Subject: [PATCH 448/593] Add option for running only the less demanding GEMV version of the thread safety tests --- Makefile.rule | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.rule b/Makefile.rule 
index 40bd1a854..4d6f2d313 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,6 +272,9 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 +# +# use this to run only the less memory-hungry GEMV test +# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support From 75d440caa083a32ca3b30809f18f1e29c75a967b Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Thu, 17 Sep 2020 16:45:07 +0200 Subject: [PATCH 449/593] s390x/DYNAMIC_ARCH: fixup broken merge and reapply simplification An unrelated commit and merge inadvertently reverted our recent two changes for simplifying DYNAMIC_ARCH on s390x. Simply reapply the changes. Simplify detection of which kernels we can compile on s390x. Instead of decoding the gcc version in a complicated manner, just check if CC supports a given -march=archXY flag. Together with the next patch, we thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1. To enable builds with DYNAMIC_ARCH with older compiler releases, the Makefile and drivers/other/dynamic_arch.c need a common view of the architecture support built into the library. We follow the notation from x86 when used with DYNAMIC_LIST, where defines DYN_<ARCH_NAME> denote support for a given generation to be built in. Since there are far fewer architecture generations in OpenBLAS for s390x, that does not bloat command lines too much.
Closes: #2842 Fixes: ba644378dce7 ("Copy BUILD_ options available to the compiler flags") Signed-off-by: Marius Hillenbrand --- Makefile.system | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/Makefile.system b/Makefile.system index 0ccf9eaed..c46c88581 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,7 +295,6 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -594,34 +593,36 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -ifeq ($(GCCVERSIONGT5), 1) - ZARCH_SUPPORT_Z13 := 1 -else ifeq ($(GCCVERSIONEQ5), 1) -ifeq ($(GCCMINORVERSIONGTEQ2), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(ZARCH_SUPPORT_Z13), 1) +# if the compiler accepts -march=arch11 or -march=z13 and can compile a file +# with z13-specific inline assembly, then we can include support for Z13. +# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases +# only support one or the other. +# note: LLVM version 6.x supported -march=z13 yet could not handle vector +# registers in inline assembly, so the check for supporting the -march flag is +# not enough.
+ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null +ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) + +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 +CCOMMON_OPT += -DDYN_Z13 else -$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif -ifeq ($(GCCVERSIONGTEQ7), 1) +# as above for z13, check for -march=arch12 and z14 support in the compiler. +ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 +CCOMMON_OPT += -DDYN_Z14 else -$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) -endif +$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif +endif # ARCH zarch + ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 From be43d2cb9651d37aed44307037dc98b837f95358 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 17 Sep 2020 12:56:28 -0500 Subject: [PATCH 450/593] Optimize daxpy/zaxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. Tested in simulator and no new failures. 
--- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/daxpy_microk_power10.c | 131 ++++++++++++++++++ kernel/power/daxpy_power10.c | 121 +++++++++++++++++ kernel/power/zaxpy_microk_power10.c | 200 ++++++++++++++++++++++++++++ kernel/power/zaxpy_power10.c | 126 ++++++++++++++++++ 5 files changed, 580 insertions(+), 2 deletions(-) create mode 100644 kernel/power/daxpy_microk_power10.c create mode 100644 kernel/power/daxpy_power10.c create mode 100644 kernel/power/zaxpy_microk_power10.c create mode 100644 kernel/power/zaxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index f390fac61..ec02e09ad 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -142,13 +142,13 @@ CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c -DAXPYKERNEL = daxpy.c +DAXPYKERNEL = daxpy_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power9.S else CAXPYKERNEL = caxpy.c endif -ZAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy.c DCOPYKERNEL = dcopy.c diff --git a/kernel/power/daxpy_microk_power10.c b/kernel/power/daxpy_microk_power10.c new file mode 100644 index 000000000..bc9199efd --- /dev/null +++ b/kernel/power/daxpy_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) +{ + __vector double t0; + + __asm__ + ( + XXSPLTD_S(%x4,%x6,0) + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 36, 32, %x4 \n\t" + "xvmaddadp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddadp 38, 34, %x4 \n\t" + "xvmaddadp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + + "lxvp 36, 128(%3) \n\t" + "lxvp 38, 160(%3) \n\t" + + "xvmaddadp 44, 40, %x4 \n\t" + "xvmaddadp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddadp 46, 42, %x4 \n\t" + "xvmaddadp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 36, 32, %x4 \n\t" + "xvmaddadp 37, 33, %x4 \n\t" + "xvmaddadp 38, 34, %x4 \n\t" + "xvmaddadp 39, 35, %x4 \n\t" + + "xvmaddadp 44, 40, %x4 \n\t" + "xvmaddadp 45, 41, %x4 \n\t" + "xvmaddadp 46, 42, %x4 \n\t" + "xvmaddadp 47, 43, %x4 \n\t" + + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=wa" (t0) // 4 + : + "m" (*x), + "d" (alpha) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); + +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c new file mode 100644 index 000000000..ebe91a80f --- /dev/null +++ b/kernel/power/daxpy_power10.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "daxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + daxpy_kernel_8(n1, x, y, da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c new file mode 100644 index 000000000..8e593bbfa --- /dev/null +++ b/kernel/power/zaxpy_microk_power10.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4 (long n, double *x, double *y, + double alpha_r, double alpha_i) +{ +#if !defined(CONJ) + static const double mvec[2] = { 1.0, -1.0 }; +#else + static const double mvec[2] = { -1.0, 1.0 }; +#endif + const double *mvecp = mvec; + + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + long ytmp; + + __asm__ + ( + XXSPLTD_S(32,%x15,0) // alpha_r + XXSPLTD_S(33,%x16,0) // alpha_i + "lxvd2x 36, 0, %17 \n\t" // mvec + +#if !defined(CONJ) + "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + + "mr %12, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + XXSWAPD_S(%x4,40) // exchange real and imag part + XXSWAPD_S(%x5,41) // exchange real and imag part + XXSWAPD_S(%x6,42) // exchange real and imag part + XXSWAPD_S(%x7,43) // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + XXSWAPD_S(%x8,44) // exchange real and imag part + XXSWAPD_S(%x9,45) // exchange real and imag part + XXSWAPD_S(%x10,46) // exchange real and imag part + XXSWAPD_S(%x11,47) // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddadp 34, 44, 32 \n\t" + "xvmaddadp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddadp 38, 46, 32 \n\t" + "xvmaddadp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddadp 49, %x5, 33 \n\t" + "xvmaddadp 50, %x6, 33 \n\t" + "xvmaddadp 51, %x7, 33 \n\t" + + "xvmaddadp 34, %x8, 33 \n\t" + "xvmaddadp 35, %x9, 33 \n\t" + "xvmaddadp 38, %x10, 33 \n\t" + "xvmaddadp 39, %x11, 33 \n\t" + + "stxvp 48, 0(%12) \n\t" + "stxvp 50, 32(%12) \n\t" + "stxvp 34, 64(%12) \n\t" + "stxvp 38, 96(%12) \n\t" + + "addi %12, %12, 128 \n\t" + + XXSWAPD_S(%x4,40) // exchange real and imag part + XXSWAPD_S(%x5,41) // exchange real and imag part + "lxvp 48, 0(%3) \n\t" // y0 + XXSWAPD_S(%x6,42) // exchange real and imag part + XXSWAPD_S(%x7,43) // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + XXSWAPD_S(%x8,44) // exchange real and imag part + XXSWAPD_S(%x9,45) // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + XXSWAPD_S(%x10,46) // exchange real and imag part + XXSWAPD_S(%x11,47) // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 34, 44, 32 \n\t" + "xvmaddadp 35, 45, 32 \n\t" + "xvmaddadp 38, 46, 32 \n\t" + "xvmaddadp 39, 47, 32 \n\t" + + "xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddadp 49, %x5, 33 \n\t" + "xvmaddadp 50, %x6, 33 \n\t" + "xvmaddadp 51, %x7, 33 \n\t" + + "xvmaddadp 34, %x8, 33 \n\t" + "xvmaddadp 35, %x9, 33 \n\t" + "xvmaddadp 38, %x10, 33 \n\t" + "xvmaddadp 39, %x11, 33 \n\t" + + "stxvp 48, 0(%12) \n\t" + "stxvp 50, 32(%12) \n\t" + "stxvp 34, 64(%12) \n\t" + "stxvp 38, 96(%12) \n\t" + + "#n=%1 x=%13=%2 y=%0=%3 alpha=(%15,%16) mvecp=%14=%17 ytmp=%12\n" + "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=wa" (t0), // 4 + "=wa" (t1), // 5 + "=wa" (t2), // 6 + "=wa" (t3), // 7 + "=wa" (t4), // 8 + "=wa" (t5), // 9 + "=wa" (t6), // 10 + "=wa" (t7), // 11 + "=b" (ytmp) // 12 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 15 + "d" (alpha_i), // 16 + "12" (mvecp) // 17 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/power/zaxpy_power10.c b/kernel/power/zaxpy_power10.c new file mode 100644 index 000000000..54cfb8fd7 --- /dev/null +++ b/kernel/power/zaxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "zaxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + zaxpy_kernel_4 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + From 7e4d5c237cb10642a9cbf3c173b06045dd10c230 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 
18 Sep 2020 09:19:46 +0200 Subject: [PATCH 451/593] Fix workspace query in xGELQ (Reference-LAPACK PR443) --- lapack-netlib/SRC/cgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/cgetsls.f | 2 +- lapack-netlib/SRC/dgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/dgetsls.f | 2 +- lapack-netlib/SRC/sgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/sgetsls.f | 2 +- lapack-netlib/SRC/zgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/zgetsls.f | 2 +- 8 files changed, 88 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index c3b2238bf..f0ff3a20d 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. 
( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index 3d783be66..01de3c984 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -261,7 +261,7 @@ TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL CGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index fc14d892f..7b2f80862 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. 
( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index dfc72c8b2..c2ba5e2b8 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -258,7 +258,7 @@ TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL DGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 96c4097e8..e45c68db4 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; 
-*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index 53d2f9431..3bf084515 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -258,7 +258,7 @@ TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL SGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index 4e7e7e38e..beb054b87 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. 
MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 1aab3c662..11233785b 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -261,7 +261,7 @@ TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL ZGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM From f91057cbad196be09541eccf1ece5472531f63aa Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 15 Sep 2020 10:54:37 +0200 Subject: [PATCH 452/593] s390x: move common vector definitions and utils into header ... to facilitate reuse beyond gemm_vec.c and avoid code duplication. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 34 ++----------------- kernel/zarch/vector-common.h | 64 ++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 32 deletions(-) create mode 100644 kernel/zarch/vector-common.h diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index ef0b1d1e3..30f3171d2 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -30,12 +30,13 @@ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "common.h" -#include +#include "vector-common.h" #include #include #include + #ifdef COMPLEX #error "Handling for complex numbers is not supported in this kernel" #endif @@ -153,37 +154,6 @@ static const bool backwards = false; * 3, May 2008. */ -#define VLEN_BYTES 16 -#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) - -typedef FLOAT vector_float __attribute__ ((vector_size (16))); - -/** - * Load a vector into register, and hint on 8-byte alignment to improve - * performance. gcc-9 and newer will create these hints by itself. For older - * compiler versions, use inline assembly to explicitly express the hint. - * Provide explicit hex encoding to cater for binutils versions that do not know - * about vector-load with alignment hints yet. - * - * Note that, for block sizes where we apply vectorization, vectors in A will - * always be 8-byte aligned. - */ -static inline vector_float vec_load_hinted(FLOAT const *restrict a) { - vector_float const *restrict addr = (vector_float const *restrict)a; - vector_float y; - -#if __GNUC__ < 9 && !defined(__clang__) - // hex-encode vl %[out],%[addr],3 - asm(".insn vrx,0xe70000003006,%[out],%[addr],3" - : [ out ] "=v"(y) - : [ addr ] "R"(*addr)); -#else - y = *addr; -#endif - - return y; -} - /** * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. 
* diff --git a/kernel/zarch/vector-common.h b/kernel/zarch/vector-common.h new file mode 100644 index 000000000..140d39d7b --- /dev/null +++ b/kernel/zarch/vector-common.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) IBM Corporation 2020. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#define VLEN_BYTES 16 +#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) + +typedef FLOAT vector_float __attribute__ ((vector_size (VLEN_BYTES))); + +/** + * Load a vector into register, and hint on 8-byte alignment to improve + * performance. gcc-9 and newer will create these hints by itself. For older + * compiler versions, use inline assembly to explicitly express the hint. + * Provide explicit hex encoding to cater for binutils versions that do not know + * about vector-load with alignment hints yet. + * + * Note that, for block sizes where we apply vectorization, vectors in A will + * always be 8-byte aligned. + */ +static inline vector_float vec_load_hinted(FLOAT const *restrict a) { + vector_float const *restrict addr = (vector_float const *restrict)a; + vector_float y; + +#if __GNUC__ < 9 && !defined(__clang__) + // hex-encode vl %[out],%[addr],3 + asm(".insn vrx,0xe70000003006,%[out],%[addr],3" + : [ out ] "=v"(y) + : [ addr ] "R"(*addr)); +#else + y = *addr; +#endif + + return y; +} From 77ea73f5e5579ea35b6be03bac455643b84e343d Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 16 Sep 2020 15:55:38 +0200 Subject: [PATCH 453/593] s390x: for clang use fp-contract=on instead of fast Make clang slightly more cautious when contracting floating-point operations (e.g., when applying fused multiply add) by setting -ffp-contract=on (instead of fast). 
Signed-off-by: Marius Hillenbrand --- Makefile.zarch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.zarch b/Makefile.zarch index b841d9b4d..092ca2589 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -12,5 +12,5 @@ endif # Enable floating-point expression contraction for clang, since it is the # default for gcc ifeq ($(C_COMPILER), CLANG) -CCOMMON_OPT += -ffp-contract=fast +CCOMMON_OPT += -ffp-contract=on endif From 22aa81f3e587c85c5ccdcbbe2964cf5f89a00931 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 14 Sep 2020 18:36:31 +0200 Subject: [PATCH 454/593] s390x: fix cscal and zscal implementations The implementation of complex scalar * vector multiplication for Z14 makes some LAPACK tests fail because the numerical differences to the reference implementation exceed the threshold (as can be seen by running make lapack-test and replacing kernel/zarch/cscal.c with a generic implementation for comparison). The complex multiplication uses terms of the form a * b + c * d for both real and imaginary parts. The assembly code (and compiler-emitted code as well) uses fused multiply add operations for the second product and sum. The results can be "surprising", for example when both terms in the imaginary part nearly cancel each other out. In that case, the second product contributes more digits to the sum than the first product that has been rounded before. One option is to use separate multiplications (which then round the same way) and a distinct add. Change the code to pursue that path, by (1) requesting the compiler not to contract the operations into FMAs and (2) replacing the assembly kernel with corresponding vectorized C code (where change 1 also applies). 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/cscal.c | 96 ++++++++++++++------------------------------ kernel/zarch/zscal.c | 94 ++++++++++++++----------------------------- 2 files changed, 60 insertions(+), 130 deletions(-) diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index f9e89a452..57bb89c0a 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" +#include "vector-common.h" -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v24,%%v16,32\n\t" - "verllg %%v25,%%v17,32\n\t" - "verllg %%v26,%%v18,32\n\t" - "verllg %%v27,%%v19,32\n\t" - "verllg %%v28,%%v20,32\n\t" - "verllg %%v29,%%v21,32\n\t" - "verllg %%v30,%%v22,32\n\t" - "verllg %%v31,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - 
"vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); +static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) { + vector_float da_r_vec = vec_splats(da_r); + vector_float da_i_vec = { -da_i, da_i, -da_i, da_i }; + + vector_float *x_vec_ptr = (vector_float *)x; + +#pragma GCC unroll 16 + for (size_t i = 0; i < n/2; i++) { + vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS); + vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]}; + + x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec; + } } static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { @@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { : "cc", "r1", "v0"); } -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, +static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x) { BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; for (i = 0; i < n; i += 4) { t0 = da_r * x[0] - da_i * x[1]; @@ -324,9 +290,7 
@@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); + cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x); j = n1; i = n1 * inc_x; } @@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else if (da_i == 0) cscal_kernel_16_zero_i(n1, alpha, x); else - cscal_kernel_16(n1, alpha, x); + cscal_kernel_16(n1, da_r, da_i, x); i = n1 << 1; j = n1; diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index a5a8f694d..d39b8447e 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -25,65 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. 
+ */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" +#include "vector-common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vpdi %%v28,%%v20,%%v20,4\n\t" - "vpdi %%v29,%%v21,%%v21,4\n\t" - "vpdi %%v30,%%v22,%%v22,4\n\t" - "vpdi %%v31,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", 
"v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); +static void zscal_kernel_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) { + vector_float da_r_vec = vec_splats(da_r); + vector_float da_i_vec = { -da_i, da_i }; + + vector_float * x_vec_ptr = (vector_float *)x; + +#pragma GCC unroll 16 + for (size_t i = 0; i < n; i++) { + vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS); + vector_float x_swapped = {x_vec[1], x_vec[0]}; + + x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec; + } } static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { @@ -195,14 +165,12 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { : "cc", "r1", "v0"); } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, +static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x) { BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; for (i = 0; i < n; i += 4) { t0 = da_r * x[0] - da_i * x[1]; @@ -320,9 +288,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); + zscal_kernel_inc_8(n1, da_r, da_i, x, inc_x); j = n1; i = n1 * inc_x; } @@ -358,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else if (da_i == 0) zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, alpha, x); + zscal_kernel_8(n1, da_r, da_i, x); i = n1 << 1; j = n1; From 325b539c26414f05666c0b0bfb2d6fe3e95cb039 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 22 Sep 2020 10:38:35 +0800 Subject: [PATCH 455/593] Optimize the performance of daxpy by using universal intrinsics --- kernel/simd/intrin.h | 51 +++++++++++++++++++++++++++++++++++++ kernel/simd/intrin_avx.h | 19 ++++++++++++++ 
kernel/simd/intrin_avx512.h | 19 ++++++++++++++ kernel/simd/intrin_sse.h | 19 ++++++++++++++ kernel/x86_64/daxpy.c | 39 ++++++++++++++++------------ 5 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 kernel/simd/intrin.h create mode 100644 kernel/simd/intrin_avx.h create mode 100644 kernel/simd/intrin_avx512.h create mode 100644 kernel/simd/intrin_sse.h diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h new file mode 100644 index 000000000..ef599f065 --- /dev/null +++ b/kernel/simd/intrin.h @@ -0,0 +1,51 @@ +#ifndef _INTRIN_H_ +#define _INTRIN_H_ + +#ifdef __cplusplus +extern "C" { +#endif +// include head +/** SSE **/ +#ifdef HAVE_SSE +#include +#endif +/** SSE2 **/ +#ifdef HAVE_SSE2 +#include +#endif +/** SSE3 **/ +#ifdef HAVE_SSE3 +#include +#endif +/** SSSE3 **/ +#ifdef HAVE_SSSE3 +#include +#endif +/** SSE41 **/ +#ifdef HAVE_SSE4_1 +#include +#endif + +/** AVX **/ +#ifdef HAVE_AVX +#include +#endif + +// distribute +#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16) +#include "intrin_avx512.h" +#elif defined(HAVE_AVX2) +#include "intrin_avx.h" +#elif defined(HAVE_SSE2) +#include "intrin_sse.h" +#endif + +#ifndef V_SIMD + #define V_SIMD 0 + #define V_SIMD_F64 0 +#endif + +#ifdef __cplusplus +} +#endif +#endif // _INTRIN_H_ diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h new file mode 100644 index 000000000..726254429 --- /dev/null +++ b/kernel/simd/intrin_avx.h @@ -0,0 +1,19 @@ +#define V_SIMD 256 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m256 v_f32; +#define v_nlanes_f32 8 +/* +arithmetic +*/ +#define v_add_f32 _mm256_add_ps +#define v_mul_f32 _mm256_mul_ps +/* +memory +*/ +// unaligned load +#define v_loadu_f32 _mm256_loadu_ps +#define v_storeu_f32 _mm256_storeu_ps +#define v_setall_f32(VAL) _mm256_set1_ps(VAL) \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h new file mode 100644 index 000000000..775fe7aa5 --- /dev/null +++ b/kernel/simd/intrin_avx512.h 
@@ -0,0 +1,19 @@ +#define V_SIMD 512 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m512 v_f32; +#define v_nlanes_f32 16 +/* +arithmetic +*/ +#define v_add_f32 _mm512_add_ps +#define v_mul_f32 _mm512_mul_ps +/* +memory +*/ +// unaligned load +#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) +#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR)) +#define v_setall_f32(VAL) _mm512_set1_ps(VAL) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h new file mode 100644 index 000000000..0cc159aa7 --- /dev/null +++ b/kernel/simd/intrin_sse.h @@ -0,0 +1,19 @@ +#define V_SIMD 128 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m128 v_f32; +#define v_nlanes_f32 4 +/* +arithmetic +*/ +#define v_add_f32 _mm_add_ps +#define v_mul_f32 _mm_mul_ps +/* +memory +*/ +// unaligned load +#define v_loadu_f32 _mm_loadu_ps +#define v_storeu_f32 _mm_storeu_ps +#define v_setall_f32(VAL) _mm_set1_ps(VAL) \ No newline at end of file diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index d84c0c221..9836faca1 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_sandy-2.c" #endif - #ifndef HAVE_KERNEL_8 +#include"../simd/intrin.h" -static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; - +#if V_SIMD + v_f32 __alpha, tmp; + __alpha = v_setall_f32(*alpha); + const int vstep = v_nlanes_f32; + for (; i < n; i += vstep) { + tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i ))); + v_storeu_f32(y + i, tmp); + } +#else while(i < n) - { - y[i] += a * x[i]; - y[i+1] += a * x[i+1]; - y[i+2] += a * x[i+2]; - y[i+3] += a * x[i+3]; - y[i+4] += a * x[i+4]; - y[i+5] += a * x[i+5]; - y[i+6] += a * x[i+6]; - y[i+7] += a * x[i+7]; - i+=8 ; - - } - + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + } +#endif } #endif From 06cf73a239ab6cc997bcb29009eb52b28a817cc3 Mon Sep 17 00:00:00 2001 From: y00512012 Date: Tue, 22 Sep 2020 16:47:10 +0800 Subject: [PATCH 456/593] fix a bug of trmm --- driver/level3/trmm_L.c | 48 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index ae8435d03..880de4df4 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -122,6 +122,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -161,9 +164,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } - for(is = min_i; is < min_l; is += GEMM_P){ + for(is = min_i; is < min_l; is += min_i){ min_i = min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = 
(min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -192,6 +198,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = ls; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -231,9 +241,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(gemmcost); } - for(is = min_i; is < ls; is += GEMM_P){ + for(is = min_i; is < ls; is += min_i){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -256,9 +269,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(gemmcost); } - for(is = ls; is < ls + min_l; is += GEMM_P){ + for(is = ls; is < ls + min_l; is += min_i){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -287,6 +303,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -327,9 +347,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(trmmcost); } - for(is = m - min_l + min_i; is < m; is += GEMM_P){ + for(is = m - min_l + min_i; is < m; is += min_i){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + + START_RPCC(); @@ -357,6 +382,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / 
GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -397,9 +426,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(trmmcost); } - for(is = ls - min_l + min_i; is < ls; is += GEMM_P){ + for(is = ls - min_l + min_i; is < ls; is += min_i){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -423,9 +456,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } - for(is = ls; is < m; is += GEMM_P){ + for(is = ls; is < m; is += min_i){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); From 14f7dad3b7d728159bbeab72deb9e7878d108760 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 22 Sep 2020 16:52:15 +0800 Subject: [PATCH 457/593] performance improved --- kernel/simd/intrin.h | 20 ++++++++++++++++++++ kernel/simd/intrin_avx.h | 10 ++++++++++ kernel/simd/intrin_avx512.h | 4 +++- kernel/simd/intrin_sse.h | 11 +++++++++++ kernel/x86_64/daxpy.c | 4 ++-- 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index ef599f065..5997bb6ac 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -1,6 +1,26 @@ #ifndef _INTRIN_H_ #define _INTRIN_H_ +#if defined(_MSC_VER) +#define BLAS_INLINE __inline +#elif defined(__GNUC__) +#if defined(__STRICT_ANSI__) +#define BLAS_INLINE __inline__ +#else +#define BLAS_INLINE inline +#endif +#else +#define BLAS_INLINE +#endif + +#ifdef _MSC_VER +#define BLAS_FINLINE static __forceinline +#elif defined(__GNUC__) +#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline)) +#else +#define BLAS_FINLINE static +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 726254429..f6257ae98 100644 --- a/kernel/simd/intrin_avx.h +++ 
b/kernel/simd/intrin_avx.h @@ -10,6 +10,16 @@ arithmetic */ #define v_add_f32 _mm256_add_ps #define v_mul_f32 _mm256_mul_ps + +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm256_fmadd_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // !HAVE_FMA3 + /* memory */ diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index 775fe7aa5..cb116a9a3 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -10,10 +10,12 @@ arithmetic */ #define v_add_f32 _mm512_add_ps #define v_mul_f32 _mm512_mul_ps +// multiply and add, a*b + c +#define v_muladd_f32 _mm512_fmadd_ps /* memory */ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) -#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR)) +#define v_storeu_f32 _mm512_storeu_ps #define v_setall_f32(VAL) _mm512_set1_ps(VAL) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 0cc159aa7..260112028 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -10,6 +10,17 @@ arithmetic */ #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm_fmadd_ps +#elif defined(HAVE_FMA4) + // multiply and add, a*b + c + #define v_muladd_f32 _mm_macc_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // HAVE_FMA3 /* memory */ diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 9836faca1..b62e3dcb3 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef HAVE_KERNEL_8 #include"../simd/intrin.h" -void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; @@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) __alpha = v_setall_f32(*alpha); const int vstep = v_nlanes_f32; for (; i < n; i += vstep) { - tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i ))); + tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); v_storeu_f32(y + i, tmp); } #else From 5ba01dd1a829c02cf7ccb1b790948570570eca05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 17:26:19 +0200 Subject: [PATCH 458/593] Add an OSX build with xcode12 --- .travis.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.travis.yml b/.travis.yml index 3f8f766fe..482b4f648 100644 --- a/.travis.yml +++ b/.travis.yml @@ -204,6 +204,17 @@ matrix: env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - <<: *test-macos + osx_image: xcode12 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - <<: *test-macos osx_image: xcode10.0 env: From b886bd672b6f7aa97cb0ac8372a1ec1029d64bff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:18:55 +0200 Subject: [PATCH 459/593] add defines for building a subset of types --- common_param.h | 92 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 10 deletions(-) diff --git a/common_param.h b/common_param.h index a52de98ab..a689ddf7d 100644 --- a/common_param.h +++ b/common_param.h @@ -146,26 +146,34 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, 
blasint *, float *); #endif + +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; +#endif int exclusive_cache; +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); +#endif +#ifdef BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); +#endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); @@ -175,6 +183,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#ifdef BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -185,6 +195,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif +#endif +#if defined(BUILD_SINGLE) || 
defined(BUILD_COMPLEX) int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -193,7 +205,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - +#endif +#ifdef BUILD_SINGLE int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -245,10 +258,14 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -257,25 +274,37 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); +#endif +#ifdef BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int 
(*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); +#endif +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +#endif +#ifdef BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -283,7 +312,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - +#endif +#ifdef BUILD_DOUBLE int 
(*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -335,7 +365,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - +#endif #ifdef EXPRECISION int qgemm_p, qgemm_q, qgemm_r; @@ -430,6 +460,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif +#ifdef BUILD_COMPLEX int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; @@ -593,7 +624,9 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#ifdef BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -757,6 +790,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +#endif #ifdef EXPRECISION @@ -930,22 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - +#ifdef BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, 
BLASLONG); +#endif +#ifdef BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); @@ -955,7 +1001,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -965,17 +1013,23 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, 
double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); +#endif +#ifdef BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -985,7 +1039,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -995,12 +1051,20 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); +#endif +#ifdef BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float 
*, BLASLONG, float, float *, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - +#endif } gotoblas_t; extern gotoblas_t *gotoblas; @@ -1021,19 +1085,23 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#if defined (BUILD_DOUBLE) || defined (BUILD_COMPLEX16) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif #define QGEMM_P gotoblas -> qgemm_p #define QGEMM_Q gotoblas -> qgemm_q @@ -1042,19 +1110,23 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn +#ifdef BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#endif +#ifdef BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> 
zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn +#endif #define XGEMM_P gotoblas -> xgemm_p #define XGEMM_Q gotoblas -> xgemm_q From 26611af8e1af43941ac02c642c16a64a37390304 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:20:05 +0200 Subject: [PATCH 460/593] fix grouping of sources used for more than one type --- cmake/lapack.cmake | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 18a74d18e..73f2592ef 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,11 +1,12 @@ # Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f ../INSTALL/ilaver.f xerbla_array.f ../INSTALL/slamch.f) set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f @@ -25,6 +26,7 @@ set(SCLAUX set(DZLAUX dbdsdc.f + dbdsvdx.f dstevx.f dstein.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f @@ -35,14 +37,14 @@ set(DZLAUX dlartg.f dlaruv.f dlas2.f dlascl.f dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f dsteqr.f dsterf.f dlaisnan.f disnan.f dlartgp.f dlartgs.f ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) set(SLASRC - sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbbrd.f sgbcon.f 
sgbequ.f sgbrfs.f sgbsv.f sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f @@ -83,8 +85,8 @@ set(SLASRC ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f - sstevx.f ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f ssyswapr.f ssytrs.f ssytrs2.f @@ -116,7 +118,7 @@ set(SLASRC ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - scombssq.f sgesvdq.f slaorhr_col_getrfnp.f + sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f @@ -229,7 +231,7 @@ set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) set(DLASRC - dbdsvdx.f dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f @@ -270,8 +272,8 @@ set(DLASRC dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f - dstevx.f dsycon.f dsyev.f dsyevd.f dsyevr.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f 
dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f dsysv.f dsysvx.f dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f @@ -474,12 +476,16 @@ endif() if(BUILD_COMPLEX) set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Complex Precision") + message(STATUS "Building Single Precision Complex") endif() if(BUILD_COMPLEX16) set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) - message(STATUS "Building Double Complex Precision") +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") endif() # add lapack-netlib folder to the sources From 3287848c8f45335b9672a3d8cded592451af0d61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:20:51 +0200 Subject: [PATCH 461/593] Support building only selected types --- driver/level2/CMakeLists.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 8fceba905..f72e707e1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -197,6 +197,19 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () +if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + if (USE_THREAD) + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "SINGLE") + endif () +endif () +if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (USE_THREAD) + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "DOUBLE") + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "DOUBLE") + endif () +endif () + if (USE_THREAD) 
GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) endif () From e5e2fbd593f78f6113b0dcee88cb3b63b613e53b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:21:30 +0200 Subject: [PATCH 462/593] Support building only selected types --- driver/level3/CMakeLists.txt | 37 +++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index f788c45b9..46cbb0d6d 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -14,6 +14,24 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif () endforeach () +if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +foreach (GEMM_DEFINE ${GEMM_DEFINES}) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "DOUBLE") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "DOUBLE") + endif() +endforeach() +endif() +if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +foreach (GEMM_DEFINE ${GEMM_DEFINES}) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "SINGLE") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "SINGLE") + endif() +endforeach() +endif() set(TRMM_TRSM_SOURCES trmm_L.c @@ -100,7 +118,24 @@ foreach (float_type ${FLOAT_TYPES}) endif() endif () endforeach () - + + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) + string(TOLOWER ${gemm_define} gemm_define_LC) + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false 
"DOUBLE" ) + endif() + endforeach() + endif () + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) + string(TOLOWER ${gemm_define} gemm_define_LC) + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false "SINGLE" ) + endif() + endforeach() + endif () + # for gemm3m if(USE_GEMM3M) foreach (GEMM_DEFINE ${GEMM_DEFINES}) From 988a6f429e9d16bb27e73a7a8c859d5aa6e04d58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:23:33 +0200 Subject: [PATCH 463/593] Add BUILD_vartype defines --- driver/level3/syrk_thread.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index b26d363c4..753cdb5ca 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -56,12 +56,16 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (!(mode & BLAS_COMPLEX)) { switch (mode & BLAS_PREC) { +#ifdef BUILD_SINGLE case BLAS_SINGLE: mask = SGEMM_UNROLL_MN - 1; break; +#endif +#ifdef BUILD_DOUBLE case BLAS_DOUBLE: mask = DGEMM_UNROLL_MN - 1; break; +#endif #ifdef EXPRECISION case BLAS_XDOUBLE: mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; @@ -70,12 +74,16 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( } } else { switch (mode & BLAS_PREC) { +#ifdef BUILD_COMPLEX case BLAS_SINGLE: mask = CGEMM_UNROLL_MN - 1; break; +#endif +#ifdef BUILD_COMPLEX16 case BLAS_DOUBLE: mask = ZGEMM_UNROLL_MN - 1; break; +#endif #ifdef EXPRECISION case BLAS_XDOUBLE: mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; From 357bff06b5b9ab7f4f1de8084eceb37cdcffa250 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:24:22 +0200 Subject: [PATCH 464/593] Add BUILD_vartype defines --- driver/others/blas_server.c | 11 +++++++++-- driver/others/memory.c | 10 ++++++++-- 2 files 
changed, 17 insertions(+), 4 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 8d3dda3bf..acfaed75d 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -459,13 +459,16 @@ blas_queue_t *tscq; } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { +#ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { +#endif + } else { /* Other types in future */ } } else { @@ -476,11 +479,15 @@ blas_queue_t *tscq; } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } diff --git a/driver/others/memory.c b/driver/others/memory.c index 9b6c226a1..08835ed6d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2201,9 +2201,15 @@ static void *alloc_mmap(void *address){ #endif #endif - +#ifdef BUILD_DOUBLE allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - +#elif defined(BUILD_COMPLEX16) + allocsize = ZGEMM_P * ZGEMM_Q * sizeof(double); +#elif defined(BUILD_COMPLEX) + allocsize = CGEMM_P * CGEMM_Q * sizeof(double); +#else + allocsize = SGEMM_P * SGEMM_Q * sizeof(double); +#endif start = (BLASULONG)map_address; current = (SCALING - 1) * BUFFER_SIZE; From b475b4bd0dbc0f9c750e6a8a31769a47a777f199 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:25:04 +0200 Subject: [PATCH 465/593] Support 
building only a subset of types --- interface/CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 7a8fc6698..ad56c6dba 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -83,8 +83,12 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) #sdsdot, dsdot + if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") +endif () +if (BUILD_DOUBLE) GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") +endif () # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) @@ -167,4 +171,31 @@ if (NOT DEFINED NO_LAPACK) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () +if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "SINGLE") + GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "SINGLE") + GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "SINGLE") + GenerateNamedObjects("rot.c" "" "rot" 0 "" "" false "SINGLE") + GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "SINGLE") + GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "SINGLE") + GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "SINGLE") + GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "SINGLE") + GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "SINGLE") + GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "SINGLE") + GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "SINGLE") +endif () +if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "DOUBLE") + GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "DOUBLE") + GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "DOUBLE") + GenerateNamedObjects("rot.c" 
"" "rot" 0 "" "" false "DOUBLE") + GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "DOUBLE") + GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "DOUBLE") + GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "DOUBLE") + GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "DOUBLE") + GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "DOUBLE") + GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "DOUBLE") + GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "DOUBLE") +endif () + add_library(interface OBJECT ${OPENBLAS_SRC}) From dfbc62ef7e89e448f2a57f3aaf72a11dae61bbd2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:25:59 +0200 Subject: [PATCH 466/593] Support building only a subset of types --- kernel/CMakeLists.txt | 94 +++++++++++- kernel/setparam-ref.c | 345 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 421 insertions(+), 18 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 84dd949a4..c81f2bf25 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,6 +91,59 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SASUMKERNEL}" "" "asum_k" false "" "" false "SINGLE") + if (DEFINED SMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${SMAXKERNEL}" "" "max_k" false "" "" false "SINGLE") + endif () + if (DEFINED SMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${SMINKERNEL}" "USE_MIN" "min_k" false "" "" false "SINGLE") + endif () + if (DEFINED 
ISMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ISMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "SINGLE") + endif () + if (DEFINED ISMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ISMAXKERNEL}" "" "i*max_k" false "" "" false "SINGLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${ISAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${ISAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "" "copy_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SSWAPKERNEL}" "" "swap_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SAXPYKERNEL}" "" "axpy_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") + endif () + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE") + if (DEFINED DMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k" false "" "" false "DOUBLE") + endif () + if (DEFINED DMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "USE_MIN" "min_k" false "" "" false "DOUBLE") + endif () + if (DEFINED IDMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "DOUBLE") + endif () + if (DEFINED IDMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" "i*max_k" false "" "" 
false "DOUBLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") + endif () + # Makefile.L2 GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) @@ -124,7 +177,14 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false ${float_type}) endif () endforeach () - + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE") + endif () + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") + endif () # Makefile.L3 set(USE_TRMM false) if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) 
OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) @@ -159,6 +219,38 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") + if (DGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "${DGEMMITCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "${DGEMMONCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") + endif () + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE") + if (SGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0
1 float_char) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index c43520310..550af86a6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -114,6 +114,7 @@ gotoblas_t TABLE_NAME = { #endif #endif +#if defined( BUILD_SINGLE) || defined(BUILD_COMPLEX) 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, #ifdef SGEMM_DEFAULT_UNROLL_MN @@ -121,7 +122,7 @@ gotoblas_t TABLE_NAME = { #else MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), #endif - +#endif #ifdef HAVE_EXCLUSIVE_CACHE 1, @@ -129,19 +130,34 @@ gotoblas_t TABLE_NAME = { 0, #endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, - dsdot_kTS, - srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, - sgemv_nTS, sgemv_tTS, sger_kTS, + snrm2_kTS, sasum_kTS, +#endif +#ifdef BUILD_SINGLE + ssum_kTS, +#endif + +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) + scopy_kTS, sdot_kTS, +// dsdot_kTS, + srot_kTS, saxpy_kTS, + sscal_kTS, + sswap_kTS, + sgemv_nTS, sgemv_tTS, +#endif +#ifdef BUILD_SINGLE + sger_kTS, ssymv_LTS, ssymv_UTS, #ifdef ARCH_X86_64 sgemm_directTS, sgemm_direct_performantTS, #endif - +#endif + +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, @@ -149,6 +165,9 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif sgemm_oncopyTS, sgemm_otcopyTS, +#endif + +#ifdef BUILD_SINGLE strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, @@ -182,6 +201,9 @@ gotoblas_t TABLE_NAME = { NULL,NULL, #endif +#endif + +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef DGEMM_DEFAULT_UNROLL_MN @@ -189,14 
+211,36 @@ gotoblas_t TABLE_NAME = { #else MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), #endif +#endif + +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, - dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, - drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, - dgemv_nTS, dgemv_tTS, dger_kTS, + dnrm2_kTS, dasum_kTS, +#endif +#if defined (BUILD_DOUBLE) + dsum_kTS, +#endif +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) + dcopy_kTS, ddot_kTS, +#endif +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) + dsdot_kTS, +#endif +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) + drot_kTS, + daxpy_kTS, + dscal_kTS, + dswap_kTS, + dgemv_nTS, dgemv_tTS, +#endif +#if defined (BUILD_DOUBLE) + dger_kTS, dsymv_LTS, dsymv_UTS, +#endif +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, @@ -204,6 +248,9 @@ gotoblas_t TABLE_NAME = { dgemm_oncopyTS, dgemm_otcopyTS, #endif dgemm_oncopyTS, dgemm_otcopyTS, +#endif + +#if defined (BUILD_DOUBLE) dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -237,6 +284,8 @@ gotoblas_t TABLE_NAME = { NULL, NULL, #endif +#endif + #ifdef EXPRECISION 0, 0, 0, @@ -291,6 +340,7 @@ gotoblas_t TABLE_NAME = { #endif +#ifdef BUILD_COMPLEX 0, 0, 0, CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, #ifdef CGEMM_DEFAULT_UNROLL_MN @@ -426,6 +476,9 @@ gotoblas_t TABLE_NAME = { NULL, NULL, #endif +#endif + +#ifdef BUILD_COMPLEX16 0, 0, 0, ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, #ifdef ZGEMM_DEFAULT_UNROLL_MN @@ -560,6 +613,8 @@ gotoblas_t TABLE_NAME = { NULL, NULL, #endif +#endif + #ifdef EXPRECISION 0, 0, 0, @@ -691,25 +746,61 @@ gotoblas_t TABLE_NAME = { init_parameter, SNUMOPT, 
DNUMOPT, QNUMOPT, +#ifdef BUILD_SINGLE + saxpby_kTS, +#endif +#ifdef BUILD_DOUBLE + daxpby_kTS, +#endif +#ifdef BUILD_COMPLEX + caxpby_kTS, +#endif +#ifdef BUILD_COMPLEX16 + zaxpby_kTS, +#endif - saxpby_kTS, daxpby_kTS, caxpby_kTS, zaxpby_kTS, - +#ifdef BUILD_SINGLE somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS, +#endif +#ifdef BUILD_DOUBLE domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS, +#endif +#ifdef BUILD_COMPLEX comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, +#endif +#ifdef BUILD_COMPLEX16 zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, +#endif +#ifdef BUILD_SINGLE simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS, +#endif +#ifdef BUILD_DOUBLE dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS, +#endif +#ifdef BUILD_COMPLEX cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS, cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS, +#endif +#ifdef BUILD_COMPLEX16 zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS, zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS, +#endif - sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS - +#ifdef BUILD_SINGLE + sgeadd_kTS, +#endif +#ifdef BUILD_DOUBLE + dgeadd_kTS, +#endif +#ifdef BUILD_COMPLEX + cgeadd_kTS, +#endif +#ifdef BUILD_COMPLEX16 + zgeadd_kTS +#endif }; #if defined(ARCH_ARM64) @@ -717,26 +808,50 @@ static void init_parameter(void) { #if defined(BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; #endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef 
BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #if defined(BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#endif #if defined(BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; @@ -989,22 +1104,34 @@ static void init_parameter(void) { TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX #ifdef CGEMM3M_DEFAULT_Q TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; #else TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q; #endif +#endif +#ifdef BUILD_COMPLEX16 #ifdef ZGEMM3M_DEFAULT_Q TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; #else TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q; #endif +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; @@ -1018,10 +1145,18 @@ static void init_parameter(void) { fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 64 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 32 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 32 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 16 * (l2 >> 7); 
+#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 16 * (l2 >> 7); TABLE_NAME.xgemm_p = 8 * (l2 >> 7); @@ -1034,10 +1169,18 @@ static void init_parameter(void) { fprintf(stderr, "Northwood\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 96 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 48 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 48 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 24 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 24 * (l2 >> 7); TABLE_NAME.xgemm_p = 12 * (l2 >> 7); @@ -1050,10 +1193,18 @@ static void init_parameter(void) { fprintf(stderr, "Atom\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 256; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 128; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 128; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 64; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 64; TABLE_NAME.xgemm_p = 32; @@ -1066,10 +1217,18 @@ static void init_parameter(void) { fprintf(stderr, "Prescott\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 56 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 14 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 14 * (l2 >> 7); TABLE_NAME.xgemm_p = 7 * (l2 >> 7); @@ -1082,10 +1241,18 @@ static void init_parameter(void) { fprintf(stderr, "Core2\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 92 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 46 * 
(l2 >> 9) + 4; @@ -1098,10 +1265,18 @@ static void init_parameter(void) { fprintf(stderr, "Penryn\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; @@ -1114,10 +1289,18 @@ static void init_parameter(void) { fprintf(stderr, "Dunnington\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; @@ -1131,10 +1314,18 @@ static void init_parameter(void) { fprintf(stderr, "Nehalem\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1147,10 +1338,18 @@ static void init_parameter(void) { fprintf(stderr, "Sandybridge\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 
QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1163,10 +1362,18 @@ static void init_parameter(void) { fprintf(stderr, "Haswell\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1179,10 +1386,18 @@ static void init_parameter(void) { fprintf(stderr, "SkylakeX\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1196,10 +1411,18 @@ static void init_parameter(void) { fprintf(stderr, "Opteron\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7); TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7); @@ -1212,10 +1435,18 @@ static void init_parameter(void) { fprintf(stderr, "Barcelona\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION 
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1228,10 +1459,18 @@ static void init_parameter(void) { fprintf(stderr, "Bobcate\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1244,10 +1483,18 @@ static void init_parameter(void) { fprintf(stderr, "Bulldozer\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1260,10 +1507,18 @@ static void init_parameter(void) { fprintf(stderr, "Excavator\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1277,10 +1532,18 @@ static void init_parameter(void) { fprintf(stderr, "Piledriver\n"); #endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION 
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1293,10 +1556,18 @@ static void init_parameter(void) { fprintf(stderr, "Steamroller\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1309,10 +1580,18 @@ static void init_parameter(void) { fprintf(stderr, "Zen\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1326,11 +1605,18 @@ static void init_parameter(void) { fprintf(stderr, "NANO\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; - +#endif #ifdef EXPRECISION @@ -1340,41 +1626,55 @@ static void init_parameter(void) { #endif - +#ifdef BUILD_COMPLEX #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; #endif +#endif +#ifdef BUILD_COMPLEX16 #ifdef ZGEMM3M_DEFAULT_P TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; #else TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; #endif +#endif #ifdef EXPRECISION TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; #endif - +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + 
SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_COMPLEX #ifdef CGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M; #else TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; #endif +#endif +#ifdef BUILD_COMPLEX16 #ifdef ZGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M; #else TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; #endif +#endif #ifdef QUAD_PRECISION TABLE_NAME.qgemm_p = ((TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M; @@ -1386,15 +1686,19 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. 
%d\n", l2, TABLE_NAME.dgemm_p); #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_r = (((BUFFER_SIZE - @@ -1403,26 +1707,33 @@ static void init_parameter(void) { ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15); - +#endif From 0eaae30e8c0b9f80426a0557de774680b0e4ab5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:28:03 +0200 Subject: [PATCH 467/593] Adapt tests to having only a subset of types in the build --- test/CMakeLists.txt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f1f773cba..360ff2151 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ 
-4,7 +4,7 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) if (BUILD_SINGLE) - list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) + list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) endif() if (BUILD_DOUBLE) list (APPEND OpenBLAS_Tests dblat1 dblat2 dblat3) @@ -17,7 +17,7 @@ if (BUILD_COMPLEX16) endif() foreach(test_bin ${OpenBLAS_Tests}) -add_executable(${test_bin} ${test_bin}.f) + add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() @@ -34,7 +34,19 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "fi\n" ) -set(float_types s d c z) +#set(float_types s d c z) +if (BUILD_SINGLE) + list (APPEND float_types s) +endif() +if (BUILD_DOUBLE) + list (APPEND float_types d) +endif() +if (BUILD_COMPLEX) + list (APPEND float_types c) +endif() +if (BUILD_COMPLEX16) + list (APPEND float_types z) +endif() foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME "${float_type}blas1" From 98153875e94c4c33d9cc4583711130cf8e23b8d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:28:57 +0200 Subject: [PATCH 468/593] Adapt tests to having only a subset of types in the library --- utest/test_potrs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 2681615f4..f39287d6f 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -529,16 +529,20 @@ CTEST(potrf, smoketest_trivial){ for (j = 0; j < n; ++j) { double err; +#ifdef BUILD_SINGLE err = fabs(A1s[i+n*j] - Bs[i+n*j]); if (err > 1e-5) { CTEST_ERR("%s:%d %c s(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } - +#endif +#ifdef BUILD_DOUBLE err = fabs(A1d[i+n*j] - Bd[i+n*j]); if (err > 1e-12) { CTEST_ERR("%s:%d %c d(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } +#endif +#ifdef BUILD_COMPLEX #ifdef OPENBLAS_COMPLEX_C99 err = cabsf(A1c[i+n*j] - Bc[i+n*j]); #else @@ -548,7 +552,9 @@ 
CTEST(potrf, smoketest_trivial){ if (err > 1e-5) { CTEST_ERR("%s:%d %c c(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } +#endif +#ifdef BUILD_COMPLEX16 #ifdef OPENBLAS_COMPLEX_C99 err = cabs(A1z[i+n*j] - Bz[i+n*j]); #else @@ -558,6 +564,7 @@ CTEST(potrf, smoketest_trivial){ if (err > 1e-12) { CTEST_ERR("%s:%d %c z(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } +#endif } } } From f2e9a24e1a6da1eb3c297e979ac23f47d3685b07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Sep 2020 19:02:20 +0200 Subject: [PATCH 469/593] Add AWS Graviton2 build --- .travis.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.travis.yml b/.travis.yml index 482b4f648..4bfdf485c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -233,6 +233,21 @@ matrix: - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" + + - &test-graviton2 + os: linux + arch: arm64-graviton2 + dist: focal + group: edge + virt: lxd + compiler: gcc + addons: + apt: + packages: + - gfortran + script: + - travis_wait 45 make && make lapack-test + # whitelist branches: only: From c5a32288c6058223ada420a9e25a4533cf9475bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Sep 2020 23:24:37 +0200 Subject: [PATCH 470/593] Work around sgemm_r/dgemm_r not being properly defined with BUILD_COMPLEX/BUILD_COMPLEX16 --- common_param.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/common_param.h b/common_param.h index a689ddf7d..b6abc4e74 100644 --- a/common_param.h +++ b/common_param.h @@ -189,14 +189,14 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float 
*, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif -#endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -1085,7 +1085,7 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if defined (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -1094,7 +1094,7 @@ extern gotoblas_t *gotoblas; #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #endif -#if defined (BUILD_DOUBLE) || defined (BUILD_COMPLEX16) +#if defined (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r @@ -1117,6 +1117,14 @@ extern gotoblas_t *gotoblas; #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#ifndef BUILD_SINGLE +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R 1024 +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif #endif #ifdef BUILD_COMPLEX16 @@ -1126,6 +1134,14 @@ extern gotoblas_t *gotoblas; #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> 
zgemm_unroll_mn +#ifndef BUILD_DOUBLE +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R 1024 +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif #endif #define XGEMM_P gotoblas -> xgemm_p From 896bbd55e19aa628fb1438333d1376b27c0bcd65 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Sep 2020 23:25:55 +0200 Subject: [PATCH 471/593] Add support for building only selected variable types --- driver/others/blas_server_omp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index d126955e4..bdb5ebfd2 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -300,12 +300,15 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#if defined ( BUILD_DOUBLE) || defined (BUILD_COMPLEX16) sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE){ +#if defined (BUILD_SINGLE) || defined (BUILD_COMPLEX) sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } @@ -317,15 +320,24 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#else +fprintf(stderr,"UNHANDLED COMPLEX16\n"); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#else 
+fprintf(stderr,"UNHANDLED COMPLEX\n"); +#endif } else { /* Other types in future */ } } +if (!sb) fprintf(stderr,"SB not declared!!!\n"); queue->sb=sb; } } From 881c15179f93c96d9567ef74dceef1dfdbd5ccfa Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Sun, 27 Sep 2020 09:35:50 +0800 Subject: [PATCH 472/593] remove default support for FMA4 on zen architect --- getarch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/getarch.c b/getarch.c index 83043bdf2..e2c22d3a0 100644 --- a/getarch.c +++ b/getarch.c @@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ - "-DHAVE_AVX -DHAVE_FMA4" + "-DHAVE_AVX" #define LIBNAME "bulldozer" #define CORENAME "BULLDOZER" #endif @@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "piledriver" #define CORENAME "PILEDRIVER" #endif @@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "steamroller" #define CORENAME "STEAMROLLER" #endif @@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "excavator" #define CORENAME "EXCAVATOR" #endif From 7f539fb850a89b216c2d95aa48c9c36236c56767 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Sep 2020 22:48:41 +0200 Subject: [PATCH 473/593] Update cpu list, outline cmake build, clarify scope of set_num_threads extension --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f8226f5cb..6d44129c2 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,10 @@ Building OpenBLAS requires the following to be installed: Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. -The full target list is in the file `TargetList.txt`. +The full target list is in the file `TargetList.txt`. For building with `cmake`, the +usual conventions apply, i.e. create a build directory either underneath the toplevel +OpenBLAS source directory or separate from it, and invoke `cmake` there with the path +to the source tree and any build options you plan to set. 
### Cross compile @@ -152,13 +155,17 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Falkor**: same as A57 (different cpu specifications) - **ThunderX**: Optimized some Level-1 functions - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 +- **ThunderX3T110** - **TSV110**: Optimized some Level-3 helper functions - **EMAG 8180**: preliminary support based on A57 +- **Neoverse N1**: (AWS Graviton2) preliminary support +- **Apple Vortex**: preliminary support based on ARMV8 #### PPC/PPC64 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. +- **POWER10**: #### IBM zEnterprise System @@ -226,7 +233,8 @@ We provide the following functions to control the number of threads at runtime: void goto_set_num_threads(int num_threads); void openblas_set_num_threads(int num_threads); ``` - +Note that these are only used once at library initialization, and are not available for +fine-tuning thread numbers in individual BLAS calls. If you compile this library with `USE_OPENMP=1`, you should use the above functions too. ## Reporting bugs From 7ed25e9e1010faa94a04d694080f982ed9e60b53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Sep 2020 22:59:20 +0200 Subject: [PATCH 474/593] FIx underflow/rounding errors in LAPACK (S,D)LANV2 Reference-LAPACK PR 445, fixing their issue 263 --- lapack-netlib/SRC/dlanv2.f | 28 ++++++++++++++++++++++++++-- lapack-netlib/SRC/slanv2.f | 28 ++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index d68481f7e..61b016f16 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -140,13 +140,16 @@ * * .. Parameters .. 
DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0 ) + PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, + $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL PARAMETER ( MULTPL = 4.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, - $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z + $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, + $ SAFMN2, SAFMX2 + INTEGER COUNT * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2 @@ -157,7 +160,11 @@ * .. * .. Executable Statements .. * + SAFMIN = DLAMCH( 'S' ) EPS = DLAMCH( 'P' ) + SAFMN2 = DLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) / + $ LOG( DLAMCH( 'B' ) ) / TWO ) + SAFMX2 = ONE / SAFMN2 IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO @@ -212,7 +219,24 @@ * Complex eigenvalues, or real (almost) equal eigenvalues. * Make diagonal elements equal. * + COUNT = 0 SIGMA = B + C + 10 CONTINUE + COUNT = COUNT + 1 + SCALE = MAX( ABS(TEMP), ABS(SIGMA) ) + IF( SCALE.GE.SAFMX2 ) THEN + SIGMA = SIGMA * SAFMN2 + TEMP = TEMP * SAFMN2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + IF( SCALE.LE.SAFMN2 ) THEN + SIGMA = SIGMA * SAFMX2 + TEMP = TEMP * SAFMX2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + P = HALF*TEMP TAU = DLAPY2( SIGMA, TEMP ) CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) ) SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA ) diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index 1163446fa..e678305f2 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -140,13 +140,16 @@ * * .. Parameters .. REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0 ) + PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, + $ TWO = 2.0E+0 ) REAL MULTPL PARAMETER ( MULTPL = 4.0E+0 ) * .. * .. Local Scalars .. REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, - $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z + $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, + $ SAFMN2, SAFMX2 + INTEGER COUNT * .. * .. External Functions .. 
REAL SLAMCH, SLAPY2 @@ -157,7 +160,11 @@ * .. * .. Executable Statements .. * + SAFMIN = SLAMCH( 'S' ) EPS = SLAMCH( 'P' ) + SAFMN2 = SLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) / + $ LOG( SLAMCH( 'B' ) ) / TWO ) + SAFMX2 = ONE / SAFMN2 IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO @@ -212,7 +219,24 @@ * Complex eigenvalues, or real (almost) equal eigenvalues. * Make diagonal elements equal. * + COUNT = 0 SIGMA = B + C + 10 CONTINUE + COUNT = COUNT + 1 + SCALE = MAX( ABS(TEMP), ABS(SIGMA) ) + IF( SCALE.GE.SAFMX2 ) THEN + SIGMA = SIGMA * SAFMN2 + TEMP = TEMP * SAFMN2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + IF( SCALE.LE.SAFMN2 ) THEN + SIGMA = SIGMA * SAFMX2 + TEMP = TEMP * SAFMX2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + P = HALF*TEMP TAU = SLAPY2( SIGMA, TEMP ) CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) ) SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA ) From fe8cd5ae7e0958cced30e7086509d286a8442be0 Mon Sep 17 00:00:00 2001 From: Thomas Hisch Date: Mon, 28 Sep 2020 00:42:17 +0200 Subject: [PATCH 475/593] Consolidate usage of backticks for build options There were some build options in the README that were not highlighted. Now all are highlighted. --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6d44129c2..ca034e747 100644 --- a/README.md +++ b/README.md @@ -174,18 +174,18 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ### Support for multiple targets in a single library -OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake. +OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
-For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default. +For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally, there is an option `DYNAMIC_LIST` that allows specifying an individual list of targets to include instead of the default. -DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, +`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
-The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the +The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. From 2df4235e00a73ad61b7997c74497fd86eb278ebf Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 27 Sep 2020 21:42:32 -0500 Subject: [PATCH 476/593] Optimize dcopy/zcopy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. Tested in simulator and no new failures. --- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/dcopy_microk_power10.c | 134 ++++++++++++++++++++++++++++ kernel/power/dcopy_power10.c | 123 +++++++++++++++++++++++++ kernel/power/zcopy_microk_power10.c | 134 ++++++++++++++++++++++++++++ kernel/power/zcopy_power10.c | 132 +++++++++++++++++++++++++++ 5 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dcopy_microk_power10.c create mode 100644 kernel/power/dcopy_power10.c create mode 100644 kernel/power/zcopy_microk_power10.c create mode 100644 kernel/power/zcopy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ec02e09ad..d0cda7fb6 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -151,9 +151,9 @@ endif ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy.c -DCOPYKERNEL = dcopy.c +DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy.c -ZCOPYKERNEL = zcopy.c +ZCOPYKERNEL = zcopy_power10.c # SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/power/dcopy_microk_power10.c b/kernel/power/dcopy_microk_power10.c new file mode 100644 index 000000000..8940e0db9 --- /dev/null +++ 
b/kernel/power/dcopy_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_64 1 + +static void dcopy_kernel_64 (long n, double *x, double *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" + "lxvp 54, 352(%2) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" + "lxvp 62, 480(%2) \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "stxvp 34, 32(%3) \n\t" + "lxvp 34, 32(%2) \n\t" + "stxvp 36, 64(%3) \n\t" + "lxvp 36, 64(%2) \n\t" + "stxvp 38, 96(%3) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxvp 40, 128(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "stxvp 42, 160(%3) \n\t" + "lxvp 42, 160(%2) \n\t" + "stxvp 44, 192(%3) \n\t" + "lxvp 44, 192(%2) \n\t" + "stxvp 46, 224(%3) \n\t" + "lxvp 46, 224(%2) \n\t" + + "stxvp 48, 256(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "stxvp 50, 288(%3) \n\t" + "lxvp 50, 288(%2) \n\t" + "stxvp 52, 320(%3) \n\t" + "lxvp 52, 320(%2) \n\t" + "stxvp 54, 352(%3) \n\t" + "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "stxvp 58, 416(%3) \n\t" + "lxvp 58, 416(%2) \n\t" + "stxvp 60, 448(%3) \n\t" + "lxvp 60, 448(%2) \n\t" + "stxvp 62, 480(%3) \n\t" + "lxvp 62, 480(%2) \n\t" + + "addi %3, %3, 512 \n\t" + "addi %2, %2, 512 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + "stxvp 48, 256(%3) \n\t" + "stxvp 50, 288(%3) \n\t" + "stxvp 52, 320(%3) \n\t" + "stxvp 54, 352(%3) \n\t" + "stxvp 56, 384(%3) \n\t" + "stxvp 58, 416(%3) \n\t" + "stxvp 60, 448(%3) \n\t" + "stxvp 62, 480(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c new file mode 100644 index 000000000..32530d570 --- /dev/null +++ b/kernel/power/dcopy_power10.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "dcopy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_64 + +static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dcopy_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zcopy_microk_power10.c b/kernel/power/zcopy_microk_power10.c new file mode 100644 index 000000000..f2f2119a3 --- /dev/null +++ b/kernel/power/zcopy_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void zcopy_kernel_32 (long n, double *x, double *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" + "lxvp 54, 352(%2) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" + "lxvp 62, 480(%2) \n\t" + "addi %2, %2, 512 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "stxvp 34, 32(%3) \n\t" + "lxvp 34, 32(%2) \n\t" + "stxvp 36, 64(%3) \n\t" + "lxvp 36, 64(%2) \n\t" + "stxvp 38, 96(%3) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxvp 40, 128(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "stxvp 42, 160(%3) \n\t" + "lxvp 42, 160(%2) \n\t" + "stxvp 44, 192(%3) \n\t" + "lxvp 44, 192(%2) \n\t" + "stxvp 46, 224(%3) \n\t" + "lxvp 46, 224(%2) \n\t" + + "stxvp 48, 256(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "stxvp 50, 288(%3) \n\t" + "lxvp 50, 288(%2) \n\t" + "stxvp 52, 320(%3) \n\t" + "lxvp 52, 320(%2) \n\t" + "stxvp 54, 352(%3) \n\t" + "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "stxvp 58, 416(%3) \n\t" + "lxvp 58, 416(%2) \n\t" + "stxvp 60, 448(%3) \n\t" + "lxvp 60, 448(%2) \n\t" + "stxvp 62, 480(%3) \n\t" + "lxvp 62, 480(%2) \n\t" + + "addi %3, %3, 512 \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + "stxvp 48, 256(%3) \n\t" + "stxvp 50, 288(%3) \n\t" + "stxvp 52, 320(%3) \n\t" + "stxvp 54, 352(%3) \n\t" + "stxvp 56, 384(%3) \n\t" + "stxvp 58, 416(%3) \n\t" + "stxvp 60, 448(%3) \n\t" + "stxvp 62, 480(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/zcopy_power10.c b/kernel/power/zcopy_power10.c new file mode 100644 index 000000000..99d463b02 --- /dev/null +++ b/kernel/power/zcopy_power10.c 
@@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "zcopy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zcopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + From 1b1a757f5f389b9496f016defaecccb63c415fa6 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 28 Sep 2020 20:36:53 +0800 Subject: [PATCH 477/593] Optimize the performance of dot by using universal intrinsics in X86/ARM --- kernel/generic/dot.c | 50 ++++++++++++++++++++++++++++++------- kernel/simd/intrin.h | 9 +++++++ kernel/simd/intrin_avx.h | 32 ++++++++++++++++-------- kernel/simd/intrin_avx512.h | 32 +++++++++++++++++------- kernel/simd/intrin_neon.h | 42 +++++++++++++++++++++++++++++++ kernel/simd/intrin_sse.h | 36 ++++++++++++++++++-------- utest/test_dsdot.c | 14 +++++++++++ 7 files changed, 177 insertions(+), 38 deletions(-) create mode 100644 kernel/simd/intrin_neon.h diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index bc07bc78f..f1ea6b264 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,27 +47,59 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; - - while(i < n1) +#if V_SIMD && !defined(DSDOT) + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + v_f32 vsum0 = v_zero_f32(); + v_f32 vsum1 = v_zero_f32(); + v_f32 vsum2 = 
v_zero_f32(); + v_f32 vsum3 = v_zero_f32(); + while(i < unrollx4) + { + vsum0 = v_muladd_f32( + v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0 + ); + vsum1 = v_muladd_f32( + v_loadu_f32(x + i + vstep), v_loadu_f32(y + i + vstep), vsum1 + ); + vsum2 = v_muladd_f32( + v_loadu_f32(x + i + vstep*2), v_loadu_f32(y + i + vstep*2), vsum2 + ); + vsum3 = v_muladd_f32( + v_loadu_f32(x + i + vstep*3), v_loadu_f32(y + i + vstep*3), vsum3 + ); + i += vstep*4; + } + vsum0 = v_add_f32( + v_add_f32(vsum0, vsum1), v_add_f32(vsum2 , vsum3) + ); + while(i < unrollx) + { + vsum0 = v_muladd_f32( + v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0 + ); + i += vstep; + } + dot = v_sum_f32(vsum0); +#elif defined(DSDOT) + for (; i < n1; i += 4) { - -#if defined(DSDOT) dot += (double) y[i] * (double) x[i] + (double) y[i+1] * (double) x[i+1] + (double) y[i+2] * (double) x[i+2] + (double) y[i+3] * (double) x[i+3] ; + } #else + for (; i < n1; i += 4) + { dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] ; -#endif - i+=4 ; - } - +#endif while(i < n) { diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index 5997bb6ac..ef8fcb865 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -51,6 +51,11 @@ extern "C" { #include #endif +/** NEON **/ +#ifdef HAVE_NEON +#include +#endif + // distribute #if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16) #include "intrin_avx512.h" @@ -60,6 +65,10 @@ extern "C" { #include "intrin_sse.h" #endif +#ifdef HAVE_NEON +#include "intrin_neon.h" +#endif + #ifndef V_SIMD #define V_SIMD 0 #define V_SIMD_F64 0 diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index f6257ae98..f36a3dbf0 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -1,13 +1,13 @@ #define V_SIMD 256 #define V_SIMD_F64 1 -/* -Data Type -*/ +/*************************** + * Data Type + ***************************/ typedef __m256 v_f32; #define v_nlanes_f32 8 -/* -arithmetic -*/ +/*************************** + * Arithmetic + 
***************************/ #define v_add_f32 _mm256_add_ps #define v_mul_f32 _mm256_mul_ps @@ -20,10 +20,22 @@ arithmetic { return v_add_f32(v_mul_f32(a, b), c); } #endif // !HAVE_FMA3 -/* -memory -*/ +// Horizontal add: Calculates the sum of all vector elements. +BLAS_FINLINE float v_sum_f32(__m256 a) +{ + __m256 sum_halves = _mm256_hadd_ps(a, a); + sum_halves = _mm256_hadd_ps(sum_halves, sum_halves); + __m128 lo = _mm256_castps256_ps128(sum_halves); + __m128 hi = _mm256_extractf128_ps(sum_halves, 1); + __m128 sum = _mm_add_ps(lo, hi); + return _mm_cvtss_f32(sum); +} + +/*************************** + * memory + ***************************/ // unaligned load #define v_loadu_f32 _mm256_loadu_ps #define v_storeu_f32 _mm256_storeu_ps -#define v_setall_f32(VAL) _mm256_set1_ps(VAL) \ No newline at end of file +#define v_setall_f32(VAL) _mm256_set1_ps(VAL) +#define v_zero_f32 _mm256_setzero_ps \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index cb116a9a3..70e5f72e3 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -1,21 +1,35 @@ #define V_SIMD 512 #define V_SIMD_F64 1 -/* -Data Type -*/ +/*************************** + * Data Type + ***************************/ typedef __m512 v_f32; #define v_nlanes_f32 16 -/* -arithmetic -*/ +/*************************** + * Arithmetic + ***************************/ #define v_add_f32 _mm512_add_ps #define v_mul_f32 _mm512_mul_ps // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps -/* -memory -*/ + +BLAS_FINLINE float v_sum_f32(v_f32 a) +{ + __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 sum32 = _mm512_add_ps(a, h64); + __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum16 = _mm512_add_ps(sum32, h32); + __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum8 = _mm512_add_ps(sum16, h16); + __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); + __m512 
sum4 = _mm512_add_ps(sum8, h4); + return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); +} +/*************************** + * memory + ***************************/ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) #define v_storeu_f32 _mm512_storeu_ps #define v_setall_f32(VAL) _mm512_set1_ps(VAL) +#define v_zero_f32 _mm512_setzero_ps diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h new file mode 100644 index 000000000..5875c0e4e --- /dev/null +++ b/kernel/simd/intrin_neon.h @@ -0,0 +1,42 @@ +#define V_SIMD 128 +#ifdef __aarch64__ + #define V_SIMD_F64 1 +#else + #define V_SIMD_F64 0 +#endif +/*************************** + * Data Type + ***************************/ +typedef float32x4_t v_f32; +#define v_nlanes_f32 4 +/*************************** + * Arithmetic + ***************************/ +#define v_add_f32 vaddq_f32 +#define v_mul_f32 vmulq_f32 + +// FUSED F32 +#ifdef HAVE_VFPV4 // FMA + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return vfmaq_f32(c, a, b); } +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(c, a, b); } +#endif + +// Horizontal add: Calculates the sum of all vector elements. 
+BLAS_FINLINE float v_sum_f32(float32x4_t a) +{ + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); +} +/*************************** + * memory + ***************************/ +// unaligned load +#define v_loadu_f32(a) vld1q_f32((const float*)a) +#define v_storeu_f32 vst1q_f32 +#define v_setall_f32(VAL) vdupq_n_f32(VAL) +#define v_zero_f32() vdupq_n_f32(0.0f) \ No newline at end of file diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 260112028..9de7e1b27 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -1,13 +1,13 @@ #define V_SIMD 128 #define V_SIMD_F64 1 -/* -Data Type -*/ +/*************************** + * Data Type + ***************************/ typedef __m128 v_f32; #define v_nlanes_f32 4 -/* -arithmetic -*/ +/*************************** + * Arithmetic + ***************************/ #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps #ifdef HAVE_FMA3 @@ -21,10 +21,26 @@ arithmetic BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 -/* -memory -*/ + +// Horizontal add: Calculates the sum of all vector elements. 
+BLAS_FINLINE float v_sum_f32(__m128 a) +{ +#ifdef HAVE_SSE3 + __m128 sum_halves = _mm_hadd_ps(a, a); + return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); +#else + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +#endif +} +/*************************** + * memory + ***************************/ // unaligned load #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps -#define v_setall_f32(VAL) _mm_set1_ps(VAL) \ No newline at end of file +#define v_setall_f32(VAL) _mm_set1_ps(VAL) +#define v_zero_f32 _mm_setzero_ps \ No newline at end of file diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c index d58b398a8..57da7101e 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -47,3 +47,17 @@ CTEST(dsdot,dsdot_n_1) ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); } + +CTEST(dsdot,dsdot_n_2) +{ + float x[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; + float y[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; + blasint incx=1; + blasint incy=1; + blasint n=8; + + double res1=0.0f, res2= 2.0400000444054616; + + res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); + ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); +} \ No newline at end of file From 60e6c68e3811ae9b7b3bead134507e10fa31aed9 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 29 Sep 2020 16:36:14 +0800 Subject: [PATCH 478/593] Adapt ARM architect --- kernel/arm64/KERNEL.ARMV8 | 2 +- kernel/arm64/KERNEL.CORTEXA53 | 2 +- kernel/arm64/KERNEL.CORTEXA57 | 2 +- kernel/generic/dot.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index fe32d3137..603e47d87 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S -SDOTKERNEL = dot.S +SDOTKERNEL = ../generic/dot.c CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S 
DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index eba38a92e..e23133e52 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S -SDOTKERNEL = dot.S +SDOTKERNEL = ../generic/dot.c CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 04d6940d7..dcf2383a9 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,7 +70,7 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S -SDOTKERNEL = dot.S +SDOTKERNEL = ../generic/dot.c DDOTKERNEL = dot.S CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index f1ea6b264..5abbb735c 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" - +#include "../simd/intrin.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else @@ -47,9 +47,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; + int n1 = n & -4; #if V_SIMD && !defined(DSDOT) - const int vstep = v_nlanes_f32; + const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f32 vsum0 = v_zero_f32(); From 2bf70c8e3b72f560ab35320ed12df9ac92f9b46c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:43:25 +0200 Subject: [PATCH 479/593] Change ifdef linux to __linux for C11 compatibility --- cpuid_arm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpuid_arm.c b/cpuid_arm.c index 19aa90718..a3b1dfd33 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -54,7 +54,7 @@ static char *cpuname_lower[] = { int get_feature(char *search) { 
-#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -90,7 +90,7 @@ int get_feature(char *search) int detect(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; @@ -289,7 +289,7 @@ void get_libname(void) void get_features(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From be40440ec59e8ac16b7c63d62ab743845073d2ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:45:18 +0200 Subject: [PATCH 480/593] Change ifdef linux to __linux for C11 compatibility --- cpuid_arm64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a0d3e15b9..ae150ef1b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -90,7 +90,7 @@ static char *cpuname_lower[] = { int get_feature(char *search) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -126,7 +126,7 @@ int get_feature(char *search) int detect(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; @@ -242,7 +242,7 @@ void get_cpucount(void) { int n=0; -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -441,7 +441,7 @@ void get_libname(void) void get_features(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From a7d5d0078dd3d5a0c5d1aff9f3723d6799bd0410 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:46:25 +0200 Subject: [PATCH 481/593] Change ifdef linux to __linux for C11 compatibility --- cpuid_mips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 3a2e12393..e6e837f73 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -84,7 +84,7 @@ static char *cpuname[] = { int detect(void){ -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; From 
0b2bb5696af3c7abb0b0d5038124eb4a5f883fbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:47:25 +0200 Subject: [PATCH 482/593] Change ifdef linux to __linux for C11 compatibility --- cpuid_mips64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0e32bfc0b..0c19ac1e7 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -90,7 +90,7 @@ static char *cpuname[] = { int detect(void){ -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; From e1574cbc83a691f2f0ff898c9976e1f5861d9686 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:50:21 +0200 Subject: [PATCH 483/593] Change ifdef linux to __linux for C11 compatibility and add a fallback for unsupported operating systems in detect() --- cpuid_power.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index b17493bc8..2526e8d0e 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -104,7 +104,7 @@ char *corename[] = { int detect(void){ -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; @@ -214,6 +214,8 @@ switch ( id >> 16 ) { return CPUTYPE_UNKNOWN; } #endif + + return CPUTYPE_UNKNOWN; } void get_architecture(void){ From 5464eb13ea362012047d98dd7c6ecd33ca58b27b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:59:41 +0200 Subject: [PATCH 484/593] Change ifdef linux to __linux for C11 compatibility --- benchmark/amax.c | 2 +- benchmark/amin.c | 2 +- benchmark/asum.c | 2 +- benchmark/axpby.c | 2 +- benchmark/axpy.c | 2 +- benchmark/copy.c | 2 +- benchmark/dot.c | 2 +- benchmark/geev.c | 2 +- benchmark/gemm.c | 2 +- benchmark/gemm3m.c | 2 +- benchmark/gemv.c | 2 +- benchmark/ger.c | 2 +- benchmark/gesv.c | 2 +- benchmark/getri.c | 2 +- benchmark/hbmv.c | 2 +- benchmark/hemm.c | 2 +- benchmark/hemv.c | 2 +- benchmark/her.c | 2 +- benchmark/her2.c | 2 +- benchmark/her2k.c | 2 +- benchmark/herk.c | 2 +- benchmark/hpmv.c | 2 +- 
benchmark/iamax.c | 2 +- benchmark/iamin.c | 2 +- benchmark/imax.c | 2 +- benchmark/imin.c | 2 +- benchmark/linpack.c | 2 +- benchmark/max.c | 2 +- benchmark/min.c | 2 +- benchmark/nrm2.c | 2 +- benchmark/rot.c | 2 +- benchmark/rotm.c | 2 +- benchmark/scal.c | 2 +- benchmark/spmv.c | 2 +- benchmark/spr.c | 2 +- benchmark/spr2.c | 2 +- benchmark/swap.c | 2 +- benchmark/symm.c | 2 +- benchmark/symv.c | 2 +- benchmark/syr.c | 2 +- benchmark/syr2.c | 2 +- benchmark/syr2k.c | 2 +- benchmark/syrk.c | 2 +- benchmark/tpmv.c | 2 +- benchmark/tpsv.c | 2 +- benchmark/trmm.c | 2 +- benchmark/trmv.c | 2 +- benchmark/trsm.c | 2 +- benchmark/trsv.c | 2 +- benchmark/zdot-intel.c | 2 +- benchmark/zdot.c | 2 +- 51 files changed, 51 insertions(+), 51 deletions(-) diff --git a/benchmark/amax.c b/benchmark/amax.c index 32f55ce83..19ae95c8b 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -146,7 +146,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/amin.c b/benchmark/amin.c index 218f0ea9f..d0cadbd3b 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/asum.c b/benchmark/asum.c index e3d16acfd..bcccd9089 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -152,7 +152,7 @@ int main(int argc, char *argv[]){ } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 3b3dd9979..793ee7e40 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -152,7 +152,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/axpy.c b/benchmark/axpy.c index e40f93c70..760703c1d 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -151,7 
+151,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/copy.c b/benchmark/copy.c index d7f58c94f..eb5148fff 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -154,7 +154,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/dot.c b/benchmark/dot.c index 50d05e532..aae3c04b0 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/geev.c b/benchmark/geev.c index ef9271220..4fd2c8d6f 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -214,7 +214,7 @@ int main(int argc, char *argv[]){ } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gemm.c b/benchmark/gemm.c index d2235330b..84dd292c5 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -197,7 +197,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index f4048c436..98c13e1be 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -163,7 +163,7 @@ int main(int argc, char *argv[]){ loops = atoi(p); -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gemv.c b/benchmark/gemv.c index a9dee67d2..fb1f541d3 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -181,7 +181,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/ger.c b/benchmark/ger.c index ca7e94e15..d53d328f0 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -165,7 +165,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of 
Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gesv.c b/benchmark/gesv.c index 80f644e69..057cbd243 100644 --- a/benchmark/gesv.c +++ b/benchmark/gesv.c @@ -165,7 +165,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/getri.c b/benchmark/getri.c index e8b82a758..a07014768 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -188,7 +188,7 @@ int main(int argc, char *argv[]){ } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index b9dcc03bb..60ba9fb89 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -158,7 +158,7 @@ int main(int argc, char *argv[]){ exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2fe0f5c5f..2bc165458 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -151,7 +151,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hemv.c b/benchmark/hemv.c index b6ff512ce..98618a04e 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -152,7 +152,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/her.c b/benchmark/her.c index f4e10b684..010f8120d 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/her2.c b/benchmark/her2.c index e10b7e98e..0f80f3ed9 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -151,7 +151,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/her2k.c b/benchmark/her2k.c index a0772feff..021873beb 100644 --- 
a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/herk.c b/benchmark/herk.c index eed8ed738..c09d35c1f 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index 6e6634fcf..b0157094e 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -155,7 +155,7 @@ int main(int argc, char *argv[]){ exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/iamax.c b/benchmark/iamax.c index 736f02b89..c87044ab4 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/iamin.c b/benchmark/iamin.c index b2c779811..e7c8e59e4 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/imax.c b/benchmark/imax.c index c7060af84..b56ef64ba 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/imin.c b/benchmark/imin.c index f8bdc2537..4a92c8bd0 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/linpack.c b/benchmark/linpack.c index e4b20e99d..661a44175 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -174,7 +174,7 @@ int main(int argc, char *argv[]){ 
fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/max.c b/benchmark/max.c index 2fa6e5a14..a19a386a2 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/min.c b/benchmark/min.c index 9abed8e80..4df8fb0fd 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index d3718f9e0..0f416621a 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/rot.c b/benchmark/rot.c index 8ec8b1d97..69698988d 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -156,7 +156,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/rotm.c b/benchmark/rotm.c index 8dea2d08c..17c8d5416 100644 --- a/benchmark/rotm.c +++ b/benchmark/rotm.c @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/scal.c b/benchmark/scal.c index 453c3234d..8bd62c77c 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/spmv.c b/benchmark/spmv.c index 2a26c9416..cff504d3b 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -163,7 +163,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } 
-#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/spr.c b/benchmark/spr.c index c91e587b1..5dcaa4f8b 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/spr2.c b/benchmark/spr2.c index e8ee345d7..a5f2791f7 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -153,7 +153,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/swap.c b/benchmark/swap.c index 368c59cd4..76d545995 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -151,7 +151,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/symm.c b/benchmark/symm.c index b979e8d51..bb9849eb5 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/symv.c b/benchmark/symv.c index 789c3560f..e4c892b5a 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syr.c b/benchmark/syr.c index 458bc6edb..a9dd293e6 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -144,7 +144,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 0129dd09a..9efbca315 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syr2k.c 
b/benchmark/syr2k.c index b1fcd8a18..a906559eb 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 95625a6c4..0fbb943f6 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -159,7 +159,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index ee5b97f24..fe9d07534 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 46d78fd17..8472ac261 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trmm.c b/benchmark/trmm.c index e095b85ee..23af122b4 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trmv.c b/benchmark/trmv.c index f5a5fe31a..46641b3e4 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 6ce1d532c..17676946a 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trsv.c b/benchmark/trsv.c index c60890de4..1734e2adb 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -159,7 +159,7 @@ int main(int argc, char *argv[]){ uplo,diag,loops); -#ifdef linux 
+#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index bb2c40f38..ba1515365 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -146,7 +146,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/zdot.c b/benchmark/zdot.c index 136135c9c..fa624e859 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif From 2367726578884f3975d12e276927b1f52acc152c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 23:28:49 +0200 Subject: [PATCH 485/593] Remove redundant status message --- cmake/system_check.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index d06f4779f..b0ab926fc 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -121,7 +121,6 @@ endif() include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) -if (HAVE_C11 EQUAL 1) -message (STATUS found stdatomic.h) +if (HAVE_C11) set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") endif() From dee7c49938ef34c18deb3175f6e67ae9a2240f5f Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 1 Oct 2020 10:43:16 +0200 Subject: [PATCH 486/593] Fix TABs and trailing space --- driver/others/memory.c | 352 ++++++++++++++++++++--------------------- 1 file changed, 176 insertions(+), 176 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9b6c226a1..5c9c388ce 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -80,7 +80,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef COMPILE_TLS #endif -#if defined(__GLIBC_PREREQ) +#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS #endif @@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #include #undef printf -#define printf _cprintf +#define printf _cprintf #endif #ifdef OS_LINUX @@ -190,14 +190,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) -#define CONSTRUCTOR __attribute__ ((constructor(101))) -#define DESTRUCTOR __attribute__ ((destructor(101))) +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) #else -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -272,7 +272,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; } else { @@ -281,7 +281,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT(&cpuset); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; return nums; } #endif @@ -628,12 +628,12 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - allocation_block_size, - MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + allocation_block_size, + MMAP_ACCESS, MMAP_POLICY | 
MAP_FIXED, -1, 0); } else { map_address = mmap(address, - allocation_block_size, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + allocation_block_size, + MMAP_ACCESS, MMAP_POLICY, -1, 0); } STORE_RELEASE_FUNC(map_address, alloc_mmap_free); @@ -648,7 +648,7 @@ static void *alloc_mmap(void *address){ #else #define BENCH_ITERATION 4 -#define SCALING 2 +#define SCALING 2 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { @@ -711,60 +711,60 @@ static void *alloc_mmap(void *address){ #endif map_address = mmap(NULL, allocation_block_size * SCALING, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { #ifdef OS_LINUX #ifdef DEBUG - int ret=0; - ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); - if(ret==-1){ - int errsv=errno; - perror("OpenBLAS alloc_mmap:"); - printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); - } + int ret=0; + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } #else - my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif - allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - start = (BLASULONG)map_address; - current = (SCALING - 1) * allocation_block_size; - original = current; + start = (BLASULONG)map_address; + current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0 && current <= original) { - *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; - start += PAGESIZE; - current -= PAGESIZE; - } + while(current > 0 && current <= original) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } - *(BLASLONG *)(start - 
PAGESIZE) = (BLASULONG)map_address; + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; - start = (BLASULONG)map_address; + start = (BLASULONG)map_address; - best = (BLASULONG)-1; - best_address = map_address; + best = (BLASULONG)-1; + best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { - current = run_bench(start, allocsize); + current = run_bench(start, allocsize); - if (best > current) { - best = current; - best_address = (void *)start; - } + if (best > current) { + best = current; + best_address = (void *)start; + } - start += PAGESIZE; + start += PAGESIZE; - } + } if ((BLASULONG)best_address > (BLASULONG)map_address) - munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); @@ -854,9 +854,9 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - allocation_block_size, - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + allocation_block_size, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; @@ -897,9 +897,9 @@ static void *alloc_devicedirver(void *address){ } map_address = mmap(address, allocation_block_size, - PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); @@ -974,12 +974,12 @@ static void *alloc_hugetlb(void *address){ shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX - SHM_HUGETLB | + SHM_HUGETLB | #endif #ifdef OS_AIX - SHM_LGPAGE | SHM_PIN | + SHM_LGPAGE | SHM_PIN | #endif - 
IPC_CREAT | SHM_R | SHM_W); + IPC_CREAT | SHM_R | SHM_W); if (shmid != -1) { map_address = (void *)shmat(shmid, address, SHM_RND); @@ -1026,9 +1026,9 @@ static void *alloc_hugetlb(void *address){ } map_address = (void *)VirtualAlloc(address, - allocation_block_size, - MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + allocation_block_size, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); tp.Privileges[0].Attributes = 0; AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); @@ -1078,9 +1078,9 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); map_address = mmap(address, allocation_block_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); @@ -1107,7 +1107,7 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ - static void blas_memory_cleanup(void* ptr){ +static void blas_memory_cleanup(void* ptr){ if (ptr) { struct alloc_t ** table = (struct alloc_t **)ptr; int pos; @@ -1243,27 +1243,27 @@ UNLOCK_COMMAND(&alloc_lock); while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER - if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); - } + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); + } #endif #ifdef ALLOC_HUGETLBFILE - if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation failed.\n"); #endif - } + } #endif #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) - if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif - func ++; + func ++; } #ifdef DEBUG @@ -1377,7 +1377,7 @@ static BLASULONG init_lock = 0UL; #endif static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, - void *sa, void *sb, BLASLONG pos) { + void *sa, void *sb, BLASLONG pos) { #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) @@ -1507,11 +1507,11 @@ void CONSTRUCTOR gotoblas_init(void) { struct rlimit curlimit; if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) { - if ( curlimit.rlim_cur != curlimit.rlim_max ) - { - curlimit.rlim_cur = curlimit.rlim_max; - setrlimit(RLIMIT_STACK, &curlimit); - } + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } } #endif @@ -1545,7 +1545,7 @@ void DESTRUCTOR gotoblas_quit(void) { TlsFree(local_storage_key); #else pthread_key_delete(local_storage_key); -#endif +#endif #endif #ifdef PROFILE @@ -1605,8 +1605,8 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser */ static int on_process_term(void) { - gotoblas_quit(); - return 0; + gotoblas_quit(); + return 0; } #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:_tls_used") @@ -1705,7 +1705,7 @@ void gotoblas_dummy_for_PGI(void) { #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #include #undef printf -#define printf _cprintf +#define printf _cprintf #endif #ifdef OS_LINUX @@ -1734,14 +1734,14 @@ void gotoblas_dummy_for_PGI(void) { #define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ 
((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) -#define CONSTRUCTOR __attribute__ ((constructor(101))) -#define DESTRUCTOR __attribute__ ((destructor(101))) +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) #else -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1817,7 +1817,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; } else { @@ -1826,7 +1826,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT(&cpuset); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; return nums; } #endif @@ -2083,26 +2083,26 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, - MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); } if (map_address != (void *)-1) { #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -#endif +#endif } else { -#ifdef DEBUG +#ifdef DEBUG int errsv=errno; perror("OpenBLAS : mmap failed:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); @@ -2119,7 +2119,7 @@ static void 
*alloc_mmap(void *address){ #else #define BENCH_ITERATION 4 -#define SCALING 2 +#define SCALING 2 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { @@ -2182,59 +2182,59 @@ static void *alloc_mmap(void *address){ #endif map_address = mmap(NULL, BUFFER_SIZE * SCALING, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { #ifdef OS_LINUX #ifdef DEBUG - int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); - if(ret==-1){ - int errsv=errno; - perror("OpenBLAS alloc_mmap:"); - printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); - } + int ret=0; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif - allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; - while(current > 0) { - *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; - start += PAGESIZE; - current -= PAGESIZE; - } + while(current > 0) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } - *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; - start = (BLASULONG)map_address; + start = (BLASULONG)map_address; - best = (BLASULONG)-1; - best_address = map_address; + best = (BLASULONG)-1; + best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) 
* BUFFER_SIZE)) { - current = run_bench(start, allocsize); + current = run_bench(start, allocsize); - if (best > current) { - best = current; - best_address = (void *)start; - } + if (best > current) { + best = current; + best_address = (void *)start; + } - start += PAGESIZE; + start += PAGESIZE; - } + } if ((BLASULONG)best_address > (BLASULONG)map_address) - munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); @@ -2342,9 +2342,9 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; @@ -2388,9 +2388,9 @@ static void *alloc_devicedirver(void *address){ } map_address = mmap(address, BUFFER_SIZE, - PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; @@ -2471,12 +2471,12 @@ static void *alloc_hugetlb(void *address){ shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, #ifdef OS_LINUX - SHM_HUGETLB | + SHM_HUGETLB | #endif #ifdef OS_AIX - SHM_LGPAGE | SHM_PIN | + SHM_LGPAGE | SHM_PIN | #endif - IPC_CREAT | SHM_R | SHM_W); + IPC_CREAT | SHM_R | SHM_W); if (shmid != -1) { map_address = (void *)shmat(shmid, address, SHM_RND); @@ -2511,7 +2511,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -2523,9 +2523,9 @@ static void *alloc_hugetlb(void *address){ } map_address = (void 
*)VirtualAlloc(address, - BUFFER_SIZE, - MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); tp.Privileges[0].Attributes = 0; AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); @@ -2578,9 +2578,9 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); map_address = mmap(address, BUFFER_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; @@ -2717,7 +2717,7 @@ void *blas_memory_alloc(int procpos){ if (!memory[position].used && (memory[position].pos == mypos)) { #if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#else +#else blas_lock(&memory[position].lock); #endif if (!memory[position].used) goto allocation; @@ -2725,7 +2725,7 @@ void *blas_memory_alloc(int procpos){ UNLOCK_COMMAND(&alloc_lock); #else blas_unlock(&memory[position].lock); -#endif +#endif } position ++; @@ -2741,22 +2741,22 @@ void *blas_memory_alloc(int procpos){ LOCK_COMMAND(&alloc_lock); #endif do { - RMB; -#if defined(USE_OPENMP) - if (!memory[position].used) { + RMB; +#if defined(USE_OPENMP) + if (!memory[position].used) { blas_lock(&memory[position].lock); #endif if (!memory[position].used) goto allocation; - + #if defined(USE_OPENMP) - blas_unlock(&memory[position].lock); + blas_unlock(&memory[position].lock); } #endif position ++; } while (position < NUM_BUFFERS); #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); + UNLOCK_COMMAND(&alloc_lock); #endif goto error; @@ -2770,7 +2770,7 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #else - blas_unlock(&memory[position].lock); + blas_unlock(&memory[position].lock); #endif if (!memory[position].addr) { do { @@ -2784,27 +2784,27 @@ void 
*blas_memory_alloc(int procpos){ while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER - if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); - } + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + } #endif #ifdef ALLOC_HUGETLBFILE - if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); #endif - } + } #endif #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) - if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif - func ++; + func ++; } #ifdef DEBUG @@ -2818,7 +2818,7 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif +#endif memory[position].addr = map_address; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2856,7 +2856,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + (void *)memory[position].addr, position); #endif return (void *)memory[position].addr; @@ -2972,7 +2972,7 @@ static BLASULONG init_lock = 0UL; #endif static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, - void *sa, void *sb, BLASLONG pos) { + void *sa, void *sb, BLASLONG pos) { #if 
!defined(ARCH_POWER) && !defined(ARCH_SPARC) @@ -3099,15 +3099,15 @@ void CONSTRUCTOR gotoblas_init(void) { //#if defined(OS_LINUX) #if 0 - struct rlimit curlimit; - if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) - { - if ( curlimit.rlim_cur != curlimit.rlim_max ) - { - curlimit.rlim_cur = curlimit.rlim_max; - setrlimit(RLIMIT_STACK, &curlimit); - } - } + struct rlimit curlimit; + if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) + { + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } + } #endif #ifdef SMP @@ -3189,8 +3189,8 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser */ static int on_process_term(void) { - gotoblas_quit(); - return 0; + gotoblas_quit(); + return 0; } #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:_tls_used") @@ -3237,7 +3237,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif -#endif +#endif } #endif From 3c05f54df8de5df17507e80697d651e147e0bf69 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 1 Oct 2020 10:48:45 +0200 Subject: [PATCH 487/593] Avoid out of bounds access on invalid memory free --- driver/others/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 5c9c388ce..91cfefbd7 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2882,9 +2882,10 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (memory[position].addr != free_area) goto error; + if (position >= NUM_BUFFERS) goto error; #ifdef DEBUG + if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif From 3094fc6c83c7a623f9a7e7846eb711a8a99ddfff Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 1 Oct 
2020 15:41:42 +0200 Subject: [PATCH 488/593] Lazyly reinit threads after a fork in OMP mode This initializes the per-thread memory buffers which get cleared/released on a fork via pthread_at_fork. Not doing so leads to each thread calling blas_memory_alloc on almost every execution which slows down the code significantly as the threads race for the memory allocation using locks to serialize that. --- driver/others/blas_server_omp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index d126955e4..da0a5674a 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,21 @@ #else +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif +#ifndef unlikely +#ifdef __GNUC__ +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define unlikely(x) (x) +#endif +#endif + #ifndef OMP_SCHED #define OMP_SCHED static #endif @@ -350,6 +365,9 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ int exec_blas(BLASLONG num, blas_queue_t *queue){ + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); + BLASLONG i, buf_index; if ((num <= 0) || (queue == NULL)) return 0; From d2333e784224ba19f01659210d2aaab04b43d45c Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 3 Oct 2020 18:00:34 +0300 Subject: [PATCH 489/593] aarch64 fix std=c18 compilation --- common.h | 2 +- driver/others/dynamic_arm64.c | 2 +- kernel/arm64/daxpy_thunderx.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common.h b/common.h index adc162536..ac12dd6d8 100644 --- a/common.h +++ b/common.h @@ -352,7 +352,7 @@ typedef int blasint; #endif #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) -#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#define YIELDING __asm__ __volatile__ 
("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif #ifdef BULLDOZER diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 157b03365..be22b247c 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - asm("mrs %0, "#id : "=r" (var)); \ + __asm__("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { diff --git a/kernel/arm64/daxpy_thunderx.c b/kernel/arm64/daxpy_thunderx.c index 37aae9391..f44f9d4e5 100644 --- a/kernel/arm64/daxpy_thunderx.c +++ b/kernel/arm64/daxpy_thunderx.c @@ -62,7 +62,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) y5 = a * x[5] + y[5]; y6 = a * x[6] + y[6]; y7 = a * x[7] + y[7]; - asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); + __asm__("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); y[0] = y0; y[1] = y1; y[2] = y2; @@ -74,7 +74,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) xx = (x + 4*128/sizeof(*x)); yy = (y + 4*128/sizeof(*y)); - asm("":"+r"(yy)::"memory"); + __asm__("":"+r"(yy)::"memory"); prefetch(xx); prefetch(yy); From dc8e4e1959855ca24af7e2d675f2be33087ff96c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Oct 2020 22:59:24 +0200 Subject: [PATCH 490/593] Reduce the BLAS3 heap allocation threshold to 32 and mark it as configurable --- Makefile.rule | 17 ++++++++++++++++- common.h | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 4d6f2d313..635e02c02 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -279,7 +279,22 @@ COMMON_PROF = -pg # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 -# + + +# Set the thread number threshold beyond which the job array for the threaded level3 BLAS +# will be allocated on the heap rather than the stack. 
(This array alone requires +# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu +# counts, but obviously it is not the only item that ends up on the stack. +# The default value of 32 ensures that the overall requirement is compatible +# with the default 1MB stacksize imposed by having the Java VM loaded without use +# of its -Xss parameter. +# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible +# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java +# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code +# BLAS3_MEM_ALLOC_THRESHOLD = 160 + + + # the below is not yet configurable, use cmake if you need to build only select types BUILD_SINGLE = 1 BUILD_DOUBLE = 1 diff --git a/common.h b/common.h index ac12dd6d8..ab287262c 100644 --- a/common.h +++ b/common.h @@ -402,7 +402,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #endif #ifndef BLAS3_MEM_ALLOC_THRESHOLD -#define BLAS3_MEM_ALLOC_THRESHOLD 160 +#define BLAS3_MEM_ALLOC_THRESHOLD 32 #endif #ifdef QUAD_PRECISION From a5feea6611f49e875de83282c061843e18050af6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Oct 2020 23:01:06 +0200 Subject: [PATCH 491/593] make BLAS3_MEM_ALLOC_THRESHOLD configurable on non-Windows --- cmake/system.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 8908a1890..0734065df 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -323,7 +323,13 @@ else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () endif () - +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) +if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") +endif() +endif() +endif() if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () From 
a5b164946ccc9dec037d4e0a1cd2f2202b1c918a Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Mon, 5 Oct 2020 22:13:25 +0300 Subject: [PATCH 492/593] add fninit to reset fpu registers before assembler routines --- kernel/x86_64/amax.S | 2 ++ kernel/x86_64/asum.S | 3 ++- kernel/x86_64/dot.S | 1 + kernel/x86_64/iamax.S | 1 + kernel/x86_64/izamax.S | 1 + kernel/x86_64/nrm2.S | 1 + kernel/x86_64/qconjg.S | 1 + kernel/x86_64/qdot.S | 2 ++ kernel/x86_64/qgemm_kernel_2x2.S | 2 ++ kernel/x86_64/qgemv_n.S | 2 ++ kernel/x86_64/qgemv_t.S | 1 + kernel/x86_64/qtrsm_kernel_LN_2x2.S | 2 ++ kernel/x86_64/qtrsm_kernel_LT_2x2.S | 2 ++ kernel/x86_64/qtrsm_kernel_RT_2x2.S | 3 +++ kernel/x86_64/sum.S | 2 ++ kernel/x86_64/xdot.S | 3 +++ kernel/x86_64/xgemm3m_kernel_2x2.S | 2 ++ kernel/x86_64/xgemm_kernel_1x1.S | 2 ++ kernel/x86_64/xgemv_n.S | 2 ++ kernel/x86_64/xgemv_t.S | 2 ++ kernel/x86_64/xtrsm_kernel_LT_1x1.S | 2 ++ kernel/x86_64/zamax.S | 2 ++ kernel/x86_64/zasum.S | 2 ++ kernel/x86_64/zdot.S | 2 ++ kernel/x86_64/znrm2.S | 2 ++ kernel/x86_64/zscal.S | 2 ++ kernel/x86_64/zsum.S | 2 ++ 27 files changed, 50 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S index 0e9bf4db4..257147dfb 100644 --- a/kernel/x86_64/amax.S +++ b/kernel/x86_64/amax.S @@ -54,6 +54,8 @@ PROLOGUE PROFCODE + + fninit salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S index 31f973894..24f57dd11 100644 --- a/kernel/x86_64/asum.S +++ b/kernel/x86_64/asum.S @@ -49,7 +49,8 @@ PROLOGUE PROFCODE - + + fninit fldz testq M, M jle .L999 diff --git a/kernel/x86_64/dot.S b/kernel/x86_64/dot.S index e63d9cd89..2319885f1 100644 --- a/kernel/x86_64/dot.S +++ b/kernel/x86_64/dot.S @@ -49,6 +49,7 @@ PROLOGUE PROFCODE + fninit salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S index 79e1bae1d..0c666d623 100644 --- a/kernel/x86_64/iamax.S +++ b/kernel/x86_64/iamax.S @@ -59,6 +59,7 @@ PROLOGUE PROFCODE + fninit salq 
$BASE_SHIFT, INCX diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S index c066acd62..e450c2cd2 100644 --- a/kernel/x86_64/izamax.S +++ b/kernel/x86_64/izamax.S @@ -59,6 +59,7 @@ PROLOGUE PROFCODE + fninit salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index e9be1262a..548e3b744 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -50,6 +50,7 @@ PROLOGUE PROFCODE + fninit fldz testq M, M jle .L999 diff --git a/kernel/x86_64/qconjg.S b/kernel/x86_64/qconjg.S index 49ca76649..bab541831 100644 --- a/kernel/x86_64/qconjg.S +++ b/kernel/x86_64/qconjg.S @@ -41,6 +41,7 @@ PROLOGUE PROFCODE + fninit fldz FLD 1 * SIZE(ARG1) diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S index a48a04fdd..e7d31360b 100644 --- a/kernel/x86_64/qdot.S +++ b/kernel/x86_64/qdot.S @@ -58,6 +58,8 @@ PROLOGUE + fninit + pushl %edi pushl %esi pushl %ebx diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S index 99db3961f..7b5e7707d 100644 --- a/kernel/x86_64/qgemm_kernel_2x2.S +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -74,6 +74,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S index 630d03ffb..1b65b03f0 100644 --- a/kernel/x86_64/qgemv_n.S +++ b/kernel/x86_64/qgemv_n.S @@ -76,6 +76,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S index d7c9cd2a5..00188c257 100644 --- a/kernel/x86_64/qgemv_t.S +++ b/kernel/x86_64/qgemv_t.S @@ -75,6 +75,7 @@ PROLOGUE PROFCODE + fninit subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S index 536042e65..030eff893 100644 --- a/kernel/x86_64/qtrsm_kernel_LN_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -74,6 +74,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq 
%rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S index 6e94976c5..d86972c72 100644 --- a/kernel/x86_64/qtrsm_kernel_LT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -74,6 +74,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S index caa7de14a..2826a62c9 100644 --- a/kernel/x86_64/qtrsm_kernel_RT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -74,6 +74,9 @@ PROLOGUE PROFCODE + fninit + + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S index d075eaa04..3d5fa7cc2 100644 --- a/kernel/x86_64/sum.S +++ b/kernel/x86_64/sum.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S index ea97164b2..ec89b799c 100644 --- a/kernel/x86_64/xdot.S +++ b/kernel/x86_64/xdot.S @@ -59,6 +59,9 @@ PROFCODE + fninit + + #define N %ebx #define X %esi #define INCX %ecx diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S index 843fc243a..e8da78d82 100644 --- a/kernel/x86_64/xgemm3m_kernel_2x2.S +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -78,6 +78,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S index e0cd1f1df..f04ab07f5 100644 --- a/kernel/x86_64/xgemm_kernel_1x1.S +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -97,6 +97,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S index cbde6402d..7d28c118a 100644 --- a/kernel/x86_64/xgemv_n.S +++ b/kernel/x86_64/xgemv_n.S @@ -76,6 +76,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git 
a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S index 31320f651..e79676088 100644 --- a/kernel/x86_64/xgemv_t.S +++ b/kernel/x86_64/xgemv_t.S @@ -75,6 +75,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S index a61a240fd..54d41932f 100644 --- a/kernel/x86_64/xtrsm_kernel_LT_1x1.S +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -90,6 +90,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S index 74e127e6c..bfd836193 100644 --- a/kernel/x86_64/zamax.S +++ b/kernel/x86_64/zamax.S @@ -55,6 +55,8 @@ PROLOGUE PROFCODE + fninit + salq $ZBASE_SHIFT, INCX fldz diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S index c372fc5dd..9ea2aadc0 100644 --- a/kernel/x86_64/zasum.S +++ b/kernel/x86_64/zasum.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S index 94d1008ff..f7df919b7 100644 --- a/kernel/x86_64/zdot.S +++ b/kernel/x86_64/zdot.S @@ -54,6 +54,8 @@ PROLOGUE PROFCODE + fninit + #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 4115eab1d..cb02a5a9f 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/zscal.S b/kernel/x86_64/zscal.S index 5282e0f72..08c0831a4 100644 --- a/kernel/x86_64/zscal.S +++ b/kernel/x86_64/zscal.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + salq $ZBASE_SHIFT, INCX FLD 8(%rsp) diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S index 45e0ddff5..1c3904839 100644 --- a/kernel/x86_64/zsum.S +++ b/kernel/x86_64/zsum.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 From 78124860911ae2b4e226d1cd76486120c3187c72 Mon Sep 17 00:00:00 2001 From: 
Martin Kroeker Date: Tue, 6 Oct 2020 21:33:16 +0200 Subject: [PATCH 493/593] Use generic C for D/Z nrm2 kernels on Windows to work around fpu exception bug --- kernel/x86_64/KERNEL | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4a2e13bed..d75196974 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -259,8 +259,12 @@ SNRM2KERNEL = nrm2_sse.S endif ifndef DNRM2KERNEL +ifeq ($(OSNAME),WINNT) +DNRM2KERNEL = ../arm/nrm2.c +else DNRM2KERNEL = nrm2.S endif +endif ifndef QNRM2KERNEL QNRM2KERNEL = nrm2.S @@ -271,8 +275,12 @@ CNRM2KERNEL = znrm2_sse.S endif ifndef ZNRM2KERNEL +ifeq ($(OSNAME),WINNT) +ZNRM2KERNEL = ../arm/znrm2.c +else ZNRM2KERNEL = znrm2.S endif +endif ifndef XNRM2KERNEL XNRM2KERNEL = znrm2.S From f32d34a01528a0b9f2df5229c17789333d41125a Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Sat, 10 Oct 2020 10:36:15 +0800 Subject: [PATCH 494/593] add sse3 compiler flag --- Makefile.x86_64 | 5 +++++ cmake/system.cmake | 3 +++ kernel/Makefile | 3 +++ 3 files changed, 11 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 00975b25a..65b67bba1 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,11 @@ endif endif endif +ifdef HAVE_SSE3 +CCOMMON_OPT += -msse3 +FCOMMON_OPT += -msse3 +endif + ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH ifndef NO_AVX512 diff --git a/cmake/system.cmake b/cmake/system.cmake index 8908a1890..1e6a292c8 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,9 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() endif() if (DEFINED TARGET) diff --git a/kernel/Makefile b/kernel/Makefile index 16211218f..0f0fa5a5e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,9 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system +ifdef HAVE_SSE3 +CFLAGS += -msse3 +endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) From de27e4f5fb54a792ea35720b67f0a395ad3e1026 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 00:40:22 +0200 Subject: [PATCH 495/593] Stop DYNAMIC_ARCH build if the toplevel source contains a stray config_kernel.h from a gmake build This is unlikely to happen in practice, but if it does, the rogue file would get included instead of the dynamically generated version for each target_core, leading to very confusing errors like "invalid operands (undefined UND and ABS sections)" in compilation of the assembly kernels as macros like PREFETCH would remain undefined --- cmake/arch.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c00f8fe71..e851dd088 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -1,4 +1,3 @@ -## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets various variables based on architecture. @@ -80,10 +79,15 @@ if (DYNAMIC_ARCH) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) - set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) + set(DYNAMIC_CORE ${DYNAMIC_LIST}) endif () endif () + CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP) + if (TRAP) + message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. 
This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") + endif () + if (NOT DYNAMIC_CORE) message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") unset(DYNAMIC_ARCH CACHE) From 82a497ec5d4c759acc9994b6d1eba54ea90e3b9b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 00:43:09 +0200 Subject: [PATCH 496/593] restore PRESCOTT default for DYNAMIC_LIST --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e851dd088..c048f13d1 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -79,7 +79,7 @@ if (DYNAMIC_ARCH) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) - set(DYNAMIC_CORE ${DYNAMIC_LIST}) + set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () endif () From 0c773b8205d5108a765db44eaca6427b2b3af608 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 01:04:57 +0200 Subject: [PATCH 497/593] Do not rely on HAVE_SSE3 in DYNAMIC_ARCH builds --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 65b67bba1..e793a1c2f 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,9 +9,11 @@ endif endif ifdef HAVE_SSE3 +ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 endif +endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH From 7a531284817d411e8d89deb3a0a912d1b1e4aca8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 01:06:46 +0200 Subject: [PATCH 498/593] Add whitelist of DYNAMIC_ARCH kernels for which -msse3 needs to be enabled --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 0f0fa5a5e..290fb2afe 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,6 +41,9 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT 
CORE2 PENRYN DUNNINGTON ATOM NANO NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) + override CFLAGS += -msse3 +endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) From 9d43140d61d93a6b96844c19b760b64ba49d451f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 12:58:17 +0200 Subject: [PATCH 499/593] Improve check for conflicting config_kernel.h --- cmake/arch.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c048f13d1..99e685d04 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -83,8 +83,7 @@ if (DYNAMIC_ARCH) endif () endif () - CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP) - if (TRAP) + if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") endif () From 63d7dad04cd23c71cc96495cc61adb20475a17c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:15:35 +0200 Subject: [PATCH 501/593] Adapt utests for builds supportin only some variable types --- utest/test_dsdot.c | 17 ++--------------- utest/test_fork.c | 6 ++++++ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c index 57da7101e..adef4e91c 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include "openblas_utest.h" - +#if defined(BUILD_SINGLE) && defined(BUILD_DOUBLE) CTEST(dsdot,dsdot_n_1) { float x= 0.172555164F; @@ -47,17 +47,4 @@ CTEST(dsdot,dsdot_n_1) ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); } - -CTEST(dsdot,dsdot_n_2) -{ - float x[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; - float y[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; - blasint incx=1; - blasint incy=1; - blasint n=8; - - double res1=0.0f, res2= 2.0400000444054616; - - res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); - ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); -} \ No newline at end of file +#endif diff --git a/utest/test_fork.c b/utest/test_fork.c index 0b90407b1..5c976f920 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -48,6 +48,7 @@ void* xmalloc(size_t n) } } +#ifdef BUILD_DOUBLE void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { char trans1 = 'T'; @@ -59,9 +60,13 @@ void check_dgemm(double *a, double *b, double *result, double *expected, blasint ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } } +#endif CTEST(fork, safety) { +#ifndef BUILD_DOUBLE +exit(0); +#else blasint n = 1000; int i; @@ -124,4 +129,5 @@ CTEST(fork, safety) ASSERT_EQUAL(wait_pid, fork_pid); ASSERT_EQUAL(0, WEXITSTATUS (child_status)); } +#endif } From 08f4749eb483f16618e553db54e8ae9d537795e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:25:24 +0200 Subject: [PATCH 502/593] Adapt tests to having only a subset of types in the build --- test/Makefile | 245 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 218 insertions(+), 27 deletions(-) diff --git a/test/Makefile b/test/Makefile index 45f9821ec..a3966756d 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,82 +7,242 @@ all :: else all :: level1 level2 level3 endif +$(info buildvars [$(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16)]) 
+ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) +level1: sblat1 dblat1 cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) +level1: dblat1 cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) +level1: sblat1 cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) +level1: cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) +level1: cblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) +level1: zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) +level1: sblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) +level1: sblat1 dblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) +level1: sblat1 dblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) +level1: sblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) +level1: dblat1 +endif -level1 : sblat1 dblat1 cblat1 zblat1 ifndef CROSS +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 +endif ifdef SMP ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./zblat1 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat1 +endif +ifeq 
($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./zblat1 endif endif endif +endif + +#level2: sblat2 dblat2 cblat2 zblat2 +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) +level2: sblat2 dblat2 cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) +level2: dblat2 cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) +level2: sblat2 cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) +level2: cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) +level2: cblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) +level2: zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) +level2: sblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) +level2: sblat2 dblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) +level2: sblat2 dblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) +level2: sblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) +level2: dblat2 +endif -level2 : sblat2 dblat2 cblat2 zblat2 ifndef CROSS rm -f ?BLAT2.SUMM +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && 
cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif ifdef SMP rm -f ?BLAT2.SUMM ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif endif endif +endif -ifeq ($(BUILD_HALF),1) -level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 -else -level3 : sblat3 dblat3 cblat3 zblat3 +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) +level3: sblat3 dblat3 cblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) +level3: dblat3 cblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) +level3: sblat3 cblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) +level3: cblat3 zblat3 +endif +ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) +level3: cblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) +level3: zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) +level3: sblat3 zblat3 endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) +level3: sblat3 dblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) +level3: sblat3 dblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) +level3: sblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) +level3: dblat3 +endif + + + +#ifeq ($(BUILD_HALF),1) +#level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +#else +#level3 : sblat3 dblat3 cblat3 zblat3 +#endif + ifndef CROSS rm -f ?BLAT3.SUMM ifeq ($(BUILD_HALF),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) @@ -90,30 +250,46 @@ ifeq ($(BUILD_HALF),1) OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat 
@$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif else ifeq ($(BUILD_HALF),1) OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif endif endif +endif level3_3m : zblat3_3m cblat3_3m @@ -151,56 +327,71 @@ endif endif endif +ifeq ($(BUILD_SINGLE),1) sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_DOUBLE),1) dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat3 : dblat3.$(SUFFIX) 
../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +else +dblat2: +dblat3: +endif + + qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_COMPLEX),1) cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX16),1) +zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif ifeq ($(BUILD_HALF),1) test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif -dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) 
../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - +ifeq ($(BUILD_COMPLEX),1) cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif +ifeq ($(BUILD_COMPLEX16),1) zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - +endif From f6d2827d0ca2d773ee1295a674b096119cff3f44 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:32:00 +0200 Subject: [PATCH 503/593] Adapt ctests to having only a subset of types in the build --- ctest/Makefile | 119 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 6f5b65142..cba904f75 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -46,56 +46,155 @@ else all :: all1 all2 all3 endif -all1: xscblat1 xdcblat1 xccblat1 xzcblat1 +ifeq ($(BUILD_SINGLE),1) +all1targets += xscblat1 +endif +ifeq ($(BUILD_DOUBLE),1) +all1targets += xdcblat1 +endif +ifeq ($(BUILD_COMPLEX),1) +all1targets += xccblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) +all1targets += xzcblat1 +endif + +all1: $(all1targets) + ifndef CROSS ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./xdcblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat1 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./xscblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./xdcblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif endif +endif + +ifeq ($(BUILD_SINGLE),1) +all2targets += xscblat2 +endif +ifeq ($(BUILD_DOUBLE),1) 
+all2targets += xdcblat2 +endif +ifeq ($(BUILD_COMPLEX),1) +all2targets += xccblat2 +endif +ifeq ($(BUILD_COMPLEX16),1) +all2targets += xzcblat2 +endif + +all2: $(all2targets) -all2: xscblat2 xdcblat2 xccblat2 xzcblat2 ifndef CROSS ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./xdcblat2 < din2 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat2 < cin2 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat2 < zin2 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif endif +endif + + +ifeq ($(BUILD_SINGLE),1) +all3targets += xscblat3 +endif +ifeq ($(BUILD_DOUBLE),1) +all3targets += xdcblat3 +endif +ifeq ($(BUILD_COMPLEX),1) +all3targets += xccblat3 +endif +ifeq ($(BUILD_COMPLEX16),1) +all3targets += xzcblat3 +endif + +all3: $(all3targets) -all3: xscblat3 xdcblat3 xccblat3 xzcblat3 ifndef CROSS ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./xdcblat3 < din3 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat3 < cin3 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat3 < zin3 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif +endif +endif all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m 
+endif else +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif @@ -115,13 +214,19 @@ endif endif endif +ifeq ($(BUILD_SINGLE),1) # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_DOUBLE),1) # Double real xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -129,7 +234,10 @@ xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX),1) # Single complex xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -140,7 +248,10 @@ xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX16),1) # Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -152,6 +263,6 @@ xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) 
$(EXTRALIB) $(CEXTRALIB) - +endif include $(TOPDIR)/Makefile.tail From 6a83c591d65ebf1ccb7a7be69d5744d9ce522d24 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:34:12 +0200 Subject: [PATCH 504/593] Adapt for having only a subset of variable types --- exports/Makefile | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 75901586c..960150c86 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -33,6 +33,18 @@ endif ifndef BUILD_HALF BUILD_HALF = 0 endif +ifndef BUILD_SINGLE +BUILD_SINGLE = 0 +endif +ifndef BUILD_DOUBLE +BUILD_DOUBLE = 0 +endif +ifndef BUILD_COMPLEX +BUILD_COMPLEX = 0 +endif +ifndef BUILD_COMPLEX16 +BUILD_COMPLEX16 = 0 +endif ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) @@ -108,10 +120,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) ifeq ($(OSNAME), Darwin) INTERNALNAME = 
$(LIBPREFIX).$(MAJOR_VERSION).dylib @@ -246,23 +258,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > 
$(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed From d33de97d60d27b753f217d0a8d6a7ef1a6df12d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:36:45 +0200 Subject: [PATCH 505/593] Adapt to having only a subset of variable types supported --- exports/gensymbol | 1646 +++++++++++++++++++++++++-------------------- 1 file changed, 907 insertions(+), 739 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index ce4d9bb64..736fdc2cd 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -16,74 +16,84 @@ # 2017/08/01 Saar # removed blas_thread_shutdown_ # -@blasobjs = ( - caxpy,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, - chbmv,chemm,chemv,cher2,cher2k,cher,cherk, - chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap, +@blasobjsc = ( + caxpy,caxpby,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, + chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, + chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, 
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv, - damax,damin,dasum,daxpy,dcabs1,dcopy,ddot,dgbmv,dgemm, + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); + +@blasobjsd = ( + damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, dgemv,dger,dmax,dmin,dnrm2,drot,drotg,drotm,drotmg,dsbmv, - dscal,dsdot,dspmv,dspr2, + dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, - dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,dzamax,dzamin,dzasum,dznrm2, - icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, - izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, - scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - smax,smin,snrm2, + dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, + idamax,idamin,idmax,idmin,dgeadd,dsum); + +@blasobjss = ( + isamax,isamin,ismax,ismin, + samax,samin,sasum,saxpy, saxpby, + scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, + smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, + strmm,strmv,strsm,strsv, sgeadd,ssum); + +@blasobjsz = ( + izamax,izamin,, + zaxpy,zaxpby,zcopy,zdotc,zdotu,zdrot, zdscal,zgbmv,zgemm,zgemv,zgerc,zgeru, zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2, zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, - xerbla, - saxpby,daxpby,caxpby,zaxpby, - somatcopy, domatcopy, comatcopy, zomatcopy, - simatcopy, dimatcopy, cimatcopy, zimatcopy, - sgeadd,dgeadd,cgeadd,zgeadd, - ssum, dsum, scsum, dzsum -); + zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, + zgeadd, dzsum); +@cblasobjs = (lsame, xerbla); @halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); -@cblasobjs = ( +@cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, 
cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, - cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, - cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, - cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, + cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, + cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, + cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, + cblas_scnrm2, cblas_scasum, + cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy + ); +@cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, - cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_dzasum, - cblas_dznrm2, cblas_icamax, cblas_idamax, - cblas_isamax, cblas_izamax, + cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, + cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy + ); + +@cblasobjss = ( cblas_sasum, cblas_saxpy, - cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, + cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, - 
cblas_strsv, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, + cblas_strsv, cblas_sgeadd, + cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy + ); +@cblasobjsz = ( + cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, - cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, - cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, - cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, - cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, - cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, - cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, - cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, - cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, - cblas_xerbla + cblas_zaxpby, cblas_zgeadd, + cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy ); +@cblasobjs = ( cblas_xerbla ); + @halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( @@ -103,12 +113,22 @@ # xdrot,xrotg, ); -@gemm3mobjs = ( - cgemm3m,zgemm3m + @gemm3mobjs=(); + + @cblasgemm3mobjs=(); + +@gemm3mobjsc = ( + cgemm3m, +); +@gemm3mobjsz = ( + zgemm3m ); -@cblasgemm3mobjs = ( - cblas_cgemm3m,cblas_zgemm3m +@cblasgemm3mobjsc = ( + cblas_cgemm3m +); +@cblasgemm3mobjsz = ( + cblas_zgemm3m ); @@ -131,22 +151,68 @@ @misc_underscore_objs = ( ); -@lapackobjs = ( +@lapackobjss = ( # These routines are provided by OpenBLAS. 
- sgesv, dgesv, cgesv, zgesv, - sgetf2, dgetf2, cgetf2, zgetf2, - sgetrf, dgetrf, cgetrf, zgetrf, - slaswp, dlaswp, claswp, zlaswp, - sgetrs, dgetrs, cgetrs, zgetrs, - slauu2, dlauu2, clauu2, zlauu2, - slauum, dlauum, clauum, zlauum, - spotf2, dpotf2, cpotf2, zpotf2, - spotrf, dpotrf, cpotrf, zpotrf, - strti2, dtrti2, ctrti2, ztrti2, - strtri, dtrtri, ctrtri, ztrtri, - spotri, dpotri, cpotri, zpotri, + sgesv, + sgetf2, + sgetrf, + slaswp, + sgetrs, + slauu2, + slauum, + spotf2, + spotrf, + strti2, + strtri, + spotri, +); + +@lapackobjsd = ( + dgesv, + dgetf2, + dgetrf, + dlaswp, + dgetrs, + dlauu2, + dlauum, + dpotf2, + dpotrf, + dtrti2, + dtrtri, + dpotri, +); + +@lapackobjsc = ( +cgesv, +cgetf2, +cgetrf, +claswp, +cgetrs, +clauu2, +clauum, +cpotf2, +cpotrf, +ctrti2, +ctrtri, +cpotri, +); + +@lapackobjsz = ( +zgesv, +zgetf2, +zgetrf, +zlaswp, +zgetrs, +zlauu2, +zlauum, +zpotf2, +zpotrf, +ztrti2, +ztrtri, +zpotri, ); + @lapackobjs2 = ( # These routines are provided by LAPACK (reference implementation). # @@ -162,7 +228,9 @@ ilaenv, ieeeck, lsamen, iparmq, ilaprec, ilatrans, ilauplo, iladiag, ilaver, slamch, slamc3, - +); + +@lapackobjs2sc = ( # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. # excluded: second_$(TIMER) sbdsdc, @@ -180,7 +248,9 @@ slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, ssteqr, ssterf, slaisnan, sisnan, slartgp, slartgs, +); +@lapackobjs2dz = ( # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. 
# excluded: dsecnd_$(TIMER) dbdsdc, @@ -199,7 +269,9 @@ dsteqr, dsterf, dlaisnan, disnan, dlartgp, dlartgs, dlamch, dlamc3, +); +@lapackobjs2s = ( # SLASRC -- Single precision real LAPACK routines # already provided by @lapackobjs: # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri @@ -262,7 +334,9 @@ sorbdb5, sorbdb6, sorcsd, sorcsd2by1, sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, stpqrt, stpqrt2, stpmqrt, stprfb, +); +@lapackobjs2ds = ( # DSLASRC -- Double-single mixed precision real routines called from # single, single-extra and double precision real LAPACK # routines (i.e. from SLASRC, SXLASRC, DLASRC). @@ -270,7 +344,9 @@ # already provided by @lapackobjs: # sgetrs, spotrf, sgetrf spotrs, +); +@lapackobjs2c = ( # CLASRC -- Single precision complex LAPACK routines # already provided by @blasobjs: # already provided by @lapackobjs: @@ -338,7 +414,8 @@ cunbdb5, cunbdb6, cuncsd, cuncsd2by1, cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, - +); +@lapack2objszc = ( # ZCLASRC -- Double-single mixed precision complex routines called from # single, single-extra and double precision complex LAPACK # routines (i.e. from CLASRC, CXLASRC, ZLASRC). 
@@ -346,7 +423,9 @@ # already provided by @lapackobjs: # cgetrs, cpotrf, cgetrf cpotrs, +); +@lapack2objsd = ( # DLASRC -- Double precision real LAPACK routines # already provided by @lapackobjs: # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, @@ -411,7 +490,8 @@ dorbdb5, dorbdb6, dorcsd, dorcsd2by1, dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, - +); +@lapackobjs2z = ( # ZLASRC -- Double precision complex LAPACK routines # already provided by @blasobjs: # already provided by @lapackobjs: @@ -485,8 +565,10 @@ zunbdb5, zunbdb6, zuncsd, zuncsd2by1, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, +); # functions added for lapack-3.6.0 +@lapack2objsc = ( @lapack2objsc, cgejsv, cgesvdx, cgesvj, @@ -521,6 +603,8 @@ cspr2, csyr2, cunm22, +); +@lapackobjs2d = (@lapack2objsd, dbdsvdx, dgesvdx, dgetrf2, @@ -552,6 +636,8 @@ dorm22, dpotrf2, dsecnd, + ); + @lapack2objss = (@lapack2objss, sbdsvdx, second, sgesvdx, @@ -583,6 +669,8 @@ slatmt, sorm22, spotrf2, + ); + @lapack2objsz = (@lapack2objsz, zgejsv, zgesvdx, zgesvj, @@ -617,9 +705,9 @@ zspr2, zsyr2, zunm22, - +); # functions added for lapack-3.7.0 - +@lapack2objss = (@lapack2objss, slarfy, strevc3, sgelqt, @@ -637,6 +725,8 @@ stplqt, stplqt2, stpmlqt, + ); + @lapack2objsd = (@lapack2objsd, dlarfy, dsyconvf, dtrevc3, @@ -655,6 +745,8 @@ dtplqt, dtplqt2, dtpmlqt, + ); + @lapack2objsc = (@lapack2objsc, clarfy, csyconvf, ctrevc3, @@ -673,6 +765,8 @@ ctplqt, ctplqt2, ctpmlqt, + ); + @lapack2objsz = (@lapack2objsz, zlarfy, zsyconvf, ztrevc3, @@ -691,6 +785,8 @@ zlaswlq, zlamswlq, zgemlq, + ); + @lapack2objs = (@lapack2objs, sladiv1, dladiv1, iparam2stage, @@ -698,16 +794,23 @@ # functions added for lapack-3.8.0 ilaenv2stage, - + ); # functions added for lapack-3.9.0 +@lapack2objsc = (@lapack2objsc, cgesvdq, cungtsqr, dcombssq, + ); +@lapack2objsd = (@lapack2objsd, dgesvdq, dorgtsqr, + ); +@lapack2objss = (@lapack2objss, scombssq, sgesvdq, sorgtsqr, + ); 
+@lapack2objsz = (@lapack2objsz, zgesvdq, zungtsqr ); @@ -717,36 +820,54 @@ dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, ); -@lapack_deprecated_objs = ( - cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp, - cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd, - cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm, - cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf, -); - -@lapacke_deprecated_objs = ( +@lapack_deprecated_objsc = ( + cgegs, cggsvd, + cgegv, cggsvp, + cgelsx, clahrd, + cgeqpf, clatzm, + ctzrqf, + ); +@lapack_deprecated_objsd = ( + dgegs, dgeqpf, + dgegv, dggsvd, + dgelsx, dggsvp, + dlahrd, + dlatzm, dtzrqf); + +@lapack_deprecated_objss = ( + sgegs, + sgegv, + ); + +@lapacke_deprecated_objsc = ( LAPACKE_cggsvp, LAPACKE_cggsvp_work, - LAPACKE_dggsvp, - LAPACKE_dggsvp_work, - LAPACKE_sggsvp, - LAPACKE_sggsvp_work, - LAPACKE_zggsvp, - LAPACKE_zggsvp_work, LAPACKE_cggsvd, LAPACKE_cggsvd_work, - LAPACKE_dggsvd, - LAPACKE_dggsvd_work, - LAPACKE_sggsvd, - LAPACKE_sggsvd_work, - LAPACKE_zggsvd, - LAPACKE_zggsvd_work, LAPACKE_cgeqpf, LAPACKE_cgeqpf_work, +); +@lapacke_deprecated_objsd = ( + LAPACKE_dggsvp, + LAPACKE_dggsvp_work, + LAPACKE_dggsvd, + LAPACKE_dggsvd_work, LAPACKE_dgeqpf, LAPACKE_dgeqpf_work, +); +@lapacke_deprecated_objss = ( + LAPACKE_sggsvp, + LAPACKE_sggsvp_work, + LAPACKE_sggsvd, + LAPACKE_sggsvd_work, LAPACKE_sgeqpf, LAPACKE_sgeqpf_work, +); +@lapacke_deprecated_objsz = ( + LAPACKE_zggsvp, + LAPACKE_zggsvp_work, + LAPACKE_zggsvd, + LAPACKE_zggsvd_work, LAPACKE_zgeqpf, LAPACKE_zgeqpf_work, ); @@ -763,6 +884,15 @@ # exported since the respective LAPACK routines are not built by default. 
# @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` + LAPACKE_lsame, + LAPACKE_ilaver, + LAPACKE_xerbla, + lapack_make_complex_float, + lapack_make_complex_double, + LAPACKE_get_nancheck, + LAPACKE_set_nancheck, +); +@lapackeobjsc = ( LAPACKE_cgb_nancheck, LAPACKE_cgb_trans, LAPACKE_cge_nancheck, @@ -801,118 +931,6 @@ LAPACKE_ctp_trans, LAPACKE_ctr_nancheck, LAPACKE_ctr_trans, - LAPACKE_dgb_nancheck, - LAPACKE_dgb_trans, - LAPACKE_dge_nancheck, - LAPACKE_dge_trans, - LAPACKE_dgg_nancheck, - LAPACKE_dgg_trans, - LAPACKE_dgt_nancheck, - LAPACKE_dhs_nancheck, - LAPACKE_dhs_trans, - LAPACKE_d_nancheck, - LAPACKE_dpb_nancheck, - LAPACKE_dpb_trans, - LAPACKE_dpf_nancheck, - LAPACKE_dpf_trans, - LAPACKE_dpo_nancheck, - LAPACKE_dpo_trans, - LAPACKE_dpp_nancheck, - LAPACKE_dpp_trans, - LAPACKE_dpt_nancheck, - LAPACKE_dsb_nancheck, - LAPACKE_dsb_trans, - LAPACKE_dsp_nancheck, - LAPACKE_dsp_trans, - LAPACKE_dst_nancheck, - LAPACKE_dsy_nancheck, - LAPACKE_dsy_trans, - LAPACKE_dtb_nancheck, - LAPACKE_dtb_trans, - LAPACKE_dtf_nancheck, - LAPACKE_dtf_trans, - LAPACKE_dtp_nancheck, - LAPACKE_dtp_trans, - LAPACKE_dtr_nancheck, - LAPACKE_dtr_trans, - LAPACKE_lsame, - LAPACKE_sgb_nancheck, - LAPACKE_sgb_trans, - LAPACKE_sge_nancheck, - LAPACKE_sge_trans, - LAPACKE_sgg_nancheck, - LAPACKE_sgg_trans, - LAPACKE_sgt_nancheck, - LAPACKE_shs_nancheck, - LAPACKE_shs_trans, - LAPACKE_s_nancheck, - LAPACKE_spb_nancheck, - LAPACKE_spb_trans, - LAPACKE_spf_nancheck, - LAPACKE_spf_trans, - LAPACKE_spo_nancheck, - LAPACKE_spo_trans, - LAPACKE_spp_nancheck, - LAPACKE_spp_trans, - LAPACKE_spt_nancheck, - LAPACKE_ssb_nancheck, - LAPACKE_ssb_trans, - LAPACKE_ssp_nancheck, - LAPACKE_ssp_trans, - LAPACKE_sst_nancheck, - LAPACKE_ssy_nancheck, - LAPACKE_ssy_trans, - LAPACKE_stb_nancheck, - LAPACKE_stb_trans, - LAPACKE_stf_nancheck, - LAPACKE_stf_trans, - LAPACKE_stp_nancheck, - LAPACKE_stp_trans, - LAPACKE_str_nancheck, - LAPACKE_str_trans, - LAPACKE_xerbla, - LAPACKE_zgb_nancheck, - LAPACKE_zgb_trans, - 
LAPACKE_zge_nancheck, - LAPACKE_zge_trans, - LAPACKE_zgg_nancheck, - LAPACKE_zgg_trans, - LAPACKE_zgt_nancheck, - LAPACKE_zhb_nancheck, - LAPACKE_zhb_trans, - LAPACKE_zhe_nancheck, - LAPACKE_zhe_trans, - LAPACKE_zhp_nancheck, - LAPACKE_zhp_trans, - LAPACKE_zhs_nancheck, - LAPACKE_zhs_trans, - LAPACKE_z_nancheck, - LAPACKE_zpb_nancheck, - LAPACKE_zpb_trans, - LAPACKE_zpf_nancheck, - LAPACKE_zpf_trans, - LAPACKE_zpo_nancheck, - LAPACKE_zpo_trans, - LAPACKE_zpp_nancheck, - LAPACKE_zpp_trans, - LAPACKE_zpt_nancheck, - LAPACKE_zsp_nancheck, - LAPACKE_zsp_trans, - LAPACKE_zst_nancheck, - LAPACKE_zsy_nancheck, - LAPACKE_zsy_trans, - LAPACKE_ztb_nancheck, - LAPACKE_ztb_trans, - LAPACKE_ztf_nancheck, - LAPACKE_ztf_trans, - LAPACKE_ztp_nancheck, - LAPACKE_ztp_trans, - LAPACKE_ztr_nancheck, - LAPACKE_ztr_trans, - lapack_make_complex_float, - lapack_make_complex_double, - - # @(SRC_OBJ) from `lapack-3.5.0/lapacke/src/Makefile` LAPACKE_cbbcsd, LAPACKE_cbbcsd_work, LAPACKE_cbdsqr, @@ -1405,82 +1423,238 @@ LAPACKE_cupgtr_work, LAPACKE_cupmtr, LAPACKE_cupmtr_work, - LAPACKE_dbbcsd, - LAPACKE_dbbcsd_work, - LAPACKE_dbdsdc, - LAPACKE_dbdsdc_work, - LAPACKE_dbdsqr, - LAPACKE_dbdsqr_work, - LAPACKE_ddisna, - LAPACKE_ddisna_work, - LAPACKE_dgbbrd, - LAPACKE_dgbbrd_work, - LAPACKE_dgbcon, - LAPACKE_dgbcon_work, - LAPACKE_dgbequ, - LAPACKE_dgbequ_work, - LAPACKE_dgbequb, - LAPACKE_dgbequb_work, - LAPACKE_dgbrfs, - LAPACKE_dgbrfs_work, - LAPACKE_dgbsv, - LAPACKE_dgbsv_work, - LAPACKE_dgbsvx, - LAPACKE_dgbsvx_work, - LAPACKE_dgbtrf, - LAPACKE_dgbtrf_work, - LAPACKE_dgbtrs, - LAPACKE_dgbtrs_work, - LAPACKE_dgebak, - LAPACKE_dgebak_work, - LAPACKE_dgebal, - LAPACKE_dgebal_work, - LAPACKE_dgebrd, - LAPACKE_dgebrd_work, - LAPACKE_dgecon, - LAPACKE_dgecon_work, - LAPACKE_dgeequ, - LAPACKE_dgeequ_work, - LAPACKE_dgeequb, - LAPACKE_dgeequb_work, - LAPACKE_dgees, - LAPACKE_dgees_work, - LAPACKE_dgeesx, - LAPACKE_dgeesx_work, - LAPACKE_dgeev, - LAPACKE_dgeev_work, - LAPACKE_dgeevx, - 
LAPACKE_dgeevx_work, - LAPACKE_dgehrd, - LAPACKE_dgehrd_work, - LAPACKE_dgejsv, - LAPACKE_dgejsv_work, - LAPACKE_dgelq2, - LAPACKE_dgelq2_work, - LAPACKE_dgelqf, - LAPACKE_dgelqf_work, - LAPACKE_dgels, - LAPACKE_dgels_work, - LAPACKE_dgelsd, - LAPACKE_dgelsd_work, - LAPACKE_dgelss, - LAPACKE_dgelss_work, - LAPACKE_dgelsy, - LAPACKE_dgelsy_work, - LAPACKE_dgemqrt, - LAPACKE_dgemqrt_work, - LAPACKE_dgeqlf, - LAPACKE_dgeqlf_work, - LAPACKE_dgeqp3, - LAPACKE_dgeqp3_work, - LAPACKE_dgeqr2, - LAPACKE_dgeqr2_work, - LAPACKE_dgeqrf, - LAPACKE_dgeqrf_work, - LAPACKE_dgeqrfp, - LAPACKE_dgeqrfp_work, - LAPACKE_dgeqrt, - LAPACKE_dgeqrt2, + LAPACKE_csyr, + LAPACKE_csyr_work, + LAPACKE_clatms, + LAPACKE_clatms_work, + LAPACKE_clagge, + LAPACKE_clagge_work, + LAPACKE_claghe, + LAPACKE_claghe_work, + LAPACKE_clagsy, + LAPACKE_clagsy_work, + LAPACKE_cgejsv, + LAPACKE_cgejsv_work, + LAPACKE_cgesvdx, + LAPACKE_cgesvdx_work, + LAPACKE_cgesvj, + LAPACKE_cgesvj_work, + LAPACKE_cgetrf2, + LAPACKE_cgetrf2_work, + LAPACKE_cgges3, + LAPACKE_cgges3_work, + LAPACKE_cggev3, + LAPACKE_cggev3_work, + LAPACKE_cgghd3, + LAPACKE_cgghd3_work, + LAPACKE_cggsvd3, + LAPACKE_cggsvd3_work, + LAPACKE_cggsvp3, + LAPACKE_cggsvp3_work, + LAPACKE_chetrf_rook, + LAPACKE_chetrf_rook_work, + LAPACKE_chetrs_rook, + LAPACKE_chetrs_rook_work, + LAPACKE_clapmt, + LAPACKE_clapmt_work, + LAPACKE_clascl, + LAPACKE_clascl_work, + LAPACKE_cpotrf2, + LAPACKE_cpotrf2_work, + LAPACKE_csytrf_rook, + LAPACKE_csytrf_rook_work, + LAPACKE_csytrs_rook, + LAPACKE_csytrs_rook_work, + LAPACKE_cuncsd2by1, + LAPACKE_cuncsd2by1_work, + LAPACKE_cgelq, + LAPACKE_cgelq_work, + LAPACKE_cgemlq, + LAPACKE_cgemlq_work, + LAPACKE_cgemqr, + LAPACKE_cgemqr_work, + LAPACKE_cgeqr, + LAPACKE_cgeqr_work, + LAPACKE_cgetsls, + LAPACKE_cgetsls_work, + LAPACKE_chbev_2stage, + LAPACKE_chbev_2stage_work, + LAPACKE_chbevd_2stage, + LAPACKE_chbevd_2stage_work, + LAPACKE_chbevx_2stage, + LAPACKE_chbevx_2stage_work, + LAPACKE_checon_3, + 
LAPACKE_checon_3_work, + LAPACKE_cheev_2stage, + LAPACKE_cheev_2stage_work, + LAPACKE_cheevd_2stage, + LAPACKE_cheevd_2stage_work, + LAPACKE_cheevr_2stage, + LAPACKE_cheevr_2stage_work, + LAPACKE_cheevx_2stage, + LAPACKE_cheevx_2stage_work, + LAPACKE_chegv_2stage, + LAPACKE_chegv_2stage_work, + LAPACKE_chesv_aa, + LAPACKE_chesv_aa_work, + LAPACKE_chesv_rk, + LAPACKE_chesv_rk_work, + LAPACKE_chetrf_aa, + LAPACKE_chetrf_aa_work, + LAPACKE_chetrf_rk, + LAPACKE_chetrf_rk_work, + LAPACKE_chetri_3, + LAPACKE_chetri_3_work, + LAPACKE_chetrs_aa, + LAPACKE_chetrs_aa_work, + LAPACKE_chetrs_3, + LAPACKE_chetrs_3_work, + LAPACKE_csycon_3, + LAPACKE_csycon_3_work, + LAPACKE_csysv_aa, + LAPACKE_csysv_aa_work, + LAPACKE_csysv_rk, + LAPACKE_csysv_rk_work, + LAPACKE_csytrf_aa, + LAPACKE_csytrf_aa_work, + LAPACKE_csytrf_rk, + LAPACKE_csytrf_rk_work, + LAPACKE_csytri_3, + LAPACKE_csytri_3_work, + LAPACKE_csytrs_aa, + LAPACKE_csytrs_aa_work, + LAPACKE_csytrs_3, + LAPACKE_csytrs_3_work, + LAPACKE_chesv_aa_2stage, + LAPACKE_chesv_aa_2stage_work, + LAPACKE_chetrf_aa_2stage, + LAPACKE_chetrf_aa_2stage_work, + LAPACKE_chetrs_aa_2stage, + LAPACKE_chetrs_aa_2stage_work, + LAPACKE_clacrm, + LAPACKE_clacrm_work, + LAPACKE_clarcm, + LAPACKE_clarcm_work, + LAPACKE_classq, + LAPACKE_classq_work, + LAPACKE_csysv_aa_2stage, + LAPACKE_csysv_aa_2stage_work, + LAPACKE_csytrf_aa_2stage, + LAPACKE_csytrf_aa_2stage_work, + LAPACKE_csytrs_aa_2stage, + LAPACKE_csytrs_aa_2stage_work, +); +@lapackeobjsd = ( + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dgt_nancheck, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_d_nancheck, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dpt_nancheck, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsp_nancheck, + 
LAPACKE_dsp_trans, + LAPACKE_dst_nancheck, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtr_nancheck, + LAPACKE_dtr_trans, + LAPACKE_dbbcsd, + LAPACKE_dbbcsd_work, + LAPACKE_dbdsdc, + LAPACKE_dbdsdc_work, + LAPACKE_dbdsqr, + LAPACKE_dbdsqr_work, + LAPACKE_ddisna, + LAPACKE_ddisna_work, + LAPACKE_dgbbrd, + LAPACKE_dgbbrd_work, + LAPACKE_dgbcon, + LAPACKE_dgbcon_work, + LAPACKE_dgbequ, + LAPACKE_dgbequ_work, + LAPACKE_dgbequb, + LAPACKE_dgbequb_work, + LAPACKE_dgbrfs, + LAPACKE_dgbrfs_work, + LAPACKE_dgbsv, + LAPACKE_dgbsv_work, + LAPACKE_dgbsvx, + LAPACKE_dgbsvx_work, + LAPACKE_dgbtrf, + LAPACKE_dgbtrf_work, + LAPACKE_dgbtrs, + LAPACKE_dgbtrs_work, + LAPACKE_dgebak, + LAPACKE_dgebak_work, + LAPACKE_dgebal, + LAPACKE_dgebal_work, + LAPACKE_dgebrd, + LAPACKE_dgebrd_work, + LAPACKE_dgecon, + LAPACKE_dgecon_work, + LAPACKE_dgeequ, + LAPACKE_dgeequ_work, + LAPACKE_dgeequb, + LAPACKE_dgeequb_work, + LAPACKE_dgees, + LAPACKE_dgees_work, + LAPACKE_dgeesx, + LAPACKE_dgeesx_work, + LAPACKE_dgeev, + LAPACKE_dgeev_work, + LAPACKE_dgeevx, + LAPACKE_dgeevx_work, + LAPACKE_dgehrd, + LAPACKE_dgehrd_work, + LAPACKE_dgejsv, + LAPACKE_dgejsv_work, + LAPACKE_dgelq2, + LAPACKE_dgelq2_work, + LAPACKE_dgelqf, + LAPACKE_dgelqf_work, + LAPACKE_dgels, + LAPACKE_dgels_work, + LAPACKE_dgelsd, + LAPACKE_dgelsd_work, + LAPACKE_dgelss, + LAPACKE_dgelss_work, + LAPACKE_dgelsy, + LAPACKE_dgelsy_work, + LAPACKE_dgemqrt, + LAPACKE_dgemqrt_work, + LAPACKE_dgeqlf, + LAPACKE_dgeqlf_work, + LAPACKE_dgeqp3, + LAPACKE_dgeqp3_work, + LAPACKE_dgeqr2, + LAPACKE_dgeqr2_work, + LAPACKE_dgeqrf, + LAPACKE_dgeqrf_work, + LAPACKE_dgeqrfp, + LAPACKE_dgeqrfp_work, + LAPACKE_dgeqrt, + LAPACKE_dgeqrt2, LAPACKE_dgeqrt2_work, LAPACKE_dgeqrt3, LAPACKE_dgeqrt3_work, @@ -1889,31 +2063,155 @@ LAPACKE_dtrttp_work, LAPACKE_dtzrzf, LAPACKE_dtzrzf_work, - LAPACKE_sbbcsd, - 
LAPACKE_sbbcsd_work, - LAPACKE_sbdsdc, - LAPACKE_sbdsdc_work, - LAPACKE_sbdsqr, - LAPACKE_sbdsqr_work, - LAPACKE_sdisna, - LAPACKE_sdisna_work, - LAPACKE_sgbbrd, - LAPACKE_sgbbrd_work, - LAPACKE_sgbcon, - LAPACKE_sgbcon_work, - LAPACKE_sgbequ, - LAPACKE_sgbequ_work, - LAPACKE_sgbequb, - LAPACKE_sgbequb_work, - LAPACKE_sgbrfs, - LAPACKE_sgbrfs_work, - LAPACKE_sgbsv, - LAPACKE_sgbsv_work, - LAPACKE_sgbsvx, - LAPACKE_sgbsvx_work, - LAPACKE_sgbtrf, - LAPACKE_sgbtrf_work, - LAPACKE_sgbtrs, + LAPACKE_dlatms, + LAPACKE_dlatms_work, + LAPACKE_dlagge, + LAPACKE_dlagge_work, + LAPACKE_dlagsy, + LAPACKE_dlagsy_work, + LAPACKE_dbdsvdx, + LAPACKE_dbdsvdx_work, + LAPACKE_dgesvdx, + LAPACKE_dgesvdx_work, + LAPACKE_dgetrf2, + LAPACKE_dgetrf2_work, + LAPACKE_dgges3, + LAPACKE_dgges3_work, + LAPACKE_dggev3, + LAPACKE_dggev3_work, + LAPACKE_dgghd3, + LAPACKE_dgghd3_work, + LAPACKE_dggsvd3, + LAPACKE_dggsvd3_work, + LAPACKE_dggsvp3, + LAPACKE_dggsvp3_work, + LAPACKE_dlapmt, + LAPACKE_dlapmt_work, + LAPACKE_dlascl, + LAPACKE_dlascl_work, + LAPACKE_dorcsd2by1, + LAPACKE_dorcsd2by1_work, + LAPACKE_dpotrf2, + LAPACKE_dpotrf2_work, + LAPACKE_dsytrf_rook, + LAPACKE_dsytrf_rook_work, + LAPACKE_dsytrs_rook, + LAPACKE_dsytrs_rook_work, + LAPACKE_dgelq, + LAPACKE_dgelq_work, + LAPACKE_dgemlq, + LAPACKE_dgemlq_work, + LAPACKE_dgemqr, + LAPACKE_dgemqr_work, + LAPACKE_dgeqr, + LAPACKE_dgeqr_work, + LAPACKE_dgetsls, + LAPACKE_dgetsls_work, + LAPACKE_dsbev_2stage, + LAPACKE_dsbev_2stage_work, + LAPACKE_dsbevd_2stage, + LAPACKE_dsbevd_2stage_work, + LAPACKE_dsbevx_2stage, + LAPACKE_dsbevx_2stage_work, + LAPACKE_dsycon_3, + LAPACKE_dsycon_3_work, + LAPACKE_dsyev_2stage, + LAPACKE_dsyev_2stage_work, + LAPACKE_dsyevd_2stage, + LAPACKE_dsyevd_2stage_work, + LAPACKE_dsyevr_2stage, + LAPACKE_dsyevr_2stage_work, + LAPACKE_dsyevx_2stage, + LAPACKE_dsyevx_2stage_work, + LAPACKE_dsygv_2stage, + LAPACKE_dsygv_2stage_work, + LAPACKE_dsysv_aa, + LAPACKE_dsysv_aa_work, + LAPACKE_dsysv_rk, + LAPACKE_dsysv_rk_work, 
+ LAPACKE_dsytrf_aa, + LAPACKE_dsytrf_aa_work, + LAPACKE_dsytrf_rk, + LAPACKE_dsytrf_rk_work, + LAPACKE_dsytri_3, + LAPACKE_dsytri_3_work, + LAPACKE_dsytrs_aa, + LAPACKE_dsytrs_aa_work, + LAPACKE_dsytrs_3, + LAPACKE_dsytrs_3_work, + LAPACKE_dlassq, + LAPACKE_dlassq_work, + LAPACKE_dsysv_aa_2stage, + LAPACKE_dsysv_aa_2stage_work, + LAPACKE_dsytrf_aa_2stage, + LAPACKE_dsytrf_aa_2stage_work, + LAPACKE_dsytrs_aa_2stage, + LAPACKE_dsytrs_aa_2stage_work, + LAPACKE_dgesvdq, + LAPACKE_dgesvdq_work, + LAPACKE_slag2d, + LAPACKE_slag2d_work, +); +@lapackeobjss = ( + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sgt_nancheck, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_s_nancheck, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spf_nancheck, + LAPACKE_spf_trans, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_spt_nancheck, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sst_nancheck, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_sbbcsd, + LAPACKE_sbbcsd_work, + LAPACKE_sbdsdc, + LAPACKE_sbdsdc_work, + LAPACKE_sbdsqr, + LAPACKE_sbdsqr_work, + LAPACKE_sdisna, + LAPACKE_sdisna_work, + LAPACKE_sgbbrd, + LAPACKE_sgbbrd_work, + LAPACKE_sgbcon, + LAPACKE_sgbcon_work, + LAPACKE_sgbequ, + LAPACKE_sgbequ_work, + LAPACKE_sgbequb, + LAPACKE_sgbequb_work, + LAPACKE_sgbrfs, + LAPACKE_sgbrfs_work, + LAPACKE_sgbsv, + LAPACKE_sgbsv_work, + LAPACKE_sgbsvx, + LAPACKE_sgbsvx_work, + LAPACKE_sgbtrf, + LAPACKE_sgbtrf_work, + LAPACKE_sgbtrs, LAPACKE_sgbtrs_work, LAPACKE_sgebak, LAPACKE_sgebak_work, @@ -2035,8 +2333,6 @@ LAPACKE_slacn2_work, LAPACKE_slacpy, LAPACKE_slacpy_work, - LAPACKE_slag2d, - 
LAPACKE_slag2d_work, LAPACKE_slamch, LAPACKE_slamch_work, LAPACKE_slange, @@ -2367,112 +2663,240 @@ LAPACKE_strttp_work, LAPACKE_stzrzf, LAPACKE_stzrzf_work, - LAPACKE_zbbcsd, - LAPACKE_zbbcsd_work, - LAPACKE_zbdsqr, - LAPACKE_zbdsqr_work, - LAPACKE_zcgesv, - LAPACKE_zcgesv_work, - LAPACKE_zcposv, - LAPACKE_zcposv_work, - LAPACKE_zgbbrd, - LAPACKE_zgbbrd_work, - LAPACKE_zgbcon, - LAPACKE_zgbcon_work, - LAPACKE_zgbequ, - LAPACKE_zgbequ_work, - LAPACKE_zgbequb, - LAPACKE_zgbequb_work, - LAPACKE_zgbrfs, - LAPACKE_zgbrfs_work, - LAPACKE_zgbsv, - LAPACKE_zgbsv_work, - LAPACKE_zgbsvx, - LAPACKE_zgbsvx_work, - LAPACKE_zgbtrf, - LAPACKE_zgbtrf_work, - LAPACKE_zgbtrs, - LAPACKE_zgbtrs_work, - LAPACKE_zgebak, - LAPACKE_zgebak_work, - LAPACKE_zgebal, - LAPACKE_zgebal_work, - LAPACKE_zgebrd, - LAPACKE_zgebrd_work, - LAPACKE_zgecon, - LAPACKE_zgecon_work, - LAPACKE_zgeequ, - LAPACKE_zgeequ_work, - LAPACKE_zgeequb, - LAPACKE_zgeequb_work, - LAPACKE_zgees, - LAPACKE_zgees_work, - LAPACKE_zgeesx, - LAPACKE_zgeesx_work, - LAPACKE_zgeev, - LAPACKE_zgeev_work, - LAPACKE_zgeevx, - LAPACKE_zgeevx_work, - LAPACKE_zgehrd, - LAPACKE_zgehrd_work, - LAPACKE_zgelq2, - LAPACKE_zgelq2_work, - LAPACKE_zgelqf, - LAPACKE_zgelqf_work, - LAPACKE_zgels, - LAPACKE_zgels_work, - LAPACKE_zgelsd, - LAPACKE_zgelsd_work, - LAPACKE_zgelss, - LAPACKE_zgelss_work, - LAPACKE_zgelsy, - LAPACKE_zgelsy_work, - LAPACKE_zgemqrt, - LAPACKE_zgemqrt_work, - LAPACKE_zgeqlf, - LAPACKE_zgeqlf_work, - LAPACKE_zgeqp3, - LAPACKE_zgeqp3_work, - LAPACKE_zgeqr2, - LAPACKE_zgeqr2_work, - LAPACKE_zgeqrf, - LAPACKE_zgeqrf_work, - LAPACKE_zgeqrfp, - LAPACKE_zgeqrfp_work, - LAPACKE_zgeqrt, - LAPACKE_zgeqrt2, - LAPACKE_zgeqrt2_work, - LAPACKE_zgeqrt3, - LAPACKE_zgeqrt3_work, - LAPACKE_zgeqrt_work, - LAPACKE_zgerfs, - LAPACKE_zgerfs_work, - LAPACKE_zgerqf, - LAPACKE_zgerqf_work, - LAPACKE_zgesdd, - LAPACKE_zgesdd_work, - LAPACKE_zgesv, - LAPACKE_zgesv_work, - LAPACKE_zgesvd, - LAPACKE_zgesvd_work, - LAPACKE_zgesvx, - 
LAPACKE_zgesvx_work, - LAPACKE_zgetf2, - LAPACKE_zgetf2_work, - LAPACKE_zgetrf, - LAPACKE_zgetrf_work, - LAPACKE_zgetri, - LAPACKE_zgetri_work, - LAPACKE_zgetrs, - LAPACKE_zgetrs_work, - LAPACKE_zggbak, - LAPACKE_zggbak_work, - LAPACKE_zggbal, - LAPACKE_zggbal_work, - LAPACKE_zgges, - LAPACKE_zgges_work, - LAPACKE_zggesx, - LAPACKE_zggesx_work, + LAPACKE_slatms, + LAPACKE_slatms_work, + LAPACKE_slagge, + LAPACKE_slagge_work, + LAPACKE_slagsy, + LAPACKE_slagsy_work, + LAPACKE_sbdsvdx, + LAPACKE_sbdsvdx_work, + LAPACKE_sgesvdx, + LAPACKE_sgesvdx_work, + LAPACKE_sgetrf2, + LAPACKE_sgetrf2_work, + LAPACKE_sgges3, + LAPACKE_sgges3_work, + LAPACKE_sggev3, + LAPACKE_sggev3_work, + LAPACKE_sgghd3, + LAPACKE_sgghd3_work, + LAPACKE_sggsvd3, + LAPACKE_sggsvd3_work, + LAPACKE_sggsvp3, + LAPACKE_sggsvp3_work, + LAPACKE_slapmt, + LAPACKE_slapmt_work, + LAPACKE_slascl, + LAPACKE_slascl_work, + LAPACKE_sorcsd2by1, + LAPACKE_sorcsd2by1_work, + LAPACKE_spotrf2, + LAPACKE_spotrf2_work, + LAPACKE_ssytrf_rook, + LAPACKE_ssytrf_rook_work, + LAPACKE_ssytrs_rook, + LAPACKE_ssytrs_rook_work, + LAPACKE_stpqrt, + LAPACKE_stpqrt_work, + LAPACKE_sgelq, + LAPACKE_sgelq_work, + LAPACKE_sgemlq, + LAPACKE_sgemlq_work, + LAPACKE_sgemqr, + LAPACKE_sgemqr_work, + LAPACKE_sgeqr, + LAPACKE_sgeqr_work, + LAPACKE_sgetsls, + LAPACKE_sgetsls_work, + LAPACKE_ssbev_2stage, + LAPACKE_ssbev_2stage_work, + LAPACKE_ssbevd_2stage, + LAPACKE_ssbevd_2stage_work, + LAPACKE_ssbevx_2stage, + LAPACKE_ssbevx_2stage_work, + LAPACKE_ssycon_3, + LAPACKE_ssycon_3_work, + LAPACKE_ssyev_2stage, + LAPACKE_ssyev_2stage_work, + LAPACKE_ssyevd_2stage, + LAPACKE_ssyevd_2stage_work, + LAPACKE_ssyevr_2stage, + LAPACKE_ssyevr_2stage_work, + LAPACKE_ssyevx_2stage, + LAPACKE_ssyevx_2stage_work, + LAPACKE_ssygv_2stage, + LAPACKE_ssygv_2stage_work, + LAPACKE_ssysv_aa, + LAPACKE_ssysv_aa_work, + LAPACKE_ssysv_rk, + LAPACKE_ssysv_rk_work, + LAPACKE_ssytrf_aa, + LAPACKE_ssytrf_aa_work, + LAPACKE_ssytrf_rk, + LAPACKE_ssytrf_rk_work, + 
LAPACKE_ssytri_3, + LAPACKE_ssytri_3_work, + LAPACKE_ssytrs_aa, + LAPACKE_ssytrs_aa_work, + LAPACKE_ssytrs_3, + LAPACKE_ssytrs_3_work, + LAPACKE_slassq, + LAPACKE_slassq_work, + LAPACKE_ssysv_aa_2stage, + LAPACKE_ssysv_aa_2stage_work, + LAPACKE_ssytrf_aa_2stage, + LAPACKE_ssytrf_aa_2stage_work, + LAPACKE_ssytrs_aa_2stage, + LAPACKE_ssytrs_aa_2stage_work, + LAPACKE_sgesvdq, + LAPACKE_sgesvdq_work, +); +@lapackeobjsz = ( + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zgt_nancheck, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_z_nancheck, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpo_nancheck, + LAPACKE_zpo_trans, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zpt_nancheck, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zst_nancheck, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + LAPACKE_zbbcsd, + LAPACKE_zbbcsd_work, + LAPACKE_zbdsqr, + LAPACKE_zbdsqr_work, + LAPACKE_zcgesv, + LAPACKE_zcgesv_work, + LAPACKE_zcposv, + LAPACKE_zcposv_work, + LAPACKE_zgbbrd, + LAPACKE_zgbbrd_work, + LAPACKE_zgbcon, + LAPACKE_zgbcon_work, + LAPACKE_zgbequ, + LAPACKE_zgbequ_work, + LAPACKE_zgbequb, + LAPACKE_zgbequb_work, + LAPACKE_zgbrfs, + LAPACKE_zgbrfs_work, + LAPACKE_zgbsv, + LAPACKE_zgbsv_work, + LAPACKE_zgbsvx, + LAPACKE_zgbsvx_work, + LAPACKE_zgbtrf, + LAPACKE_zgbtrf_work, + LAPACKE_zgbtrs, + LAPACKE_zgbtrs_work, + LAPACKE_zgebak, + LAPACKE_zgebak_work, + LAPACKE_zgebal, + LAPACKE_zgebal_work, + LAPACKE_zgebrd, + LAPACKE_zgebrd_work, + LAPACKE_zgecon, + LAPACKE_zgecon_work, + LAPACKE_zgeequ, + 
LAPACKE_zgeequ_work, + LAPACKE_zgeequb, + LAPACKE_zgeequb_work, + LAPACKE_zgees, + LAPACKE_zgees_work, + LAPACKE_zgeesx, + LAPACKE_zgeesx_work, + LAPACKE_zgeev, + LAPACKE_zgeev_work, + LAPACKE_zgeevx, + LAPACKE_zgeevx_work, + LAPACKE_zgehrd, + LAPACKE_zgehrd_work, + LAPACKE_zgelq2, + LAPACKE_zgelq2_work, + LAPACKE_zgelqf, + LAPACKE_zgelqf_work, + LAPACKE_zgels, + LAPACKE_zgels_work, + LAPACKE_zgelsd, + LAPACKE_zgelsd_work, + LAPACKE_zgelss, + LAPACKE_zgelss_work, + LAPACKE_zgelsy, + LAPACKE_zgelsy_work, + LAPACKE_zgemqrt, + LAPACKE_zgemqrt_work, + LAPACKE_zgeqlf, + LAPACKE_zgeqlf_work, + LAPACKE_zgeqp3, + LAPACKE_zgeqp3_work, + LAPACKE_zgeqr2, + LAPACKE_zgeqr2_work, + LAPACKE_zgeqrf, + LAPACKE_zgeqrf_work, + LAPACKE_zgeqrfp, + LAPACKE_zgeqrfp_work, + LAPACKE_zgeqrt, + LAPACKE_zgeqrt2, + LAPACKE_zgeqrt2_work, + LAPACKE_zgeqrt3, + LAPACKE_zgeqrt3_work, + LAPACKE_zgeqrt_work, + LAPACKE_zgerfs, + LAPACKE_zgerfs_work, + LAPACKE_zgerqf, + LAPACKE_zgerqf_work, + LAPACKE_zgesdd, + LAPACKE_zgesdd_work, + LAPACKE_zgesv, + LAPACKE_zgesv_work, + LAPACKE_zgesvd, + LAPACKE_zgesvd_work, + LAPACKE_zgesvx, + LAPACKE_zgesvx_work, + LAPACKE_zgetf2, + LAPACKE_zgetf2_work, + LAPACKE_zgetrf, + LAPACKE_zgetrf_work, + LAPACKE_zgetri, + LAPACKE_zgetri_work, + LAPACKE_zgetrs, + LAPACKE_zgetrs_work, + LAPACKE_zggbak, + LAPACKE_zggbak_work, + LAPACKE_zggbal, + LAPACKE_zggbal_work, + LAPACKE_zgges, + LAPACKE_zgges_work, + LAPACKE_zggesx, + LAPACKE_zggesx_work, LAPACKE_zggev, LAPACKE_zggev_work, LAPACKE_zggevx, @@ -2864,11 +3288,7 @@ LAPACKE_zupmtr, LAPACKE_zupmtr_work, LAPACKE_zsyr, - LAPACKE_csyr, LAPACKE_zsyr_work, - LAPACKE_csyr_work, - LAPACKE_ilaver, - ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the ## corresponding LAPACK extended precision routines. 
@@ -2948,128 +3368,15 @@ ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg ## (see `lapack-3.4.1/TESTING/MATGEN`). - LAPACKE_clatms, - LAPACKE_clatms_work, - LAPACKE_dlatms, - LAPACKE_dlatms_work, - LAPACKE_slatms, - LAPACKE_slatms_work, LAPACKE_zlatms, LAPACKE_zlatms_work, - LAPACKE_clagge, - LAPACKE_clagge_work, - LAPACKE_dlagge, - LAPACKE_dlagge_work, - LAPACKE_slagge, - LAPACKE_slagge_work, LAPACKE_zlagge, LAPACKE_zlagge_work, - LAPACKE_claghe, - LAPACKE_claghe_work, LAPACKE_zlaghe, LAPACKE_zlaghe_work, - LAPACKE_clagsy, - LAPACKE_clagsy_work, - LAPACKE_dlagsy, - LAPACKE_dlagsy_work, - LAPACKE_slagsy, - LAPACKE_slagsy_work, LAPACKE_zlagsy, LAPACKE_zlagsy_work, ## new function from lapack-3.6.0 - - LAPACKE_cgejsv, - LAPACKE_cgejsv_work, - LAPACKE_cgesvdx, - LAPACKE_cgesvdx_work, - LAPACKE_cgesvj, - LAPACKE_cgesvj_work, - LAPACKE_cgetrf2, - LAPACKE_cgetrf2_work, - LAPACKE_cgges3, - LAPACKE_cgges3_work, - LAPACKE_cggev3, - LAPACKE_cggev3_work, - LAPACKE_cgghd3, - LAPACKE_cgghd3_work, - LAPACKE_cggsvd3, - LAPACKE_cggsvd3_work, - LAPACKE_cggsvp3, - LAPACKE_cggsvp3_work, - LAPACKE_chetrf_rook, - LAPACKE_chetrf_rook_work, - LAPACKE_chetrs_rook, - LAPACKE_chetrs_rook_work, - LAPACKE_clapmt, - LAPACKE_clapmt_work, - LAPACKE_clascl, - LAPACKE_clascl_work, - LAPACKE_cpotrf2, - LAPACKE_cpotrf2_work, - LAPACKE_csytrf_rook, - LAPACKE_csytrf_rook_work, - LAPACKE_csytrs_rook, - LAPACKE_csytrs_rook_work, - LAPACKE_cuncsd2by1, - LAPACKE_cuncsd2by1_work, - LAPACKE_dbdsvdx, - LAPACKE_dbdsvdx_work, - LAPACKE_dgesvdx, - LAPACKE_dgesvdx_work, - LAPACKE_dgetrf2, - LAPACKE_dgetrf2_work, - LAPACKE_dgges3, - LAPACKE_dgges3_work, - LAPACKE_dggev3, - LAPACKE_dggev3_work, - LAPACKE_dgghd3, - LAPACKE_dgghd3_work, - LAPACKE_dggsvd3, - LAPACKE_dggsvd3_work, - LAPACKE_dggsvp3, - LAPACKE_dggsvp3_work, - LAPACKE_dlapmt, - LAPACKE_dlapmt_work, - LAPACKE_dlascl, - LAPACKE_dlascl_work, - LAPACKE_dorcsd2by1, - 
LAPACKE_dorcsd2by1_work, - LAPACKE_dpotrf2, - LAPACKE_dpotrf2_work, - LAPACKE_dsytrf_rook, - LAPACKE_dsytrf_rook_work, - LAPACKE_dsytrs_rook, - LAPACKE_dsytrs_rook_work, - LAPACKE_sbdsvdx, - LAPACKE_sbdsvdx_work, - LAPACKE_sgesvdx, - LAPACKE_sgesvdx_work, - LAPACKE_sgetrf2, - LAPACKE_sgetrf2_work, - LAPACKE_sgges3, - LAPACKE_sgges3_work, - LAPACKE_sggev3, - LAPACKE_sggev3_work, - LAPACKE_sgghd3, - LAPACKE_sgghd3_work, - LAPACKE_sggsvd3, - LAPACKE_sggsvd3_work, - LAPACKE_sggsvp3, - LAPACKE_sggsvp3_work, - LAPACKE_slapmt, - LAPACKE_slapmt_work, - LAPACKE_slascl, - LAPACKE_slascl_work, - LAPACKE_sorcsd2by1, - LAPACKE_sorcsd2by1_work, - LAPACKE_spotrf2, - LAPACKE_spotrf2_work, - LAPACKE_ssytrf_rook, - LAPACKE_ssytrf_rook_work, - LAPACKE_ssytrs_rook, - LAPACKE_ssytrs_rook_work, - LAPACKE_stpqrt, - LAPACKE_stpqrt_work, LAPACKE_zgejsv, LAPACKE_zgejsv_work, LAPACKE_zgesvdx, @@ -3106,148 +3413,6 @@ LAPACKE_zuncsd2by1_work, ## new function from lapack-3.7.0 - LAPACKE_cgelq, - LAPACKE_cgelq_work, - LAPACKE_cgemlq, - LAPACKE_cgemlq_work, - LAPACKE_cgemqr, - LAPACKE_cgemqr_work, - LAPACKE_cgeqr, - LAPACKE_cgeqr_work, - LAPACKE_cgetsls, - LAPACKE_cgetsls_work, - LAPACKE_chbev_2stage, - LAPACKE_chbev_2stage_work, - LAPACKE_chbevd_2stage, - LAPACKE_chbevd_2stage_work, - LAPACKE_chbevx_2stage, - LAPACKE_chbevx_2stage_work, - LAPACKE_checon_3, - LAPACKE_checon_3_work, - LAPACKE_cheev_2stage, - LAPACKE_cheev_2stage_work, - LAPACKE_cheevd_2stage, - LAPACKE_cheevd_2stage_work, - LAPACKE_cheevr_2stage, - LAPACKE_cheevr_2stage_work, - LAPACKE_cheevx_2stage, - LAPACKE_cheevx_2stage_work, - LAPACKE_chegv_2stage, - LAPACKE_chegv_2stage_work, - LAPACKE_chesv_aa, - LAPACKE_chesv_aa_work, - LAPACKE_chesv_rk, - LAPACKE_chesv_rk_work, - LAPACKE_chetrf_aa, - LAPACKE_chetrf_aa_work, - LAPACKE_chetrf_rk, - LAPACKE_chetrf_rk_work, - LAPACKE_chetri_3, - LAPACKE_chetri_3_work, - LAPACKE_chetrs_aa, - LAPACKE_chetrs_aa_work, - LAPACKE_chetrs_3, - LAPACKE_chetrs_3_work, - LAPACKE_csycon_3, - 
LAPACKE_csycon_3_work, - LAPACKE_csysv_aa, - LAPACKE_csysv_aa_work, - LAPACKE_csysv_rk, - LAPACKE_csysv_rk_work, - LAPACKE_csytrf_aa, - LAPACKE_csytrf_aa_work, - LAPACKE_csytrf_rk, - LAPACKE_csytrf_rk_work, - LAPACKE_csytri_3, - LAPACKE_csytri_3_work, - LAPACKE_csytrs_aa, - LAPACKE_csytrs_aa_work, - LAPACKE_csytrs_3, - LAPACKE_csytrs_3_work, - LAPACKE_dgelq, - LAPACKE_dgelq_work, - LAPACKE_dgemlq, - LAPACKE_dgemlq_work, - LAPACKE_dgemqr, - LAPACKE_dgemqr_work, - LAPACKE_dgeqr, - LAPACKE_dgeqr_work, - LAPACKE_dgetsls, - LAPACKE_dgetsls_work, - LAPACKE_dsbev_2stage, - LAPACKE_dsbev_2stage_work, - LAPACKE_dsbevd_2stage, - LAPACKE_dsbevd_2stage_work, - LAPACKE_dsbevx_2stage, - LAPACKE_dsbevx_2stage_work, - LAPACKE_dsycon_3, - LAPACKE_dsycon_3_work, - LAPACKE_dsyev_2stage, - LAPACKE_dsyev_2stage_work, - LAPACKE_dsyevd_2stage, - LAPACKE_dsyevd_2stage_work, - LAPACKE_dsyevr_2stage, - LAPACKE_dsyevr_2stage_work, - LAPACKE_dsyevx_2stage, - LAPACKE_dsyevx_2stage_work, - LAPACKE_dsygv_2stage, - LAPACKE_dsygv_2stage_work, - LAPACKE_dsysv_aa, - LAPACKE_dsysv_aa_work, - LAPACKE_dsysv_rk, - LAPACKE_dsysv_rk_work, - LAPACKE_dsytrf_aa, - LAPACKE_dsytrf_aa_work, - LAPACKE_dsytrf_rk, - LAPACKE_dsytrf_rk_work, - LAPACKE_dsytri_3, - LAPACKE_dsytri_3_work, - LAPACKE_dsytrs_aa, - LAPACKE_dsytrs_aa_work, - LAPACKE_dsytrs_3, - LAPACKE_dsytrs_3_work, - LAPACKE_sgelq, - LAPACKE_sgelq_work, - LAPACKE_sgemlq, - LAPACKE_sgemlq_work, - LAPACKE_sgemqr, - LAPACKE_sgemqr_work, - LAPACKE_sgeqr, - LAPACKE_sgeqr_work, - LAPACKE_sgetsls, - LAPACKE_sgetsls_work, - LAPACKE_ssbev_2stage, - LAPACKE_ssbev_2stage_work, - LAPACKE_ssbevd_2stage, - LAPACKE_ssbevd_2stage_work, - LAPACKE_ssbevx_2stage, - LAPACKE_ssbevx_2stage_work, - LAPACKE_ssycon_3, - LAPACKE_ssycon_3_work, - LAPACKE_ssyev_2stage, - LAPACKE_ssyev_2stage_work, - LAPACKE_ssyevd_2stage, - LAPACKE_ssyevd_2stage_work, - LAPACKE_ssyevr_2stage, - LAPACKE_ssyevr_2stage_work, - LAPACKE_ssyevx_2stage, - LAPACKE_ssyevx_2stage_work, - LAPACKE_ssygv_2stage, 
- LAPACKE_ssygv_2stage_work, - LAPACKE_ssysv_aa, - LAPACKE_ssysv_aa_work, - LAPACKE_ssysv_rk, - LAPACKE_ssysv_rk_work, - LAPACKE_ssytrf_aa, - LAPACKE_ssytrf_aa_work, - LAPACKE_ssytrf_rk, - LAPACKE_ssytrf_rk_work, - LAPACKE_ssytri_3, - LAPACKE_ssytri_3_work, - LAPACKE_ssytrs_aa, - LAPACKE_ssytrs_aa_work, - LAPACKE_ssytrs_3, - LAPACKE_ssytrs_3_work, LAPACKE_zgelq, LAPACKE_zgelq_work, LAPACKE_zgemlq, @@ -3308,42 +3473,6 @@ LAPACKE_zsytrs_3_work, ## new function from lapack-3.8.0 - LAPACKE_chesv_aa_2stage, - LAPACKE_chesv_aa_2stage_work, - LAPACKE_chetrf_aa_2stage, - LAPACKE_chetrf_aa_2stage_work, - LAPACKE_chetrs_aa_2stage, - LAPACKE_chetrs_aa_2stage_work, - LAPACKE_clacrm, - LAPACKE_clacrm_work, - LAPACKE_clarcm, - LAPACKE_clarcm_work, - LAPACKE_classq, - LAPACKE_classq_work, - LAPACKE_csysv_aa_2stage, - LAPACKE_csysv_aa_2stage_work, - LAPACKE_csytrf_aa_2stage, - LAPACKE_csytrf_aa_2stage_work, - LAPACKE_csytrs_aa_2stage, - LAPACKE_csytrs_aa_2stage_work, - LAPACKE_dlassq, - LAPACKE_dlassq_work, - LAPACKE_dsysv_aa_2stage, - LAPACKE_dsysv_aa_2stage_work, - LAPACKE_dsytrf_aa_2stage, - LAPACKE_dsytrf_aa_2stage_work, - LAPACKE_dsytrs_aa_2stage, - LAPACKE_dsytrs_aa_2stage_work, - LAPACKE_get_nancheck, - LAPACKE_set_nancheck, - LAPACKE_slassq, - LAPACKE_slassq_work, - LAPACKE_ssysv_aa_2stage, - LAPACKE_ssysv_aa_2stage_work, - LAPACKE_ssytrf_aa_2stage, - LAPACKE_ssytrf_aa_2stage_work, - LAPACKE_ssytrs_aa_2stage, - LAPACKE_ssytrs_aa_2stage_work, LAPACKE_zhesv_aa_2stage, LAPACKE_zhesv_aa_2stage_work, LAPACKE_zhetrf_aa_2stage, @@ -3362,36 +3491,19 @@ LAPACKE_zsytrf_aa_2stage_work, LAPACKE_zsytrs_aa_2stage, LAPACKE_zsytrs_aa_2stage_work, - # new functions from 3.9.0 - LAPACKE_dgesvdq, - LAPACKE_dgesvdq_work, - LAPACKE_sgesvdq, - LAPACKE_sgesvdq_work, LAPACKE_zgesvdq, LAPACKE_zgesvdq_work - ); #These function may need 2 underscores. 
@lapack_embeded_underscore_objs=( - xerbla_array, chla_transtype, slasyf_rook, + xerbla_array, chla_transtype, + ); +@lapack_embeded_underscore_objs_s=( + slasyf_rook, ssytf2_rook, ssytrf_rook, ssytrs_rook, ssytri_rook, ssycon_rook, ssysv_rook, - chetf2_rook, chetrf_rook, chetri_rook, - chetrs_rook, checon_rook, chesv_rook, - clahef_rook, clasyf_rook, - csytf2_rook, csytrf_rook, csytrs_rook, - csytri_rook, csycon_rook, csysv_rook, - dlasyf_rook, - dsytf2_rook, dsytrf_rook, dsytrs_rook, - dsytri_rook, dsycon_rook, dsysv_rook, - zhetf2_rook, zhetrf_rook, zhetri_rook, - zhetrs_rook, zhecon_rook, zhesv_rook, - zlahef_rook, zlasyf_rook, - zsytf2_rook, zsytrf_rook, zsytrs_rook, - zsytri_rook, zsycon_rook, zsysv_rook, -# 3.7.0 slasyf_rk, ssyconvf_rook, ssytf2_rk, ssytrf_rk, ssytrs_3, ssytri_3, ssytri_3x, ssycon_3, ssysv_rk, @@ -3400,15 +3512,18 @@ ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage, ssyev_2stage, ssyevx_2stage, ssyevr_2stage, ssbev_2stage, ssbevx_2stage, ssbevd_2stage, - ssygv_2stage, dlasyf_rk, dsyconvf_rook, - dsytf2_rk, dsytrf_rk, dsytrs_3, - dsytri_3, dsytri_3x, dsycon_3, - dsysv_rk, dlasyf_aa, dsysv_aa, - dsytrf_aa, dsytrs_aa, dsytrd_2stage, - dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels, - dsyevd_2stage, dsyev_2stage, dsyevx_2stage, - dsyevr_2stage, dsbev_2stage, dsbevx_2stage, - dsbevd_2stage, dsygv_2stage, chetf2_rk, + ssygv_2stage, + ssysv_aa_2stage, ssytrf_aa_2stage, + ssytrs_aa_2stage, + slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, +); +@lapack_embeded_underscore_objs_c=( + chetf2_rook, chetrf_rook, chetri_rook, + chetrs_rook, checon_rook, chesv_rook, + clahef_rook, clasyf_rook, + csytf2_rook, csytrf_rook, csytrs_rook, + csytri_rook, csycon_rook, csysv_rook, + chetf2_rk, chetrf_rk, chetri_3, chetri_3x, chetrs_3, checon_3, chesv_rk, chesv_aa, chetrf_aa, chetrs_aa, @@ -3421,6 +3536,35 @@ chb2st_kernels, cheevd_2stage, cheev_2stage, cheevx_2stage, cheevr_2stage, chbev_2stage, chbevx_2stage, chbevd_2stage, chegv_2stage, + chesv_aa_2stage, + 
chetrf_aa_2stage, chetrs_aa_2stage, + csysv_aa_2stage, csytrf_aa_2stage, + csytrs_aa_2stage, + claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, +); +@lapack_embeded_underscore_objs_d=( + dlasyf_rook, + dsytf2_rook, dsytrf_rook, dsytrs_rook, + dsytri_rook, dsycon_rook, dsysv_rook, + dlasyf_rk, dsyconvf_rook, + dsytf2_rk, dsytrf_rk, dsytrs_3, + dsytri_3, dsytri_3x, dsycon_3, + dsysv_rk, dlasyf_aa, dsysv_aa, + dsytrf_aa, dsytrs_aa, dsytrd_2stage, + dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels, + dsyevd_2stage, dsyev_2stage, dsyevx_2stage, + dsyevr_2stage, dsbev_2stage, dsbevx_2stage, + dsbevd_2stage, dsygv_2stage, + dsysv_aa_2stage, + dsytrf_aa_2stage, dsytrs_aa_2stage, + dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, +); +@lapack_embeded_underscore_objs_z=( + zhetf2_rook, zhetrf_rook, zhetri_rook, + zhetrs_rook, zhecon_rook, zhesv_rook, + zlahef_rook, zlasyf_rook, + zsytf2_rook, zsytrf_rook, zsytrs_rook, + zsytri_rook, zsycon_rook, zsysv_rook, zhetf2_rk, zhetrf_rk, zhetri_3, zhetri_3x, zhetrs_3, zhecon_3, zhesv_rk, zhesv_aa, zhetrf_aa, @@ -3434,22 +3578,10 @@ zheev_2stage, zheevx_2stage, zheevr_2stage, zhbev_2stage, zhbevx_2stage, zhbevd_2stage, zhegv_2stage, -# 3.8.0 - ssysv_aa_2stage, ssytrf_aa_2stage, - ssytrs_aa_2stage, chesv_aa_2stage, - chetrf_aa_2stage, chetrs_aa_2stage, - csysv_aa_2stage, csytrf_aa_2stage, - csytrs_aa_2stage, dsysv_aa_2stage, - dsytrf_aa_2stage, dsytrs_aa_2stage, zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, zsytrf_aa_2stage, zsytrs_aa_2stage, -# 3.9.0 - claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, - dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, - slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col - ); @@ -3461,6 +3593,42 @@ if ($ARGV[12] == 1) { @blasobjs = (@blasobjs, @halfblasobjs); @cblasobjs = (@cblasobjs, @halfcblasobjs); } +if ($ARGV[13] == 1) { + @blasobjs = (@blasobjs, @blasobjss); + @cblasobjs = (@cblasobjs, @cblasobjss); + 
@lapackobjs = (@lapackobjs, @lapackobjss); + @lapack2objs = (@lapack2objs, @lapack2objss); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); + @lapackeobjs = (@lapackeobjs, @lapackeobjss); +} +if ($ARGV[14] == 1) { + @blasobjs = (@blasobjs, @blasobjsd); + @cblasobjs = (@cblasobjs, @cblasobjsd); + @lapackobjs = (@lapackobjs, @lapackobjsd); + @lapack2objs = (@lapack2objs, @lapack2objsd); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); + @lapackeobjs = (@lapackeobjs, @lapackeobjsd); +} +if ($ARGV[15] == 1) { + @blasobjs = (@blasobjs, @blasobjsc); + @cblasobjs = (@cblasobjs, @cblasobjsc); + @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); + @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc); + @lapackobjs = (@lapackobjs, @lapackobjsc); + @lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); + @lapackeobjs = (@lapackeobjs, @lapackeobjsc); +} +if ($ARGV[16] == 1) { + @blasobjs = (@blasobjs, @blasobjsz); + @cblasobjs = (@cblasobjs, @cblasobjsz); + @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); + @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz); + @lapackobjs = (@lapackobjs, @lapackobjsz); + @lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z); + @lapackeobjs = (@lapackeobjs, @lapackeobjsz); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); From b8f95354c7edb67bfdeb317ef3e735a0b0e3c8ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:38:25 +0200 Subject: [PATCH 506/593] Adapt to having only a subset of variable types supported --- lapack/trtrs/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile index 
a3b8f4322..8ba63c21a 100644 --- a/lapack/trtrs/Makefile +++ b/lapack/trtrs/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += ztrtrs_UNU_parallel.$(SUFFIX) ztrtrs_UNN_parallel.$(SUFFIX) ztrtrs_ XBLASOBJS += xtrtrs_UNU_parallel.$(SUFFIX) xtrtrs_UNN_parallel.$(SUFFIX) xtrtrs_UTU_parallel.$(SUFFIX) xtrtrs_UTN_parallel.$(SUFFIX) xtrtrs_URU_parallel.$(SUFFIX) xtrtrs_URN_parallel.$(SUFFIX) xtrtrs_UCU_parallel.$(SUFFIX) xtrtrs_UCN_parallel.$(SUFFIX) xtrtrs_LNU_parallel.$(SUFFIX) xtrtrs_LNN_parallel.$(SUFFIX) xtrtrs_LTU_parallel.$(SUFFIX) xtrtrs_LTN_parallel.$(SUFFIX) xtrtrs_LRU_parallel.$(SUFFIX) xtrtrs_LRN_parallel.$(SUFFIX) xtrtrs_LCU_parallel.$(SUFFIX) xtrtrs_LCN_parallel.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX),1) +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + strtrs_UNU_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) From dcd51d5c72e5f05e43e327e3c4d9d954d5f80b8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:39:19 +0200 Subject: [PATCH 507/593] Adapt to having only a subset of variable types supported --- lapack/trtri/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/trtri/Makefile b/lapack/trtri/Makefile index 626c47bbf..72167ff56 100644 --- a/lapack/trtri/Makefile +++ b/lapack/trtri/Makefile @@ -23,6 +23,19 @@ ZBLASOBJS += ztrtri_UU_parallel.$(SUFFIX) ztrtri_UN_parallel.$(SUFFIX) ztrtri_LU XBLASOBJS += xtrtri_UU_parallel.$(SUFFIX) xtrtri_UN_parallel.$(SUFFIX) xtrtri_LU_parallel.$(SUFFIX) xtrtri_LN_parallel.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX),1) +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + strtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) From 
cf53970bcb34c17bd1f83c3b521e372f6e57f043 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:40:06 +0200 Subject: [PATCH 508/593] Adapt to having only a subset of variable types supported --- lapack/trti2/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/trti2/Makefile b/lapack/trti2/Makefile index 45251fb1e..005e80d73 100644 --- a/lapack/trti2/Makefile +++ b/lapack/trti2/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. include ../../Makefile.system +ifeq ($(BUILD_SINGLE),1) SBLASOBJS = strti2_UU.$(SUFFIX) strti2_UN.$(SUFFIX) strti2_LU.$(SUFFIX) strti2_LN.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dtrti2_UU.$(SUFFIX) dtrti2_UN.$(SUFFIX) dtrti2_LU.$(SUFFIX) dtrti2_LN.$(SUFFIX) +endif QBLASOBJS = qtrti2_UU.$(SUFFIX) qtrti2_UN.$(SUFFIX) qtrti2_LU.$(SUFFIX) qtrti2_LN.$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS = ctrti2_UU.$(SUFFIX) ctrti2_UN.$(SUFFIX) ctrti2_LU.$(SUFFIX) ctrti2_LN.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = ztrti2_UU.$(SUFFIX) ztrti2_UN.$(SUFFIX) ztrti2_LU.$(SUFFIX) ztrti2_LN.$(SUFFIX) +endif XBLASOBJS = xtrti2_UU.$(SUFFIX) xtrti2_UN.$(SUFFIX) xtrti2_LU.$(SUFFIX) xtrti2_LN.$(SUFFIX) strti2_UU.$(SUFFIX) : trti2_U.c From 9df12eb08fde4d2f5ee49da1a48b0bd15a1bdbd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:40:51 +0200 Subject: [PATCH 509/593] Adapt to having only a subset of variable types supported --- lapack/potrf/Makefile | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lapack/potrf/Makefile b/lapack/potrf/Makefile index 21efa5540..feefd0483 100644 --- a/lapack/potrf/Makefile +++ b/lapack/potrf/Makefile @@ -17,6 +17,20 @@ ZBLASOBJS += zpotrf_U_parallel.$(SUFFIX) zpotrf_L_parallel.$(SUFFIX) XBLASOBJS += xpotrf_U_parallel.$(SUFFIX) xpotrf_L_parallel.$(SUFFIX) endif +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS= 
+endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + + spotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) From e5966f860671381963acc2f7cfa95a3b1e24510e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:41:43 +0200 Subject: [PATCH 510/593] Adapt to having only a subset of variable types supported --- lapack/potf2/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/potf2/Makefile b/lapack/potf2/Makefile index 5946ad9c8..f48570064 100644 --- a/lapack/potf2/Makefile +++ b/lapack/potf2/Makefile @@ -8,6 +8,19 @@ CBLASOBJS = cpotf2_U.$(SUFFIX) cpotf2_L.$(SUFFIX) ZBLASOBJS = zpotf2_U.$(SUFFIX) zpotf2_L.$(SUFFIX) XBLASOBJS = xpotf2_U.$(SUFFIX) xpotf2_L.$(SUFFIX) +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" + SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" + CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif + spotf2_U.$(SUFFIX) : potf2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) From bc319cee826cf2cc7d750bc83895aef9504d18db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:42:26 +0200 Subject: [PATCH 511/593] Adapt to having only a subset of variable types supported --- lapack/lauum/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/lauum/Makefile b/lapack/lauum/Makefile index f163479ef..c57f17937 100644 --- a/lapack/lauum/Makefile +++ b/lapack/lauum/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += zlauum_U_parallel.$(SUFFIX) zlauum_L_parallel.$(SUFFIX) XBLASOBJS += xlauum_U_parallel.$(SUFFIX) xlauum_L_parallel.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX),1) +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + slauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) From 
b2620580594d2d6d0b06ea814dca34b37f79f84d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:43:13 +0200 Subject: [PATCH 512/593] Adapt to having only a subset of variable types supported --- lapack/lauu2/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/lauu2/Makefile b/lapack/lauu2/Makefile index dc6a640b4..60d2db4db 100644 --- a/lapack/lauu2/Makefile +++ b/lapack/lauu2/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. include ../../Makefile.system +ifeq ($(BUILD_SINGLE),1) SBLASOBJS = slauu2_U.$(SUFFIX) slauu2_L.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dlauu2_U.$(SUFFIX) dlauu2_L.$(SUFFIX) +endif QBLASOBJS = qlauu2_U.$(SUFFIX) qlauu2_L.$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS = clauu2_U.$(SUFFIX) clauu2_L.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = zlauu2_U.$(SUFFIX) zlauu2_L.$(SUFFIX) +endif XBLASOBJS = xlauu2_U.$(SUFFIX) xlauu2_L.$(SUFFIX) slauu2_U.$(SUFFIX) : lauu2_U.c From 5c657fffad5bc4246964b6c4204685b5cd036d32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:44:13 +0200 Subject: [PATCH 513/593] Adapt to having only a subset of variable types supported --- lapack/laswp/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/laswp/Makefile b/lapack/laswp/Makefile index 389800692..2028d994e 100644 --- a/lapack/laswp/Makefile +++ b/lapack/laswp/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. 
include ../../Makefile.system +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" SBLASOBJS = slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) +endif QBLASOBJS = qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS = claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) +endif XBLASOBJS = xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) \ From 20cf1d773f34a54946994b2b219545049f9b9fb0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:44:56 +0200 Subject: [PATCH 514/593] Adapt to having only a subset of variable types supported --- lapack/getrs/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/getrs/Makefile b/lapack/getrs/Makefile index 2640ef097..f32569367 100644 --- a/lapack/getrs/Makefile +++ b/lapack/getrs/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_pa XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) endif +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + sgetrs_N_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) From 93454022a9c3580fedbe06204234542448a62081 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:45:40 +0200 Subject: [PATCH 515/593] Adapt to having only a subset of variable types supported --- lapack/getrf/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/getrf/Makefile 
b/lapack/getrf/Makefile index a559dfb0d..976ca3c0b 100644 --- a/lapack/getrf/Makefile +++ b/lapack/getrf/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += zgetrf_parallel.$(SUFFIX) XBLASOBJS += xgetrf_parallel.$(SUFFIX) endif +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + ifeq ($(USE_OPENMP), 1) GETRF_SRC = getrf_parallel_omp.c else From b27ca78a2105d676b446bf49231faf76455f8dfc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:46:24 +0200 Subject: [PATCH 516/593] Adapt to having only a subset of variable types supported --- lapack/getf2/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/getf2/Makefile b/lapack/getf2/Makefile index 612c6f9cc..a524a3235 100644 --- a/lapack/getf2/Makefile +++ b/lapack/getf2/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. include ../../Makefile.system +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" SBLASOBJS = sgetf2_k.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dgetf2_k.$(SUFFIX) +endif QBLASOBJS = qgetf2_k.$(SUFFIX) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS = cgetf2_k.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = zgetf2_k.$(SUFFIX) +endif XBLASOBJS = xgetf2_k.$(SUFFIX) sgetf2_k.$(SUFFIX) : getf2_k.c From efe1ad4700bb55a06d9fc8e8291934a51c55c501 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:48:23 +0200 Subject: [PATCH 517/593] Add Makefile support for enabling only some variable types --- lapack-netlib/TESTING/MATGEN/Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index 87432fd04..e21ebd6c3 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -33,25 +33,37 @@ TOPSRCDIR = ../.. 
include $(TOPSRCDIR)/make.inc +ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" SCATGEN = slatm1.o slatm7.o slaran.o slarnd.o +endif +ifeq ($(BUILD_SINGLE),1) SMATGEN = slatms.o slatme.o slatmr.o slatmt.o \ slagge.o slagsy.o slakf2.o slarge.o slaror.o slarot.o slatm2.o \ slatm3.o slatm5.o slatm6.o slahilb.o +endif +ifeq ($(BUILD_COMPLEX),1) CMATGEN = clatms.o clatme.o clatmr.o clatmt.o \ clagge.o claghe.o clagsy.o clakf2.o clarge.o claror.o clarot.o \ clatm1.o clarnd.o clatm2.o clatm3.o clatm5.o clatm6.o clahilb.o +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DZATGEN = dlatm1.o dlatm7.o dlaran.o dlarnd.o +endif +ifeq ($(BUILD_DOUBLE),1) DMATGEN = dlatms.o dlatme.o dlatmr.o dlatmt.o \ dlagge.o dlagsy.o dlakf2.o dlarge.o dlaror.o dlarot.o dlatm2.o \ dlatm3.o dlatm5.o dlatm6.o dlahilb.o +endif +ifeq ($(BUILD_COMPLEX16),1) ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ zlagge.o zlaghe.o zlagsy.o zlakf2.o zlarge.o zlaror.o zlarot.o \ zlatm1.o zlarnd.o zlatm2.o zlatm3.o zlatm5.o zlatm6.o zlahilb.o +endif .PHONY: all all: $(TMGLIB) @@ -97,5 +109,9 @@ cleanobj: cleanlib: rm -f $(TMGLIB) +ifeq ($(filter $(BUILD_SINGLE) $(BUILD_COMPLEX),1),) slaran.o: slaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +endif +ifeq ($(filter $(BUILD_DOUBLE) $(BUILD_COMPLEX16),1),) dlaran.o: dlaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +endif From ef552bc578274d257985e5ce76b3999920540daa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:49:06 +0200 Subject: [PATCH 518/593] Add Makefile support for enabling only some variable types --- lapack-netlib/SRC/Makefile | 62 ++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 9f79e20e9..83baac875 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -66,7 +66,9 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ ilaprec.o ilatrans.o ilauplo.o iladiag.o 
chla_transtype.o \ ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o +ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" SCLAUX = \ + sbdsvdx.o sstevx.o sstein.o \ sbdsdc.o \ sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ @@ -81,10 +83,14 @@ SCLAUX = \ slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ ssteqr.o ssterf.o slaisnan.o sisnan.o \ - slartgp.o slartgs.o \ + slartgp.o slartgs.o scombssq.o \ ../INSTALL/second_$(TIMER).o +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DZLAUX = \ + dcombssq.o \ + dbdsvdx.o dstevx.o dstein.o \ dbdsdc.o \ dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ @@ -101,9 +107,12 @@ DZLAUX = \ dsteqr.o dsterf.o dlaisnan.o disnan.o \ dlartgp.o dlartgs.o \ ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o +endif +#ifeq ($(BUILD_SINGLE),1) +ifdef BUILD_SINGLE SLASRC_O = \ - sbdsvdx.o spotrf2.o sgetrf2.o \ + spotrf2.o sgetrf2.o \ sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ @@ -145,8 +154,7 @@ SLASRC_O = \ ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ - ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ - sstevx.o \ + ssptrf.o ssptri.o ssptrs.o sstegr.o sstev.o sstevd.o sstevr.o \ ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytri2.o ssytri2x.o \ @@ -180,9 +188,13 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o 
ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o scombssq.o + sgesvdq.o + +endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" DSLASRC_O = spotrs.o sgetrs.o spotrf.o sgetrf.o +endif ifdef USEXBLAS SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ @@ -194,6 +206,7 @@ SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ slascl2.o sla_wwaddw.o endif +ifeq ($(BUILD_COMPLEX),1) CLASRC_O = \ cpotrf2.o cgetrf2.o \ cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ @@ -284,6 +297,7 @@ CLASRC_O = \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ cgesvdq.o +endif ifdef USEXBLAS CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ @@ -299,11 +313,13 @@ CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o endif -ZCLASRC_O = cpotrs.o cgetrs.o cpotrf.o cgetrf.o +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +ZCLASRC_O = cpotrs.o cgetrs.o cpotrf.o cgetrf.o clag2z.o +endif +ifeq ($(BUILD_DOUBLE),1) DLASRC_O = \ dpotrf2.o dgetrf2.o \ - dbdsvdx.o \ dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ @@ -345,8 +361,7 @@ DLASRC_O = \ dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ - dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ - dstevx.o \ + dsptrf.o dsptri.o dsptrs.o dstegr.o dstev.o dstevd.o dstevr.o \ dsycon.o dsyev.o dsyevd.o dsyevr.o \ dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ dsysv.o dsysvx.o \ @@ -381,7 +396,8 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o 
dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o dcombssq.o + dgesvdq.o +endif ifdef USEXBLAS DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ @@ -393,6 +409,7 @@ DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ dlascl2.o dla_wwaddw.o endif +ifeq ($(BUILD_COMPLEX16),1) ZLASRC_O = \ zpotrf2.o zgetrf2.o \ zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ @@ -471,7 +488,7 @@ ZLASRC_O = \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ - zcgesv.o zcposv.o zlag2c.o clag2z.o zlat2c.o \ + zcgesv.o zcposv.o zlag2c.o zlat2c.o \ zhfrk.o ztfttp.o zlanhf.o zpftrf.o zpftri.o zpftrs.o ztfsm.o ztftri.o \ ztfttr.o ztpttf.o ztpttr.o ztrttf.o ztrttp.o \ zgeequb.o zgbequb.o zsyequb.o zpoequb.o zheequb.o \ @@ -488,6 +505,7 @@ ZLASRC_O = \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ zgesvdq.o +endif ifdef USEXBLAS ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ @@ -501,18 +519,30 @@ ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ zla_lin_berr.o zlarscl2.o zlascl2.o zla_wwaddw.o endif -DEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \ +ifeq ($(BUILD_COMPLEX),1) +CDEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \ DEPRECATED/cgeqpf.o DEPRECATED/cggsvd.o DEPRECATED/cggsvp.o \ - DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o \ + DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o +endif + +ifeq ($(BUILD_DOUBLE),1) +DDEPRECSRC = \ DEPRECATED/dgegs.o DEPRECATED/dgegv.o DEPRECATED/dgelsx.o \ DEPRECATED/dgeqpf.o DEPRECATED/dggsvd.o DEPRECATED/dggsvp.o \ - DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o \ + DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o +endif +ifeq ($(BUILD_SINGLE),1) +SDEPRECSRC = \ 
DEPRECATED/sgegs.o DEPRECATED/sgegv.o DEPRECATED/sgelsx.o \ DEPRECATED/sgeqpf.o DEPRECATED/sggsvd.o DEPRECATED/sggsvp.o \ - DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o \ + DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o +endif +ifeq ($(BUILD_COMPLEX16),1) +ZDEPRECSRC = \ DEPRECATED/zgegs.o DEPRECATED/zgegv.o DEPRECATED/zgelsx.o \ DEPRECATED/zgeqpf.o DEPRECATED/zggsvd.o DEPRECATED/zggsvp.o \ DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o +endif # filter out optimized codes from OpenBLAS ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o @@ -560,7 +590,7 @@ ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) endif ifdef BUILD_DEPRECATED -DEPRECATED = $(DEPRECSRC) +DEPRECATED = $(SDEPRECSRC) $(DDEPRECSRC) $(CDEPRECSRC) $(ZDEPRECSRC) endif .PHONY: all From a6570108c570848f0f036b296ad0c35e826a7bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:49:58 +0200 Subject: [PATCH 519/593] Add Makefile support for enabling only some variable types --- lapack-netlib/LAPACKE/src/Makefile | 99 ++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 31 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 8060151ae..a602dd7a0 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -46,6 +46,7 @@ OBJ = \ lapacke_ilaver.o \ lapacke_nancheck.o +ifeq ($(BUILD_COMPLEX),1) OBJ_C = \ lapacke_cbbcsd.o \ lapacke_cbbcsd_work.o \ @@ -653,7 +654,9 @@ lapacke_cupgtr.o \ lapacke_cupgtr_work.o \ lapacke_cupmtr.o \ lapacke_cupmtr_work.o +endif +ifeq ($(BUILD_DOUBLE),1) OBJ_D = \ lapacke_dbbcsd.o \ lapacke_dbbcsd_work.o \ @@ -1218,8 +1221,12 @@ lapacke_dtrttf_work.o \ lapacke_dtrttp.o \ lapacke_dtrttp_work.o \ lapacke_dtzrzf.o \ -lapacke_dtzrzf_work.o +lapacke_dtzrzf_work.o \ +lapacke_slag2d.o \ +lapacke_slag2d_work.o +endif +ifeq ($(BUILD_SINGLE),1) OBJ_S = \ lapacke_sbbcsd.o \ lapacke_sbbcsd_work.o \ @@ -1395,8 +1402,6 @@ lapacke_slacn2.o \ 
lapacke_slacn2_work.o \ lapacke_slacpy.o \ lapacke_slacpy_work.o \ -lapacke_slag2d.o \ -lapacke_slag2d_work.o \ lapacke_slamch.o \ lapacke_slamch_work.o \ lapacke_slange.o \ @@ -1781,7 +1786,9 @@ lapacke_strttp.o \ lapacke_strttp_work.o \ lapacke_stzrzf.o \ lapacke_stzrzf_work.o +endif +ifeq ($(BUILD_COMPLEX16),1) OBJ_Z = \ lapacke_zbbcsd.o \ lapacke_zbbcsd_work.o \ @@ -2393,35 +2400,52 @@ lapacke_zupgtr.o \ lapacke_zupgtr_work.o \ lapacke_zupmtr.o \ lapacke_zupmtr_work.o +endif ifdef BUILD_DEPRECATED -DEPRECATED = \ +ifeq ($(BUILD_COMPLEX),1) +DEPRECATEDC = \ lapacke_cggsvp.o \ lapacke_cggsvp_work.o \ -lapacke_dggsvp.o \ -lapacke_dggsvp_work.o \ -lapacke_sggsvp.o \ -lapacke_sggsvp_work.o \ -lapacke_zggsvp.o \ -lapacke_zggsvp_work.o \ lapacke_cggsvd.o \ lapacke_cggsvd_work.o \ +lapacke_cgeqpf.o \ +lapacke_cgeqpf_work.o +endif + +ifeq ($(BUILD_DOUBLE),1) +DEPRECATEDD = \ +lapacke_dggsvp.o \ +lapacke_dggsvp_work.o \ lapacke_dggsvd.o \ lapacke_dggsvd_work.o \ +lapacke_dgeqpf.o \ +lapacke_dgeqpf_work.o +endif + +ifeq ($(BUILD_SINGLE),1) +DEPRECATEDS = \ +lapacke_sggsvp.o \ +lapacke_sggsvp_work.o \ lapacke_sggsvd.o \ lapacke_sggsvd_work.o \ +lapacke_sgeqpf.o \ +lapacke_sgeqpf_work.o +endif + +ifeq ($(BUILD_COMPLEX16),1) +DEPRECATEDZ = \ +lapacke_zggsvp.o \ +lapacke_zggsvp_work.o \ lapacke_zggsvd.o \ lapacke_zggsvd_work.o \ -lapacke_cgeqpf.o \ -lapacke_cgeqpf_work.o \ -lapacke_dgeqpf.o \ -lapacke_dgeqpf_work.o \ -lapacke_sgeqpf.o \ -lapacke_sgeqpf_work.o \ lapacke_zgeqpf.o \ lapacke_zgeqpf_work.o endif +DEPRECATED = $(DEPRECATEDS) $(DEPRECATEDD) $(DEPRECATEDC) $(DEPRECATEDZ) +endif + ifdef USEXBLAS EXTENDED = \ lapacke_cgbrfsx.o lapacke_cporfsx.o lapacke_dgerfsx.o lapacke_sgbrfsx.o lapacke_ssyrfsx.o lapacke_zherfsx.o \ @@ -2440,37 +2464,50 @@ endif ifdef LAPACKE_WITH_TMG # FILE PARTS OF TMGLIB -MATGEN = \ +ifeq ($(BUILD_COMPLEX),1) +MATGENC = \ lapacke_clatms.o \ lapacke_clatms_work.o \ -lapacke_dlatms.o \ -lapacke_dlatms_work.o \ -lapacke_slatms.o \ 
-lapacke_slatms_work.o \ -lapacke_zlatms.o \ -lapacke_zlatms_work.o \ lapacke_clagge.o \ lapacke_clagge_work.o \ +lapacke_claghe.o \ +lapacke_claghe_work.o \ +lapacke_clagsy.o \ +lapacke_clagsy_work.o +endif +ifeq ($(BUILD_DOUBLE),1) +MATGEND = \ +lapacke_dlatms.o \ +lapacke_dlatms_work.o \ lapacke_dlagge.o \ lapacke_dlagge_work.o \ +lapacke_dlagsy.o \ +lapacke_dlagsy_work.o +endif +ifeq ($(BUILD_SINGLE),1) +MATGENS = \ +lapacke_slatms.o \ +lapacke_slatms_work.o \ lapacke_slagge.o \ lapacke_slagge_work.o \ +lapacke_slagsy.o \ +lapacke_slagsy_work.o +endif +ifeq ($(BUILD_COMPLEX16),1) +MATGENZ = \ +lapacke_zlatms.o \ +lapacke_zlatms_work.o \ lapacke_zlagge.o \ lapacke_zlagge_work.o \ -lapacke_claghe.o \ -lapacke_claghe_work.o \ lapacke_zlaghe.o \ lapacke_zlaghe_work.o \ -lapacke_clagsy.o \ -lapacke_clagsy_work.o \ -lapacke_dlagsy.o \ -lapacke_dlagsy_work.o \ -lapacke_slagsy.o \ -lapacke_slagsy_work.o \ lapacke_zlagsy.o \ lapacke_zlagsy_work.o endif +MATGEN = $(MATGENS) $(MATGEND) $(MATGENC) $(MATGENZ) +endif + .PHONY: all all: $(LAPACKELIB) From 6b6adf8a4a563f4afbcab2a9d39b5eaa55da13b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:52:09 +0200 Subject: [PATCH 520/593] Allow compiling only a subset of kernels for specific variable types --- kernel/CMakeLists.txt | 215 +++++++++++++++++++++++-- kernel/Makefile.L2 | 70 +++++++- kernel/Makefile.L3 | 84 ++++++++-- kernel/setparam-ref.c | 367 ++++++++++++++++++++++-------------------- 4 files changed, 527 insertions(+), 209 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index c81f2bf25..988b83338 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,7 +91,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") - if (DEFINED BUILD_COMPLEX AND NOT 
DEFINED BUILD_SINGLE) + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SASUMKERNEL}" "" "asum_k" false "" "" false "SINGLE") @@ -110,14 +110,14 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${ISAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${ISAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE") - GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "" "copy_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SSWAPKERNEL}" "" "swap_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAXPYKERNEL}" "" "axpy_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") endif () - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE") @@ -177,11 +177,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" 
"" false ${float_type}) endif () endforeach () - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE") endif () - if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + if (BUILD_COMPLEX AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () @@ -219,7 +219,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") if (DGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE") @@ -235,19 +235,19 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") endif () - if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) - GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") + if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "DOUBLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () if (SGEMMITCOPY) - 
GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "DOUBLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") endif () if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "DOUBLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") endif () if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "DOUBLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () @@ -591,7 +591,31 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () - + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" 
false "" "" false "SINGLE") + + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false "SINGLE") + + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false "SINGLE") + + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false "SINGLE") + endif () # Makefile.LA if(NOT NO_LAPACK) @@ -618,6 +642,28 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}_${${float_char}GEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}_${${float_char}GEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false ${float_type}) endforeach() + if (BUILD_COMPLEX AND NOT BUILD_SINGLE) + if (NOT DEFINED SNEG_TCOPY) + set(SNEG_TCOPY 
../generic/neg_tcopy.c) + endif () + + if (NOT DEFINED SLASWP_NCOPY) + set(SLASWP_NCOPY ../generic/laswp_ncopy.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}_${SGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}_${SGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "SINGLE") + endif() + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + if (NOT DEFINED DNEG_TCOPY) + set(DNEG_TCOPY ../generic/neg_tcopy.c) + endif () + + if (NOT DEFINED DLASWP_NCOPY) + set(DLASWP_NCOPY ../generic/laswp_ncopy.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "DOUBLE") + endif() endif() if (${DYNAMIC_ARCH}) @@ -649,8 +695,147 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) endforeach () - + if (BUILD_COMPLEX AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") + GenerateNamedObjects("generic/neg_tcopy_${SGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/laswp_ncopy_${SGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "SINGLE") + endif () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) + GenerateNamedObjects("generic/neg_tcopy_${SGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/laswp_ncopy_${SGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" ${TSUFFIX} false "SINGLE") + 
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" ${TSUFFIX} false "SINGLE") + + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" ${TSUFFIX} false "SINGLE") + + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" ${TSUFFIX} false "SINGLE") + + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" 
false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") + + if (SGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") + endif () + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") + GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") + endif () + if (BUILD_COMPLEX16 AND NOT BUILD_COMPLEX) + GenerateNamedObjects("${KERNELDIR}/${CAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "COMPLEX") + if (DEFINED CMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${CMAXKERNEL}" "" "max_k" false "" "" false "COMPLEX") + endif () + if (DEFINED CMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${CMINKERNEL}" "USE_MIN" "min_k" false "" "" false "COMPLEX") + endif () 
+ GenerateNamedObjects("${KERNELDIR}/${ICAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${ICAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "COMPLEX") + if (DEFINED ICMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ICMAXKERNEL}" "" "i*max_k" false "" "" false "COMPLEX") + endif () + if (DEFINED ICMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ICMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "COMPLEX") + endif () + GenerateNamedObjects("${KERNELDIR}/${CASUMKERNEL}" "" "asum_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAXPYKERNEL}" "" "axpy_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CNRM2KERNEL}" "" "nrm2_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CROTKERNEL}" "" "rot_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CSCALKERNEL}" "" "scal_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CSWAPKERNEL}" "" "swap_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAXPBYKERNEL}" "" "axpby_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CSUMKERNEL}" "" "sum_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CDOTKERNEL}" "" "dotu_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CDOTKERNEL}" "CONJ" "dotc_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "" "gemv_n" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "CONJ;TRANSA" 
"gemv_c" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LT}" "LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RT}" "RT;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "NN" "gemm_kernel_n" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "CN" "gemm_kernel_l" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false "COMPLEX") + if (CGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMINCOPY}" "COMPLEX" "${CGEMMINCOPYOBJ}" false "" "" true "COMPLEX") + endif 
() + + if (CGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMITCOPY}" "COMPLEX" "${CGEMMITCOPYOBJ}" false "" "" true "COMPLEX") + endif () + + if (CGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMONCOPY}" "COMPLEX" "${CGEMMONCOPYOBJ}" false "" "" true "COMPLEX") + endif () + + if (CGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMOTCOPY}" "COMPLEX" "${CGEMMOTCOPYOBJ}" false "" "" true "COMPLEX") + endif () + GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" ${TSUFFIX} false "COMPLEX") + + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" ${TSUFFIX} false "COMPLEX") + + GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" ${TSUFFIX} false "COMPLEX") + 
GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" ${TSUFFIX} false "COMPLEX") + + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "COMPLEX") endif () + endif () add_library(kernel${TSUFFIX} OBJECT ${OPENBLAS_SRC}) set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") @@ -665,7 +850,7 @@ if (${DYNAMIC_ARCH}) set(BUILD_KERNEL 1) set(KDIR "") set(TSUFFIX "_${TARGET_CORE}") - set(KERNEL_DEFINITIONS "-DBUILD_KERNEL -DTABLE_NAME=gotoblas_${TARGET_CORE} -DTS=${TSUFFIX}") + set(KERNEL_DEFINITIONS "-DBUILD_KERNEL -DTABLE_NAME=gotoblas_${TARGET_CORE} -DTS=${TSUFFIX}") build_core("${TARGET_CORE}" "${KDIR}" "${TSUFFIX}" "${KERNEL_DEFINITIONS}") set(ADD_COMMONOBJS 0) endforeach() diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 2aeb8f041..79399c342 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -186,31 +186,46 @@ ifndef XHEMV_M_KERNEL XHEMV_M_KERNEL = ../generic/zhemv_k.c endif +ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" SBLASOBJS += \ - sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ + sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_SINGLE),1) +SBLASOBJS += \ + 
ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ sger_k$(TSUFFIX).$(SUFFIX) - +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ dger_k$(TSUFFIX).$(SUFFIX) - +endif QBLASOBJS += \ qgemv_n$(TSUFFIX).$(SUFFIX) qgemv_t$(TSUFFIX).$(SUFFIX) qsymv_U$(TSUFFIX).$(SUFFIX) qsymv_L$(TSUFFIX).$(SUFFIX) \ qger_k$(TSUFFIX).$(SUFFIX) - +ifeq ($(BUILD_COMPLEX),1) +SBLASOBJS += \ + sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) \ csymv_U$(TSUFFIX).$(SUFFIX) csymv_L$(TSUFFIX).$(SUFFIX) \ chemv_U$(TSUFFIX).$(SUFFIX) chemv_L$(TSUFFIX).$(SUFFIX) chemv_V$(TSUFFIX).$(SUFFIX) chemv_M$(TSUFFIX).$(SUFFIX) \ cgeru_k$(TSUFFIX).$(SUFFIX) cgerc_k$(TSUFFIX).$(SUFFIX) cgerv_k$(TSUFFIX).$(SUFFIX) cgerd_k$(TSUFFIX).$(SUFFIX) - +endif +ifeq ($(BUILD_COMPLEX16),1) +CBLASOBJS += \ + cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ + cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) +DBLASOBJS += \ + dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zgemv_n$(TSUFFIX).$(SUFFIX) zgemv_t$(TSUFFIX).$(SUFFIX) zgemv_r$(TSUFFIX).$(SUFFIX) zgemv_c$(TSUFFIX).$(SUFFIX) \ zgemv_o$(TSUFFIX).$(SUFFIX) zgemv_u$(TSUFFIX).$(SUFFIX) zgemv_s$(TSUFFIX).$(SUFFIX) zgemv_d$(TSUFFIX).$(SUFFIX) \ zsymv_U$(TSUFFIX).$(SUFFIX) zsymv_L$(TSUFFIX).$(SUFFIX) \ zhemv_U$(TSUFFIX).$(SUFFIX) zhemv_L$(TSUFFIX).$(SUFFIX) zhemv_V$(TSUFFIX).$(SUFFIX) zhemv_M$(TSUFFIX).$(SUFFIX) \ zgeru_k$(TSUFFIX).$(SUFFIX) zgerc_k$(TSUFFIX).$(SUFFIX) zgerv_k$(TSUFFIX).$(SUFFIX) zgerd_k$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ 
xgemv_n$(TSUFFIX).$(SUFFIX) xgemv_t$(TSUFFIX).$(SUFFIX) xgemv_r$(TSUFFIX).$(SUFFIX) xgemv_c$(TSUFFIX).$(SUFFIX) \ @@ -219,17 +234,21 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@ +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ +endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -237,6 +256,8 @@ $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" $(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ @@ -260,6 +281,10 @@ $(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) 
$(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE $(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ +endif + + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ @@ -284,6 +309,7 @@ $(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE $(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ +endif $(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ @@ -309,17 +335,25 @@ $(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE $(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +ifeq ($(BUILD_SINGLE),1) + $(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ $(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@ +endif + + +ifeq ($(BUILD_DOUBLE),1) $(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@ $(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX 
-DDOUBLE -DLOWER $< -o $@ +endif $(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@ @@ -327,17 +361,23 @@ $(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@ +ifeq ($(BUILD_COMPLEX),1) + $(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@ $(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@ +endif + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@ $(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@ +endif $(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@ @@ -345,15 +385,23 @@ $(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@ +ifeq ($(BUILD_SINGLE),1) + $(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE $< -o $@ +endif + +ifeq ($(BUILD_DOUBLE),1) $(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) $(CC) -c $(CFLAGS) 
-DDOUBLE $< -o $@ +endif $(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@ +ifeq ($(BUILD_COMPLEX),1) + $(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@ @@ -365,6 +413,9 @@ $(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@ +endif + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@ @@ -377,6 +428,7 @@ $(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@ +endif $(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@ @@ -390,6 +442,8 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@ +ifeq ($(BUILD_COMPLEX),1) + $(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ @@ -401,6 +455,9 @@ $(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX 
-UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +endif + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@ @@ -413,7 +470,7 @@ $(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ - +endif $(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@ @@ -426,3 +483,4 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 24e17d9b4..e03ed0fad 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -100,8 +100,10 @@ SHKERNELOBJS += \ $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ + sgemm_beta$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) @@ -110,28 +112,36 @@ SKERNELOBJS += \ sgemm_direct$(TSUFFIX).$(SUFFIX) \ sgemm_direct_performant$(TSUFFIX).$(SUFFIX) endif +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DKERNELOBJS += \ + dgemm_beta$(TSUFFIX).$(SUFFIX) \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) +endif QKERNELOBJS += \ qgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) +ifneq "$(or 
$(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CKERNELOBJS += \ cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) +endif +ifeq ($(BUILD_COMPLEX16),1) ZKERNELOBJS += \ zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) +endif XKERNELOBJS += \ xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ @@ -153,38 +163,48 @@ ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) +endif QBLASOBJS += \ qgemm_beta$(TSUFFIX).$(SUFFIX) \ qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) 
qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ - cgemm_beta$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ + cgemm_beta$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ zgemm_beta$(TSUFFIX).$(SUFFIX) \ ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -194,7 +214,8 @@ ZBLASOBJS += \ ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ xgemm_beta$(TSUFFIX).$(SUFFIX) \ @@ -205,7 +226,7 @@ XBLASOBJS += \ xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + 
xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) ifeq ($(USE_GEMM3M), 1) @@ -215,6 +236,7 @@ XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) endif +ifeq ($(BUILD_SINGLE),1) SBLASOBJS += \ strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -223,7 +245,10 @@ SBLASOBJS += \ strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ - strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS += \ strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ @@ -231,10 +256,15 @@ SBLASOBJS += \ strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ - strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_SINGLE),1) +SBLASOBJS += \ ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -254,6 +284,7 @@ DBLASOBJS += \ dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ 
dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) +endif QBLASOBJS += \ qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ @@ -273,8 +304,9 @@ QBLASOBJS += \ qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -284,6 +316,13 @@ CBLASOBJS += \ ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ @@ -291,12 +330,10 @@ CBLASOBJS += \ ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - 
csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) + ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -318,6 +355,7 @@ ZBLASOBJS += \ zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ @@ -343,6 +381,7 @@ XBLASOBJS += \ ifeq ($(USE_GEMM3M), 1) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ @@ -362,7 +401,9 @@ CBLASOBJS += \ chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ @@ -382,6 +423,7 @@ ZBLASOBJS += \ zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ @@ -406,20 +448,25 @@ XBLASOBJS += \ endif ###### BLAS extensions ##### + +ifeq ($(BUILD_SINGLE),1) SBLASOBJS += \ 
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) - +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ @@ -430,7 +477,9 @@ CBLASOBJS += \ cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ @@ -441,6 +490,7 @@ ZBLASOBJS += \ zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) +endif ifeq ($(BUILD_HALF), 1) SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 550af86a6..dd49d8e4e 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -114,7 +114,7 @@ gotoblas_t TABLE_NAME = { #endif #endif -#if defined( BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, 
#ifdef SGEMM_DEFAULT_UNROLL_MN @@ -130,34 +130,38 @@ gotoblas_t TABLE_NAME = { 0, #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1 ) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, snrm2_kTS, sasum_kTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 ssum_kTS, #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) scopy_kTS, sdot_kTS, // dsdot_kTS, srot_kTS, saxpy_kTS, - sscal_kTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) + sscal_kTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) sswap_kTS, sgemv_nTS, sgemv_tTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 sger_kTS, ssymv_LTS, ssymv_UTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) #ifdef ARCH_X86_64 sgemm_directTS, sgemm_direct_performantTS, #endif -#endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, @@ -167,7 +171,7 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, @@ -178,6 +182,8 @@ gotoblas_t TABLE_NAME = { #endif strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, +#endif +#if BUILD_SINGLE == 1 strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, @@ -194,16 +200,16 @@ 
gotoblas_t TABLE_NAME = { ssymm_outcopyTS, ssymm_oltcopyTS, #endif ssymm_outcopyTS, ssymm_oltcopyTS, - +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) #ifndef NO_LAPACK sneg_tcopyTS, slaswp_ncopyTS, #else NULL,NULL, #endif - #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef DGEMM_DEFAULT_UNROLL_MN @@ -214,33 +220,33 @@ gotoblas_t TABLE_NAME = { #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, dnrm2_kTS, dasum_kTS, #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) dsum_kTS, #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) dcopy_kTS, ddot_kTS, #endif -#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) dsdot_kTS, #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) dger_kTS, dsymv_LTS, dsymv_UTS, #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, @@ -250,7 +256,7 @@ gotoblas_t TABLE_NAME = { dgemm_oncopyTS, dgemm_otcopyTS, #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -340,7 +346,7 @@ gotoblas_t TABLE_NAME = { #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX || BUILD_COMPLEX16) 0, 0, 0, CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, #ifdef 
CGEMM_DEFAULT_UNROLL_MN @@ -348,21 +354,34 @@ gotoblas_t TABLE_NAME = { #else MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), #endif - camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, - cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, - cdotu_kTS, cdotc_kTS, csrot_kTS, - caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, +#endif +#if (BUILD_COMPLEX) + cnrm2_kTS, casum_kTS, csum_kTS, +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) + ccopy_kTS, cdotu_kTS, cdotc_kTS, +#endif +#if (BUILD_COMPLEX) + csrot_kTS, +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) + caxpy_kTS, + caxpyc_kTS, + cscal_kTS, + cswap_kTS, cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, +#endif +#if (BUILD_COMPLEX) cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, csymv_LTS, csymv_UTS, chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, - +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, cgemm_betaTS, - #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N cgemm_incopyTS, cgemm_itcopyTS, #else @@ -382,6 +401,8 @@ gotoblas_t TABLE_NAME = { #endif ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, +#endif +#if (BUILD_COMPLEX) ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS, @@ -411,7 +432,7 @@ gotoblas_t TABLE_NAME = { 0, 0, 0, -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), #else @@ -469,16 +490,20 @@ gotoblas_t TABLE_NAME = { NULL, NULL, NULL, NULL, #endif +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) #ifndef NO_LAPACK - cneg_tcopyTS, claswp_ncopyTS, + cneg_tcopyTS, + + claswp_ncopyTS, #else NULL, NULL, #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 0, 0, 0, 
ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, #ifdef ZGEMM_DEFAULT_UNROLL_MN @@ -548,7 +573,7 @@ gotoblas_t TABLE_NAME = { zhemm_outcopyTS, zhemm_oltcopyTS, 0, 0, 0, -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) #ifdef ZGEMM3M_DEFAULT_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), #else @@ -681,7 +706,7 @@ gotoblas_t TABLE_NAME = { xhemm_outcopyTS, xhemm_oltcopyTS, 0, 0, 0, -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), xgemm3m_kernelTS, @@ -746,110 +771,110 @@ gotoblas_t TABLE_NAME = { init_parameter, SNUMOPT, DNUMOPT, QNUMOPT, -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 saxpby_kTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 daxpby_kTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 caxpby_kTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16== 1 zaxpby_kTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX== 1 cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS, cimatcopy_k_cncTS, 
cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS, zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 sgeadd_kTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 dgeadd_kTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 cgeadd_kTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 zgeadd_kTS #endif }; -#if defined(ARCH_ARM64) +#if (ARCH_ARM64) static void init_parameter(void) { -#if defined(BUILD_HALF) +#if (BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif -#if defined(BUILD_HALF) +#if (BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX== 1 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #endif -#if defined(BUILD_HALF) +#if (BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; 
#endif @@ -862,7 +887,7 @@ static void init_parameter(void) { TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; #endif -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else @@ -907,8 +932,8 @@ static void init_parameter(void) { #endif } -#else // defined(ARCH_ARM64) -#if defined(ARCH_POWER) +#else // (ARCH_ARM64) +#if (ARCH_POWER) static void init_parameter(void) { #ifdef BUILD_HALF @@ -938,7 +963,7 @@ static void init_parameter(void) { } #else //POWER -#if defined(ARCH_ZARCH) +#if (ARCH_ZARCH) static void init_parameter(void) { #ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; @@ -1104,20 +1129,20 @@ static void init_parameter(void) { TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 #ifdef CGEMM3M_DEFAULT_Q TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; #else @@ -1125,7 +1150,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 #ifdef ZGEMM3M_DEFAULT_Q TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; #else @@ -1139,22 +1164,22 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q; #endif -#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) +#if (CORE_KATMAI) || (CORE_COPPERMINE) || (CORE_BANIAS) || (CORE_YONAH) || (CORE_ATHLON) #ifdef DEBUG fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || 
(BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 64 * (l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = 32 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 32 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 16 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1169,16 +1194,16 @@ static void init_parameter(void) { fprintf(stderr, "Northwood\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 96 * (l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = 48 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 48 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 24 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1193,16 +1218,16 @@ static void init_parameter(void) { fprintf(stderr, "Atom\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 256; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE ==1 TABLE_NAME.dgemm_p = 128; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 128; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 64; #endif #ifdef EXPRECISION @@ -1217,16 +1242,16 @@ static void init_parameter(void) { fprintf(stderr, "Prescott\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 56 * (l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE ==1 TABLE_NAME.dgemm_p = 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 TABLE_NAME.zgemm_p = 14 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1241,16 +1266,16 @@ static void init_parameter(void) { fprintf(stderr, "Core2\n"); #endif -#if defined 
(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4; #endif #ifdef EXPRECISION @@ -1265,16 +1290,16 @@ static void init_parameter(void) { fprintf(stderr, "Penryn\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; #endif #ifdef EXPRECISION @@ -1289,16 +1314,16 @@ static void init_parameter(void) { fprintf(stderr, "Dunnington\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE ==1 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; #endif #ifdef EXPRECISION @@ -1314,16 +1339,16 @@ static void init_parameter(void) { fprintf(stderr, "Nehalem\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 
ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1338,16 +1363,16 @@ static void init_parameter(void) { fprintf(stderr, "Sandybridge\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1362,16 +1387,16 @@ static void init_parameter(void) { fprintf(stderr, "Haswell\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1380,22 +1405,22 @@ static void init_parameter(void) { #endif #endif -#if defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1411,16 +1436,16 @@ static void init_parameter(void) { fprintf(stderr, "Opteron\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); #endif 
-#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1435,16 +1460,16 @@ static void init_parameter(void) { fprintf(stderr, "Barcelona\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1459,16 +1484,16 @@ static void init_parameter(void) { fprintf(stderr, "Bobcate\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1483,16 +1508,16 @@ static void init_parameter(void) { fprintf(stderr, "Bulldozer\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1507,16 +1532,16 @@ static void init_parameter(void) { fprintf(stderr, "Excavator\n"); #endif -#if defined (BUILD_SINGLE) || 
defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1532,16 +1557,16 @@ static void init_parameter(void) { fprintf(stderr, "Piledriver\n"); #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1556,16 +1581,16 @@ static void init_parameter(void) { fprintf(stderr, "Steamroller\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1580,16 +1605,16 @@ static void init_parameter(void) { fprintf(stderr, "Zen\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1605,16 +1630,16 @@ static void 
init_parameter(void) { fprintf(stderr, "NANO\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif @@ -1626,7 +1651,7 @@ static void init_parameter(void) { #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else @@ -1634,7 +1659,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 #ifdef ZGEMM3M_DEFAULT_P TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; #else @@ -1647,20 +1672,20 @@ static void init_parameter(void) { #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M; #else @@ -1668,7 +1693,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 #ifdef ZGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.zgemm3m_p = 
((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M; #else @@ -1686,14 +1711,14 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE==1 TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 TABLE_NAME.dgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) @@ -1707,28 +1732,28 @@ static void init_parameter(void) { ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX ==1 TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 ==1 TABLE_NAME.zgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) @@ -1755,4 +1780,4 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH -#endif //defined(ARCH_ARM64) +#endif //(ARCH_ARM64) From 0f7d73ff6d66e651e4d96b26056932746e885f1c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:53:26 +0200 
Subject: [PATCH 521/593] Allow supporting only a subset of variable types --- interface/CMakeLists.txt | 4 +-- interface/Makefile | 54 +++++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ad56c6dba..5346ecadd 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -171,7 +171,7 @@ if (NOT DEFINED NO_LAPACK) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () -if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "SINGLE") GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "SINGLE") GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "SINGLE") @@ -184,7 +184,7 @@ if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "SINGLE") GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "SINGLE") endif () -if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "DOUBLE") GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "DOUBLE") GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "DOUBLE") diff --git a/interface/Makefile b/interface/Makefile index fde6227bc..71393aaba 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -329,7 +329,10 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) + +CXERBLAOBJ = \ + cblas_xerbla.$(SUFFIX) @@ -391,6 +394,8 @@ ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) SHEXTOBJS += $(CSHEXTOBJS) + +CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) @@ -434,13 +439,11 @@ 
QLAPACKOBJS = \ # cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ # clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX) - CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) - #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ # zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ @@ -469,8 +472,42 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) + SBLASOBJS = dsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) strsm.$(SUFFIX) \ + sgetrs.$(SUFFIX) sgetrf.$(SUFFIX) spotf2.$(SUFFIX) spotrf.$(SUFFIX) \ + ssyrk.$(SUFFIX) sgemv.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX),1) + SBLASOBJS = \ + sdot.$(SUFFIX) srot.$(SUFFIX) snrm2.$(SUFFIX) sswap.$(SUFFIX) \ + isamax.$(SUFFIX) saxpy.$(SUFFIX) sscal.$(SUFFIX) scopy.$(SUFFIX) \ + sgemv.$(SUFFIX) sgemm.$(SUFFIX) +endif +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + DBLASOBJS = \ + ddot.$(SUFFIX) drot.$(SUFFIX) dnrm2.$(SUFFIX) dswap.$(SUFFIX) \ + idamax.$(SUFFIX) daxpy.$(SUFFIX) dscal.$(SUFFIX) dcopy.$(SUFFIX) \ + dgemv.$(SUFFIX) dgemm.$(SUFFIX) +endif +endif +ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS = cgetrs.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) cgetrf.$(SUFFIX) \ + cpotrf.$(SUFFIX) ctrsm.$(SUFFIX) cblas_cdotc_sub.$(SUFFIX) +endif +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif +FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +$(info FUNCOBJS = {[$(FUNCOBJS)]} ) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif @@ -481,6 +518,7 @@ endif FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) + include 
$(TOPDIR)/Makefile.tail all :: libs @@ -503,11 +541,14 @@ level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $( level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +aux : $(CBAUXOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ -$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) $(CBAUXOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -2268,3 +2309,4 @@ cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + From 886a8e319048ff92a923f989ca1a01b594b60808 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:57:32 +0200 Subject: [PATCH 522/593] Adapt for supporting only a subset of variable types --- driver/level3/CMakeLists.txt | 8 +++--- driver/level3/Makefile | 54 ++++++++++++++++++++++++++++++++++++ driver/level3/syrk_thread.c | 4 +-- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 46cbb0d6d..077862abc 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -14,7 +14,7 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif () endforeach () -if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) 
foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "DOUBLE") @@ -23,7 +23,7 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif() endforeach() endif() -if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "SINGLE") @@ -119,7 +119,7 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) string(TOLOWER ${gemm_define} gemm_define_LC) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) @@ -127,7 +127,7 @@ foreach (float_type ${FLOAT_TYPES}) endif() endforeach() endif () - if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) string(TOLOWER ${gemm_define} gemm_define_LC) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 09a62d9bf..e3aa30256 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -287,6 +287,60 @@ HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) + SBLASOBJS= \ + strsm_LNUU.$(SUFFIX) strsm_LNUN.$(SUFFIX) strsm_LNLU.$(SUFFIX) strsm_LNLN.$(SUFFIX) \ + strsm_LTUU.$(SUFFIX) strsm_LTUN.$(SUFFIX) strsm_LTLU.$(SUFFIX) strsm_LTLN.$(SUFFIX) \ + strsm_RNUU.$(SUFFIX) strsm_RNUN.$(SUFFIX) strsm_RNLU.$(SUFFIX) strsm_RNLN.$(SUFFIX) \ + strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ + ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) 
ssyrk_LT.$(SUFFIX) \ + ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) +endif +endif +ifeq ($(BUILD_COMPLEX),1) + SBLASOBJS = sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + DBLASOBJS = dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS= \ + cherk_UN.$(SUFFIX) cherk_UC.$(SUFFIX) cherk_LN.$(SUFFIX) cherk_LC.$(SUFFIX) \ + cherk_kernel_UN.$(SUFFIX) cherk_kernel_UC.$(SUFFIX) \ + cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ + ctrsm_LNUU.$(SUFFIX) ctrsm_LNUN.$(SUFFIX) ctrsm_LNLU.$(SUFFIX) ctrsm_LNLN.$(SUFFIX) \ + ctrsm_LTUU.$(SUFFIX) ctrsm_LTUN.$(SUFFIX) ctrsm_LTLU.$(SUFFIX) ctrsm_LTLN.$(SUFFIX) \ + ctrsm_LRUU.$(SUFFIX) ctrsm_LRUN.$(SUFFIX) ctrsm_LRLU.$(SUFFIX) ctrsm_LRLN.$(SUFFIX) \ + ctrsm_LCUU.$(SUFFIX) ctrsm_LCUN.$(SUFFIX) ctrsm_LCLU.$(SUFFIX) ctrsm_LCLN.$(SUFFIX) \ + ctrsm_RNUU.$(SUFFIX) ctrsm_RNUN.$(SUFFIX) ctrsm_RNLU.$(SUFFIX) ctrsm_RNLN.$(SUFFIX) \ + ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ + ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ + ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) cherk_thread_LC.$(SUFFIX) 
+endif +endif +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif + all :: shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 753cdb5ca..12808afd5 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -56,12 +56,12 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (!(mode & BLAS_COMPLEX)) { switch (mode & BLAS_PREC) { -#ifdef BUILD_SINGLE +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) case BLAS_SINGLE: mask = SGEMM_UNROLL_MN - 1; break; #endif -#ifdef BUILD_DOUBLE +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) case BLAS_DOUBLE: mask = DGEMM_UNROLL_MN - 1; break; From 887e00fd7fc328fb647bdc9aa2feb18898092a73 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:58:57 +0200 Subject: [PATCH 523/593] Adapt for supporting only a subset of variable types --- driver/level2/CMakeLists.txt | 4 +- driver/level2/Makefile | 82 +++++++++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 4 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index f72e707e1..61367e596 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -197,13 +197,13 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () -if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "SINGLE") endif () endif () -if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "DOUBLE") GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "DOUBLE") diff --git a/driver/level2/Makefile 
b/driver/level2/Makefile index 79c4ca153..7212d6662 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -417,19 +417,63 @@ XBLASOBJS += \ endif +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) +ifdef SMP +SBLASOBJS += \ + sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) \ + strsv_NUU.$(SUFFIX) strsv_NUN.$(SUFFIX) strsv_NLU.$(SUFFIX) strsv_NLN.$(SUFFIX) \ + strsv_TUU.$(SUFFIX) strsv_TUN.$(SUFFIX) strsv_TLU.$(SUFFIX) strsv_TLN.$(SUFFIX) +endif +endif +ifeq ($(BUILD_COMPLEX),1) +ifdef SMP + SBLASOBJS = sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) +ifdef SMP + DBLASOBJS = dgemv_thread_n.$(SUFFIX) dgemv_thread_t.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS= \ + ctrsv_NUU.$(SUFFIX) ctrsv_NUN.$(SUFFIX) ctrsv_NLU.$(SUFFIX) ctrsv_NLN.$(SUFFIX) \ + ctrsv_TUU.$(SUFFIX) ctrsv_TUN.$(SUFFIX) ctrsv_TLU.$(SUFFIX) ctrsv_TLN.$(SUFFIX) \ + ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ + ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) +endif +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif + all :: +ifeq ($(BUILD_SINGLE),1) + sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< +endif + +ifeq ($(BUILD_DOUBLE),1) dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< +endif qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< @@ -437,6 +481,8 @@ qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c 
-UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< +ifeq ($(BUILD_COMPLEX),1) + cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -460,6 +506,9 @@ cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif + +ifeq ($(BUILD_COMPLEX16),1) zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -484,6 +533,7 @@ zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -509,24 +559,34 @@ xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +ifeq ($(BUILD_SINGLE),1) + sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< +endif + + +ifeq ($(BUILD_DOUBLE),1) dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< - +endif qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +ifeq ($(BUILD_COMPLEX),1) + cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c 
$(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -550,6 +610,10 @@ cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif + + +ifeq ($(BUILD_COMPLEX16),1) zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -574,6 +638,7 @@ zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -599,24 +664,32 @@ xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) +endif + +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) - +endif qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ 
-UXCONJ $< -o $(@F) qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +ifeq ($(BUILD_COMPLEX),1) + cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) @@ -640,6 +713,10 @@ cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common. cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) +endif + + +ifeq ($(BUILD_COMPLEX16),1) zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) @@ -664,6 +741,7 @@ zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common. zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) +endif xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) From 68e6823d36a2e727c6db7bf850ba2b05b204a04a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 15:01:32 +0200 Subject: [PATCH 524/593] Adapt for supporting only a subset of variable types --- cmake/arch.cmake | 3 +-- cmake/lapack.cmake | 28 +++++++++++++++++----------- cmake/system.cmake | 23 +++++++++-------------- cmake/system_check.cmake | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c048f13d1..99e685d04 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -83,8 +83,7 @@ if (DYNAMIC_ARCH) endif () endif () - CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP) - if (TRAP) + if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) message (FATAL_ERROR 
"Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") endif () diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 18a74d18e..73f2592ef 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,11 +1,12 @@ # Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f ../INSTALL/ilaver.f xerbla_array.f ../INSTALL/slamch.f) set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f @@ -25,6 +26,7 @@ set(SCLAUX set(DZLAUX dbdsdc.f + dbdsvdx.f dstevx.f dstein.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f @@ -35,14 +37,14 @@ set(DZLAUX dlartg.f dlaruv.f dlas2.f dlascl.f dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f dsteqr.f dsterf.f dlaisnan.f disnan.f dlartgp.f dlartgs.f ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) set(SLASRC - sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f @@ -83,8 +85,8 @@ set(SLASRC ssbev.f 
ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f - sstevx.f ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f ssyswapr.f ssytrs.f ssytrs2.f @@ -116,7 +118,7 @@ set(SLASRC ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - scombssq.f sgesvdq.f slaorhr_col_getrfnp.f + sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f @@ -229,7 +231,7 @@ set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) set(DLASRC - dbdsvdx.f dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f @@ -270,8 +272,8 @@ set(DLASRC dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f - dstevx.f dsycon.f dsyev.f dsyevd.f dsyevr.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f dsysv.f dsysvx.f dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f @@ -474,12 +476,16 @@ endif() 
if(BUILD_COMPLEX) set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Complex Precision") + message(STATUS "Building Single Precision Complex") endif() if(BUILD_COMPLEX16) set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) - message(STATUS "Building Double Complex Precision") +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") endif() # add lapack-netlib folder to the sources diff --git a/cmake/system.cmake b/cmake/system.cmake index 3729f6c62..a504530fb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,9 +70,6 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() endif() if (DEFINED TARGET) @@ -326,13 +323,7 @@ else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () endif () -if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") -if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) -if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) -set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") -endif() -endif() -endif() + if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -410,16 +401,20 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO set (BUILD_COMPLEX16 ON) endif() if (BUILD_SINGLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif() if (BUILD_DOUBLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif() if (BUILD_COMPLEX) 
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1") endif() if (BUILD_COMPLEX16) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1") endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index b0ab926fc..fdc79c8ce 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -121,6 +121,6 @@ endif() include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) -if (HAVE_C11) +if (HAVE_C11 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") endif() From e396ec8b56511d84930e849b08d825af62b821a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 15:11:15 +0200 Subject: [PATCH 525/593] Allow building support for only a subset of variable types --- CMakeLists.txt | 28 +++++---- Makefile | 15 ++++- Makefile.rule | 32 +++------- Makefile.tail | 4 +- common_param.h | 166 ++++++++++++++++++++++++++++++++++++++++++------- 5 files changed, 182 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 954c053e4..f43e0e0fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,8 +29,10 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() -option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) -option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_SINGLE "Single precision" OFF) +option(BUILD_DOUBLE "Double precision" OFF) +option(BUILD_COMPLEX "Single precision" OFF) +option(BUILD_COMPLEX16 "Single precision" OFF) # Add a prefix or suffix to all 
exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -108,28 +110,33 @@ endif() set(FLOAT_TYPES "") if (BUILD_SINGLE) - message(STATUS "Building Single Precision") - list(APPEND FLOAT_TYPES "SINGLE") # defines nothing + message(STATUS "Building Songle Precision") + list(APPEND FLOAT_TYPES "SINGLE") + # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") - list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE + list(APPEND FLOAT_TYPES "DOUBLE") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") - list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX -endif () + list(APPEND FLOAT_TYPES "COMPLEX") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") +endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") - list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE + list(APPEND FLOAT_TYPES "ZCOMPLEX") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") endif () if (BUILD_HALF) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") # defines nothing + list(APPEND FLOAT_TYPES "HALF") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -236,9 +243,6 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) - if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_subdirectory(cpp_thread_test) - endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES diff --git a/Makefile b/Makefile index 93e8af2eb..a9af62a22 100644 --- a/Makefile +++ b/Makefile @@ -146,9 +146,6 @@ ifneq ($(NO_CBLAS), 1) ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all endif -ifeq ($(CPP_THREAD_SAFETY_GEMV), 1) - $(MAKE) -C cpp_thread_test dgemv_tester -endif endif endif @@ -304,6 
+301,18 @@ else endif ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_SINGLE), 1) + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_DOUBLE), 1) + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX), 1) + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX16), 1) + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.rule b/Makefile.rule index 635e02c02..09dfb0881 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,33 +272,17 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 -# -# use this to run only the less memory-hungry GEMV test -# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 - - -# Set the thread number threshold beyond which the job array for the threaded level3 BLAS -# will be allocated on the heap rather than the stack. (This array alone requires -# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu -# counts, but obviously it is not the only item that ends up on the stack. -# The default value of 32 ensures that the overall requirement is compatible -# with the default 1MB stacksize imposed by having the Java VM loaded without use -# of its -Xss parameter. -# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible -# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java -# VM e.g. 
in Octave or with the java-based libhdfs in numpy or scipy code -# BLAS3_MEM_ALLOC_THRESHOLD = 160 - - - -# the below is not yet configurable, use cmake if you need to build only select types -BUILD_SINGLE = 1 -BUILD_DOUBLE = 1 -BUILD_COMPLEX = 1 -BUILD_COMPLEX16 = 1 +# +# Select if you need to build only select types +# BUILD_SINGLE = 1 +# BUILD_DOUBLE = 1 +# BUILD_COMPLEX = 1 +# BUILD_COMPLEX16 = 1 +# +# # End of user configuration # diff --git a/Makefile.tail b/Makefile.tail index cfc4a36fc..641082450 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -11,8 +11,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) diff --git a/common_param.h b/common_param.h index a52de98ab..81b479e53 100644 --- a/common_param.h +++ b/common_param.h @@ -146,40 +146,56 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif + +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; +#endif int exclusive_cache; +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG 
(*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); +#endif +#if BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) 
(BLASLONG M, BLASLONG N, BLASLONG K); @@ -193,7 +209,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -215,7 +232,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - +#endif +#if BUILD_SINGLE int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -242,13 +260,18 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int 
dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -257,25 +280,37 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); +#endif +#if BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +#endif +#if BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); 
int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -283,7 +318,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - +#endif +#if BUILD_DOUBLE int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -335,7 +371,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - +#endif #ifdef EXPRECISION int qgemm_p, qgemm_q, qgemm_r; @@ -430,22 +466,29 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); +#endif +#if BUILD_COMPLEX float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*ccopy_k) (BLASLONG, float *, 
BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if BUILD_COMPLEX int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - +#endif +#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -459,6 +502,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -470,13 +515,14 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); 
+#endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -507,6 +553,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -590,10 +638,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - +#endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if 
BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -757,6 +808,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +#endif #ifdef EXPRECISION @@ -930,22 +982,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - +#if BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, 
BLASLONG); @@ -955,7 +1019,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -965,17 +1031,23 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -985,7 +1057,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, 
BLASLONG); int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -995,12 +1069,20 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); +#endif +#if BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); +#endif +#if BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - +#endif } gotoblas_t; extern gotoblas_t *gotoblas; @@ -1021,19 +1103,31 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif +#if (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#if (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> 
dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#if ! (BUILD_SINGLE) +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R gotoblas -> sgemm_r +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#endif #define QGEMM_P gotoblas -> qgemm_p #define QGEMM_Q gotoblas -> qgemm_q @@ -1042,19 +1136,47 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn +#if BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#ifndef BUILD_SINGLE +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R 1024 +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#endif +#if BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn +#ifndef BUILD_DOUBLE +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R 1024 +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif +#ifndef BUILD_COMPLEX +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas 
-> cgemm_unroll_mn +#endif +#endif #define XGEMM_P gotoblas -> xgemm_p #define XGEMM_Q gotoblas -> xgemm_q @@ -1222,7 +1344,7 @@ extern gotoblas_t *gotoblas; #endif #ifndef COMPLEX -#if defined(XDOUBLE) +#if (XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R @@ -1246,7 +1368,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) +#elif (HALF) #define GEMM_P SHGEMM_P #define GEMM_Q SHGEMM_Q #define GEMM_R SHGEMM_R @@ -1272,7 +1394,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else -#if defined(XDOUBLE) +#if (XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R @@ -1386,7 +1508,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P @@ -1396,7 +1518,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q @@ -1406,7 +1528,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R From 5f23bdf437e6605f2fd36e3500026501f79eb134 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 17:23:08 +0200 Subject: [PATCH 526/593] remove debug output --- test/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index a3966756d..069d7880a 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,7 +7,6 @@ all :: else all :: level1 level2 level3 endif -$(info buildvars [$(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16)]) ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) level1: sblat1 dblat1 cblat1 zblat1 endif From 8c5e08076ea8779d19f072254bcaadd15b495acc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 17:33:51 +0200 Subject: [PATCH 527/593] If none of the BUILD_ options is set, enable them all --- Makefile.system | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index c46c88581..501b161ae 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,7 +9,7 @@ ifndef TOPDIR TOPDIR = . endif -# If ARCH is not set, we use the host system's architecture for getarch compile options. + # If ARCH is not set, we use the host system's architecture for getarch compile options. ifndef ARCH HOSTARCH := $(shell uname -m) else @@ -73,6 +73,18 @@ endif # # Beginning of system configuration # +ifneq ($(BUILD_SINGLE),1) +ifneq ($(BUILD_DOUBLE),1) +ifneq ($(BUILD_COMPLEX),1) +ifneq ($(BUILD_COMPLEX16),1) +override BUILD_SINGLE=1 +override BUILD_DOUBLE=1 +override BUILD_COMPLEX=1 +override BUILD_COMPLEX16=1 +endif +endif +endif +endif ifndef HOSTCC HOSTCC = $(CC) From 1da32cc1fc3b7602619f56e6243aaa7e225b504d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 17:45:41 +0200 Subject: [PATCH 528/593] Add cblas_xerbla interface --- Makefile.tail | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.tail b/Makefile.tail index cfc4a36fc..641082450 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -11,8 +11,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) 
$(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) From ae8b0d257a134b5630248f97b803a090bc51e31a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 18:08:21 +0200 Subject: [PATCH 529/593] Set BUILD_ options to 1 instead of just defining them --- Makefile.system | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 501b161ae..eb6e14a98 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1236,16 +1236,16 @@ ifeq ($(BUILD_HALF), 1) CCOMMON_OPT += -DBUILD_HALF endif ifeq ($(BUILD_SINGLE), 1) -CCOMMON_OPT += -DBUILD_SINGLE +CCOMMON_OPT += -DBUILD_SINGLE=1 endif ifeq ($(BUILD_DOUBLE), 1) -CCOMMON_OPT += -DBUILD_DOUBLE +CCOMMON_OPT += -DBUILD_DOUBLE=1 endif ifeq ($(BUILD_COMPLEX), 1) -CCOMMON_OPT += -DBUILD_COMPLEX +CCOMMON_OPT += -DBUILD_COMPLEX=1 endif ifeq ($(BUILD_COMPLEX16), 1) -CCOMMON_OPT += -DBUILD_COMPLEX16 +CCOMMON_OPT += -DBUILD_COMPLEX16=1 endif CCOMMON_OPT += -DVERSION=\"$(VERSION)\" From 6154f72d6dc241260a53f9a9e424f18dd3f0f943 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 18:25:16 +0200 Subject: [PATCH 530/593] Copy BUILD_ settings to the LAPACK make.inc --- Makefile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Makefile b/Makefile index 93e8af2eb..6e7b31b1a 100644 --- a/Makefile +++ b/Makefile @@ -304,6 +304,18 @@ else endif ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_SINGLE), 1) + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_DOUBLE), 1) + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX), 1) + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX16), 1) + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> 
$(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc From caa0d757cac13c59fa9ff763f4ccc91d73ffc5c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 18:29:34 +0200 Subject: [PATCH 531/593] repair TABs --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 6e7b31b1a..22f7314d9 100644 --- a/Makefile +++ b/Makefile @@ -306,16 +306,16 @@ ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_SINGLE), 1) - -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_DOUBLE), 1) - -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_COMPLEX), 1) - -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_COMPLEX16), 1) - -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc From d314d1f49f1cde993b1daa53a748303d853b4503 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:37:38 +0200 Subject: [PATCH 532/593] Rename shgemm_kernel_power10.c to sbgemm_kernel_power10.c --- kernel/power/{shgemm_kernel_power10.c => sbgemm_kernel_power10.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/power/{shgemm_kernel_power10.c => sbgemm_kernel_power10.c} (100%) diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c similarity index 100% rename from kernel/power/shgemm_kernel_power10.c rename to kernel/power/sbgemm_kernel_power10.c From 9ae80490e050c2526ce426b9557ff1a981142218 Mon Sep 17 00:00:00 2001 From: Martin Kroeker 
Date: Sun, 11 Oct 2020 23:39:42 +0200 Subject: [PATCH 533/593] rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/power/sbgemm_kernel_power10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 1ae9e04bf..46d82598a 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" #include -#if defined(HALF) && defined(HALFCONVERSION) +#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) static float bfloat16tof32 (bfloat16 f16) { @@ -131,7 +131,7 @@ vector char mask = #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); /************************************************************************************* -* SHGEMM Kernel +* SBGEMM Kernel *************************************************************************************/ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, From d7dd9b396c3385e7eeb63cafb38778e74e31f16f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:40:43 +0200 Subject: [PATCH 534/593] Rename shdot.c to sbdot.c --- kernel/x86_64/{shdot.c => sbdot.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/x86_64/{shdot.c => sbdot.c} (100%) diff --git a/kernel/x86_64/shdot.c b/kernel/x86_64/sbdot.c similarity index 100% rename from kernel/x86_64/shdot.c rename to kernel/x86_64/sbdot.c From 68ce719faca5b17a3fd91ace87f474e6b255d358 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:41:13 +0200 Subject: [PATCH 535/593] Rename shdot_microk_cooperlake.c to sbdot_microk_cooperlake.c --- .../{shdot_microk_cooperlake.c => sbdot_microk_cooperlake.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 
kernel/x86_64/{shdot_microk_cooperlake.c => sbdot_microk_cooperlake.c} (100%) diff --git a/kernel/x86_64/shdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c similarity index 100% rename from kernel/x86_64/shdot_microk_cooperlake.c rename to kernel/x86_64/sbdot_microk_cooperlake.c From fd942360421e0cfb82220042a2396479e9ec3383 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:42:07 +0200 Subject: [PATCH 536/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/x86_64/sbdot.c | 18 +++++++++--------- kernel/x86_64/sbdot_microk_cooperlake.c | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/x86_64/sbdot.c b/kernel/x86_64/sbdot.c index 5073fda2a..ef14fd618 100644 --- a/kernel/x86_64/sbdot.c +++ b/kernel/x86_64/sbdot.c @@ -28,16 +28,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(COOPERLAKE) -#include "shdot_microk_cooperlake.c" +#include "sbdot_microk_cooperlake.c" #endif -static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) +static float sbdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) { float d = 0.0; -#ifdef HAVE_SHDOT_ACCL_KERNEL +#ifdef HAVE_SBDOT_ACCL_KERNEL if ((inc_x == 1) && (inc_y == 1)) { - return shdot_accl_kernel(n, x, y); + return sbdot_accl_kernel(n, x, y); } #endif @@ -56,11 +56,11 @@ static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, } #if defined(SMP) -static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, +static int sbdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, float *result, BLASLONG dummy3) { - *(float *)result = shdot_compute(n, x, inc_x, y, inc_y); + *(float *)result = sbdot_compute(n, x, inc_x, y, inc_y); return 0; } @@ -94,13 +94,13 @@ float CNAME(BLASLONG n, bfloat16 *x, 
BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y } if (nthreads <= 1) { - dot_result = shdot_compute(n, x, inc_x, y, inc_y); + dot_result = sbdot_compute(n, x, inc_x, y, inc_y); } else { char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2]; int mode = BLAS_BFLOAT16 | BLAS_REAL; blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, thread_result, 0, - (void *)shdot_thread_func, nthreads); + (void *)sbdot_thread_func, nthreads); float * ptr = (float *)thread_result; for (int i = 0; i < nthreads; i++) { dot_result += (*ptr); @@ -108,7 +108,7 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y } } #else - dot_result = shdot_compute(n, x, inc_x, y, inc_y); + dot_result = sbdot_compute(n, x, inc_x, y, inc_y); #endif return dot_result; diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index e645296f1..067726cb1 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ #if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) -#define HAVE_SHDOT_ACCL_KERNEL 1 +#define HAVE_SBDOT_ACCL_KERNEL 1 #include "common.h" #include -static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) +static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) { __m128 accum128 = _mm_setzero_ps(); if (n> 127) { /* n range from 128 to inf. 
*/ From 4db09c6cec22711f8ec1588bc9d01d7db9e91478 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:42:45 +0200 Subject: [PATCH 537/593] Rename compare_sgemm_shgemm.c to compare_sgemm_sbgemm.c --- test/{compare_sgemm_shgemm.c => compare_sgemm_sbgemm.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{compare_sgemm_shgemm.c => compare_sgemm_sbgemm.c} (100%) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_sbgemm.c similarity index 100% rename from test/compare_sgemm_shgemm.c rename to test/compare_sgemm_sbgemm.c From 924fd806d0737ab6faabab8fab26a102073ebbfd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:43:36 +0200 Subject: [PATCH 538/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- test/compare_sgemm_sbgemm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 57aee7b8f..3d4eb2515 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "../common.h" #define SGEMM BLASFUNC(sgemm) -#define SHGEMM BLASFUNC(shgemm) +#define SBGEMM BLASFUNC(sbgemm) typedef union { unsigned short v; @@ -102,7 +102,7 @@ main (int argc, char *argv[]) } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) @@ -126,6 +126,6 @@ main (int argc, char *argv[]) } } if (ret != 0) - fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); + fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); return ret; } From 5800758b43e26d873a89ace25bafff947980a5c9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:44:38 +0200 Subject: [PATCH 539/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- test/Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/Makefile b/test/Makefile index 069d7880a..06fb7fe86 100644 --- a/test/Makefile +++ b/test/Makefile @@ -214,16 +214,16 @@ endif -#ifeq ($(BUILD_HALF),1) -#level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +#ifeq ($(BUILD_BFLOAT16),1) +#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 #else #level3 : sblat3 dblat3 cblat3 zblat3 #endif ifndef CROSS rm -f ?BLAT3.SUMM -ifeq ($(BUILD_HALF),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) @@ -245,8 +245,8 @@ endif ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_HALF),1) - OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) @@ -266,8 +266,8 @@ ifeq 
($(BUILD_COMPLEX16),1) @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif else -ifeq ($(BUILD_HALF),1) - OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) @@ -377,9 +377,9 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif -ifeq ($(BUILD_HALF),1) -test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_BFLOAT16),1) +test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1) @@ -398,7 +398,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - test_shgemm sblat3 dblat3 cblat3 zblat3 \ + test_sbgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ From ca31c32693bbb70cd8eeee5f2be09a7e9d1b363c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:49:22 +0200 Subject: [PATCH 540/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- cblas.h | 2 +- common.h | 4 +- common_interface.h | 4 +- common_level1.h | 2 +- common_level3.h | 28 +++--- common_macro.h | 94 +++++++++--------- common_param.h | 230 ++++++++++++++++++--------------------------- getarch_2nd.c | 4 +- param.h | 32 +++---- 9 files changed, 178 insertions(+), 222 deletions(-) diff --git a/cblas.h b/cblas.h index 21f3958f2..4fc6f8681 100644 --- a/cblas.h +++ b/cblas.h @@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE /* convert BFLOAT16 array to double array */ void 
cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ -float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); +float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/common.h b/common.h index ab287262c..89eeb197d 100644 --- a/common.h +++ b/common.h @@ -260,7 +260,7 @@ typedef unsigned long BLASULONG; #ifndef BFLOAT16 #include typedef uint16_t bfloat16; -#define HALFCONVERSION 1 +#define BFLOAT16CONVERSION 1 #endif #ifdef USE64BITINT @@ -303,7 +303,7 @@ typedef int blasint; #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 -#elif defined(HALF) +#elif defined(BFLOAT16) #define IFLOAT bfloat16 #define XFLOAT IFLOAT #define FLOAT float diff --git a/common_interface.h b/common_interface.h index 35a957aa1..bee09e894 100644 --- a/common_interface.h +++ b/common_interface.h @@ -54,7 +54,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); +float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); @@ -474,7 +474,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ -void BLASFUNC(shgemm)(char *, 
char *, blasint *, blasint *, blasint *, float *, +void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *, bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/common_level1.h b/common_level1.h index 88aa275a5..7b17962c4 100644 --- a/common_level1.h +++ b/common_level1.h @@ -46,7 +46,7 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); +float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/common_level3.h b/common_level3.h index 671a7a086..c4f9435a9 100644 --- a/common_level3.h +++ b/common_level3.h @@ -55,7 +55,7 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); -int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, +int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif -int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int 
shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); @@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); -int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); +int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); -int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tn(blas_arg_t *, 
BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); @@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif -int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 3d6bcd9e8..605d74ada 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,7 +644,7 @@ #define GEADD_K DGEADD_K -#elif defined(HALF) +#elif defined(BFLOAT16) 
#define D_TO_BF16_K SHDTOBF16_K #define D_BF16_TO_K DBF16TOD_K @@ -662,7 +662,7 @@ #define ASUM_K SASUM_K #define DOTU_K SDOTU_K #define DOTC_K SDOTC_K -#define BF16_DOT_K SHDOT_K +#define BF16_DOT_K SBDOT_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K @@ -682,32 +682,32 @@ #define NRM2_K SNRM2_K #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L -#define GEMM_BETA SHGEMM_BETA -#define GEMM_KERNEL_N SHGEMM_KERNEL -#define GEMM_KERNEL_L SHGEMM_KERNEL -#define GEMM_KERNEL_R SHGEMM_KERNEL -#define GEMM_KERNEL_B SHGEMM_KERNEL - -#define GEMM_NN SHGEMM_NN -#define GEMM_CN SHGEMM_TN -#define GEMM_TN SHGEMM_TN -#define GEMM_NC SHGEMM_NT -#define GEMM_NT SHGEMM_NT -#define GEMM_CC SHGEMM_TT -#define GEMM_CT SHGEMM_TT -#define GEMM_TC SHGEMM_TT -#define GEMM_TT SHGEMM_TT -#define GEMM_NR SHGEMM_NN -#define GEMM_TR SHGEMM_TN -#define GEMM_CR SHGEMM_TN -#define GEMM_RN SHGEMM_NN -#define GEMM_RT SHGEMM_NT -#define GEMM_RC SHGEMM_NT -#define GEMM_RR SHGEMM_NN -#define GEMM_ONCOPY SHGEMM_ONCOPY -#define GEMM_OTCOPY SHGEMM_OTCOPY -#define GEMM_INCOPY SHGEMM_INCOPY -#define GEMM_ITCOPY SHGEMM_ITCOPY +#define GEMM_BETA SBGEMM_BETA +#define GEMM_KERNEL_N SBGEMM_KERNEL +#define GEMM_KERNEL_L SBGEMM_KERNEL +#define GEMM_KERNEL_R SBGEMM_KERNEL +#define GEMM_KERNEL_B SBGEMM_KERNEL + +#define GEMM_NN SBGEMM_NN +#define GEMM_CN SBGEMM_TN +#define GEMM_TN SBGEMM_TN +#define GEMM_NC SBGEMM_NT +#define GEMM_NT SBGEMM_NT +#define GEMM_CC SBGEMM_TT +#define GEMM_CT SBGEMM_TT +#define GEMM_TC SBGEMM_TT +#define GEMM_TT SBGEMM_TT +#define GEMM_NR SBGEMM_NN +#define GEMM_TR SBGEMM_TN +#define GEMM_CR SBGEMM_TN +#define GEMM_RN SBGEMM_NN +#define GEMM_RT SBGEMM_NT +#define GEMM_RC SBGEMM_NT +#define GEMM_RR SBGEMM_NN +#define GEMM_ONCOPY SBGEMM_ONCOPY +#define GEMM_OTCOPY SBGEMM_OTCOPY +#define GEMM_INCOPY SBGEMM_INCOPY +#define GEMM_ITCOPY SBGEMM_ITCOPY #define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LL SSYMM_THREAD_LL #define 
SYMM_THREAD_RU SSYMM_THREAD_RU @@ -723,22 +723,22 @@ #define HEMM_THREAD_RU SHEMM_THREAD_RU #define HEMM_THREAD_RL SHEMM_THREAD_RL -#define GEMM_THREAD_NN SHGEMM_THREAD_NN -#define GEMM_THREAD_CN SHGEMM_THREAD_TN -#define GEMM_THREAD_TN SHGEMM_THREAD_TN -#define GEMM_THREAD_NC SHGEMM_THREAD_NT -#define GEMM_THREAD_NT SHGEMM_THREAD_NT -#define GEMM_THREAD_CC SHGEMM_THREAD_TT -#define GEMM_THREAD_CT SHGEMM_THREAD_TT -#define GEMM_THREAD_TC SHGEMM_THREAD_TT -#define GEMM_THREAD_TT SHGEMM_THREAD_TT -#define GEMM_THREAD_NR SHGEMM_THREAD_NN -#define GEMM_THREAD_TR SHGEMM_THREAD_TN -#define GEMM_THREAD_CR SHGEMM_THREAD_TN -#define GEMM_THREAD_RN SHGEMM_THREAD_NN -#define GEMM_THREAD_RT SHGEMM_THREAD_NT -#define GEMM_THREAD_RC SHGEMM_THREAD_NT -#define GEMM_THREAD_RR SHGEMM_THREAD_NN +#define GEMM_THREAD_NN SBGEMM_THREAD_NN +#define GEMM_THREAD_CN SBGEMM_THREAD_TN +#define GEMM_THREAD_TN SBGEMM_THREAD_TN +#define GEMM_THREAD_NC SBGEMM_THREAD_NT +#define GEMM_THREAD_NT SBGEMM_THREAD_NT +#define GEMM_THREAD_CC SBGEMM_THREAD_TT +#define GEMM_THREAD_CT SBGEMM_THREAD_TT +#define GEMM_THREAD_TC SBGEMM_THREAD_TT +#define GEMM_THREAD_TT SBGEMM_THREAD_TT +#define GEMM_THREAD_NR SBGEMM_THREAD_NN +#define GEMM_THREAD_TR SBGEMM_THREAD_TN +#define GEMM_THREAD_CR SBGEMM_THREAD_TN +#define GEMM_THREAD_RN SBGEMM_THREAD_NN +#define GEMM_THREAD_RT SBGEMM_THREAD_NT +#define GEMM_THREAD_RC SBGEMM_THREAD_NT +#define GEMM_THREAD_RR SBGEMM_THREAD_NN #ifdef UNIT @@ -2491,9 +2491,9 @@ #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; -extern BLASLONG shgemm_p; -extern BLASLONG shgemm_q; -extern BLASLONG shgemm_r; +extern BLASLONG sbgemm_p; +extern BLASLONG sbgemm_q; +extern BLASLONG sbgemm_r; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_param.h b/common_param.h index 0fe5e6c1d..361523081 100644 --- a/common_param.h +++ 
b/common_param.h @@ -47,9 +47,9 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#ifdef BUILD_HALF - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; +#ifdef BUILD_BFLOAT16 + int sbgemm_p, sbgemm_q, sbgemm_r; + int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); @@ -69,8 +69,8 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*shsum_k) (BLASLONG, float *, BLASLONG); int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); + double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); @@ -78,20 +78,20 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int 
(*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -147,14 +147,14 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; #endif int exclusive_cache; -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, 
BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); @@ -167,11 +167,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); #endif - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -179,26 +178,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif - -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || 
(BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); @@ -213,8 +206,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif - -#if (BUILD_SINGLE) || (BUILD_DOUBLE) +#ifdef BUILD_SINGLE int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -236,8 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); -#endif -#if BUILD_SINGLE + int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -264,18 +255,17 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) + int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); 
int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -286,21 +276,21 @@ BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -308,15 +298,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, 
BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif - -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif - -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -325,8 +313,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif - -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -473,30 +460,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif - -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) +#ifdef BUILD_COMPLEX int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; + float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); -#endif -#if BUILD_COMPLEX float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) 
(BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if BUILD_COMPLEX int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); -#endif -#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) + int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -510,8 +490,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -523,14 +501,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int 
(*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -561,8 +538,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -646,14 +621,12 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); -#endif -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) + int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); 
int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif - -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -991,35 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, 
float*, BLASLONG); @@ -1031,7 +1003,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -1043,21 +1015,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -1069,7 +1041,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, 
BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -1081,16 +1053,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif } gotoblas_t; @@ -1104,16 +1076,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache -#ifdef BUILD_HALF -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P gotoblas -> sbgemm_p +#define SBGEMM_Q gotoblas -> sbgemm_q +#define SBGEMM_R gotoblas -> sbgemm_r +#define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m +#define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n +#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn #endif -#if (BUILD_SINGLE) +#if defined (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -1122,21 +1094,13 @@ extern gotoblas_t *gotoblas; #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #endif -#if (BUILD_DOUBLE) +#if defined (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define 
DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn -#if ! (BUILD_SINGLE) -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn -#endif #endif #define QGEMM_P gotoblas -> qgemm_p @@ -1146,7 +1110,7 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r @@ -1163,7 +1127,7 @@ extern gotoblas_t *gotoblas; #endif #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r @@ -1178,14 +1142,6 @@ extern gotoblas_t *gotoblas; #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #endif -#ifndef BUILD_COMPLEX -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn -#endif #endif #define XGEMM_P gotoblas -> xgemm_p @@ -1230,16 +1186,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif -#ifdef BUILD_HALF -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P SBGEMM_DEFAULT_P +#define SBGEMM_Q SBGEMM_DEFAULT_Q +#define 
SBGEMM_R SBGEMM_DEFAULT_R +#define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N +#ifdef SBGEMM_DEFAULT_UNROLL_MN +#define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN #else -#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N)) #endif #endif @@ -1354,7 +1310,7 @@ extern gotoblas_t *gotoblas; #endif #ifndef COMPLEX -#if (XDOUBLE) +#if defined(XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R @@ -1378,18 +1334,18 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif (HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#elif defined(BFLOAT16) +#define GEMM_P SBGEMM_P +#define GEMM_Q SBGEMM_Q +#define GEMM_R SBGEMM_R +#define GEMM_UNROLL_M SBGEMM_UNROLL_M +#define GEMM_UNROLL_N SBGEMM_UNROLL_N +#define GEMM_UNROLL_MN SBGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SBGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SBGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1404,7 +1360,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else -#if (XDOUBLE) +#if defined(XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R @@ -1475,8 +1431,8 @@ extern gotoblas_t *gotoblas; #define GEMM_THREAD gemm_thread_n 
#endif -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#ifndef SBGEMM_DEFAULT_R +#define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R @@ -1518,7 +1474,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P @@ -1528,7 +1484,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q @@ -1538,7 +1494,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R diff --git a/getarch_2nd.c b/getarch_2nd.c index a1d0ccac8..c390ef52c 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -9,8 +9,8 @@ int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { - printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); - printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); + printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); + printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/param.h b/param.h index 1ab982dc5..f3ddde6a1 100644 --- a/param.h +++ b/param.h @@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef PARAM_H #define PARAM_H -#define SHGEMM_DEFAULT_UNROLL_N 4 -#define SHGEMM_DEFAULT_UNROLL_M 8 -#define SHGEMM_DEFAULT_UNROLL_MN 32 -#define SHGEMM_DEFAULT_P 256 -#define SHGEMM_DEFAULT_R 256 -#define SHGEMM_DEFAULT_Q 256 +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_DEFAULT_UNROLL_MN 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_R 256 +#define SBGEMM_DEFAULT_Q 256 #ifdef OPTERON #define SNUMOPT 4 @@ -2426,16 +2426,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) -#undef SHGEMM_DEFAULT_UNROLL_N -#undef SHGEMM_DEFAULT_UNROLL_M -#undef SHGEMM_DEFAULT_P -#undef SHGEMM_DEFAULT_R -#undef SHGEMM_DEFAULT_Q -#define SHGEMM_DEFAULT_UNROLL_M 16 -#define SHGEMM_DEFAULT_UNROLL_N 8 -#define SHGEMM_DEFAULT_P 832 -#define SHGEMM_DEFAULT_Q 1026 -#define SHGEMM_DEFAULT_R 4096 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_UNROLL_N 8 +#define SBGEMM_DEFAULT_P 832 +#define SBGEMM_DEFAULT_Q 1026 +#define SBGEMM_DEFAULT_R 4096 #endif #if defined(SPARC) && defined(V7) From 573508f0ee04d890dcaf2307728063d2d23371de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:50:54 +0200 Subject: [PATCH 541/593] Rename common_sh.h to common_sb.h --- common_sh.h => common_sb.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename common_sh.h => common_sb.h (100%) diff --git a/common_sh.h b/common_sb.h similarity index 100% rename from common_sh.h rename to common_sb.h From 3bc8e8c33404d4d3b8f5bd35c662f53fb1c6285c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:51:34 +0200 Subject: [PATCH 542/593] Rename "HALF" and "sh" to "BFLOAT16"and "sb" --- common_sb.h | 110 ++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/common_sb.h 
b/common_sb.h index 5dc99b3bd..66968ab00 100644 --- a/common_sb.h +++ b/common_sb.h @@ -1,77 +1,77 @@ -#ifndef COMMON_SH_H -#define COMMON_SH_H +#ifndef COMMON_SB_H +#define COMMON_SB_H #ifndef DYNAMIC_ARCH -#define SHDOT_K shdot_k -#define SHSTOBF16_K shstobf16_k -#define SHDTOBF16_K shdtobf16_k +#define SBDOT_K sbdot_k +#define SBSTOBF16_K sbstobf16_k +#define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k -#define SHGEMM_ONCOPY shgemm_oncopy -#define SHGEMM_OTCOPY shgemm_otcopy +#define SBGEMM_ONCOPY sbgemm_oncopy +#define SBGEMM_OTCOPY sbgemm_otcopy -#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N -#define SHGEMM_INCOPY shgemm_oncopy -#define SHGEMM_ITCOPY shgemm_otcopy +#if SBGEMM_DEFAULT_UNROLL_M == SBGEMM_DEFAULT_UNROLL_N +#define SBGEMM_INCOPY sbgemm_oncopy +#define SBGEMM_ITCOPY sbgemm_otcopy #else -#define SHGEMM_INCOPY shgemm_incopy -#define SHGEMM_ITCOPY shgemm_itcopy +#define SBGEMM_INCOPY sbgemm_incopy +#define SBGEMM_ITCOPY sbgemm_itcopy #endif -#define SHGEMM_BETA shgemm_beta -#define SHGEMM_KERNEL shgemm_kernel +#define SBGEMM_BETA sbgemm_beta +#define SBGEMM_KERNEL sbgemm_kernel #else -#define SHDOT_K gotoblas -> shdot_k -#define SHSTOBF16_K gotoblas -> shstobf16_k -#define SHDTOBF16_K gotoblas -> shdtobf16_k +#define SBDOT_K gotoblas -> sbdot_k +#define SBSTOBF16_K gotoblas -> sbstobf16_k +#define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k -#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy -#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy -#define SHGEMM_INCOPY gotoblas -> shgemm_incopy -#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy -#define SHGEMM_BETA gotoblas -> shgemm_beta -#define SHGEMM_KERNEL gotoblas -> shgemm_kernel +#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy +#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy +#define SBGEMM_INCOPY gotoblas -> sbgemm_incopy +#define SBGEMM_ITCOPY gotoblas -> sbgemm_itcopy +#define 
SBGEMM_BETA gotoblas -> sbgemm_beta +#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel #endif -#define SHGEMM_NN shgemm_nn -#define SHGEMM_CN shgemm_tn -#define SHGEMM_TN shgemm_tn -#define SHGEMM_NC shgemm_nt -#define SHGEMM_NT shgemm_nt -#define SHGEMM_CC shgemm_tt -#define SHGEMM_CT shgemm_tt -#define SHGEMM_TC shgemm_tt -#define SHGEMM_TT shgemm_tt -#define SHGEMM_NR shgemm_nn -#define SHGEMM_TR shgemm_tn -#define SHGEMM_CR shgemm_tn -#define SHGEMM_RN shgemm_nn -#define SHGEMM_RT shgemm_nt -#define SHGEMM_RC shgemm_nt -#define SHGEMM_RR shgemm_nn +#define SBGEMM_NN sbgemm_nn +#define SBGEMM_CN sbgemm_tn +#define SBGEMM_TN sbgemm_tn +#define SBGEMM_NC sbgemm_nt +#define SBGEMM_NT sbgemm_nt +#define SBGEMM_CC sbgemm_tt +#define SBGEMM_CT sbgemm_tt +#define SBGEMM_TC sbgemm_tt +#define SBGEMM_TT sbgemm_tt +#define SBGEMM_NR sbgemm_nn +#define SBGEMM_TR sbgemm_tn +#define SBGEMM_CR sbgemm_tn +#define SBGEMM_RN sbgemm_nn +#define SBGEMM_RT sbgemm_nt +#define SBGEMM_RC sbgemm_nt +#define SBGEMM_RR sbgemm_nn -#define SHGEMM_THREAD_NN shgemm_thread_nn -#define SHGEMM_THREAD_CN shgemm_thread_tn -#define SHGEMM_THREAD_TN shgemm_thread_tn -#define SHGEMM_THREAD_NC shgemm_thread_nt -#define SHGEMM_THREAD_NT shgemm_thread_nt -#define SHGEMM_THREAD_CC shgemm_thread_tt -#define SHGEMM_THREAD_CT shgemm_thread_tt -#define SHGEMM_THREAD_TC shgemm_thread_tt -#define SHGEMM_THREAD_TT shgemm_thread_tt -#define SHGEMM_THREAD_NR shgemm_thread_nn -#define SHGEMM_THREAD_TR shgemm_thread_tn -#define SHGEMM_THREAD_CR shgemm_thread_tn -#define SHGEMM_THREAD_RN shgemm_thread_nn -#define SHGEMM_THREAD_RT shgemm_thread_nt -#define SHGEMM_THREAD_RC shgemm_thread_nt -#define SHGEMM_THREAD_RR shgemm_thread_nn +#define SBGEMM_THREAD_NN sbgemm_thread_nn +#define SBGEMM_THREAD_CN sbgemm_thread_tn +#define SBGEMM_THREAD_TN sbgemm_thread_tn +#define SBGEMM_THREAD_NC sbgemm_thread_nt +#define SBGEMM_THREAD_NT sbgemm_thread_nt +#define SBGEMM_THREAD_CC sbgemm_thread_tt +#define SBGEMM_THREAD_CT 
sbgemm_thread_tt +#define SBGEMM_THREAD_TC sbgemm_thread_tt +#define SBGEMM_THREAD_TT sbgemm_thread_tt +#define SBGEMM_THREAD_NR sbgemm_thread_nn +#define SBGEMM_THREAD_TR sbgemm_thread_tn +#define SBGEMM_THREAD_CR sbgemm_thread_tn +#define SBGEMM_THREAD_RN sbgemm_thread_nn +#define SBGEMM_THREAD_RT sbgemm_thread_nt +#define SBGEMM_THREAD_RC sbgemm_thread_nt +#define SBGEMM_THREAD_RR sbgemm_thread_nn #endif From 32733ded0460841708cde93d50fed735fd35ed5e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:52:45 +0200 Subject: [PATCH 543/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- lapack/potrf/potrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 008fcb8cc..29364cc05 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -382,7 +382,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; #elif defined(HALF) mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; + mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; From dc8a1afa6357662736fdf7d4eb73cf65bc7ccde1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:53:50 +0200 Subject: [PATCH 544/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/x86_64/KERNEL | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index d75196974..4f110f0bf 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -146,8 +146,8 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif -ifndef SHDOTKERNEL -SHDOTKERNEL = shdot.c +ifndef SBDOTKERNEL +SBDOTKERNEL = sbdot.c endif ifndef TOBF16KERNEL From 2061f7fdff640635467fcc790500c0a2028955db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:54:53 +0200 Subject: 
[PATCH 545/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/power/KERNEL.POWER10 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index d0cda7fb6..5cf1660a2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,16 +7,16 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = shgemm_kernel_power10.c -SHGEMMINCOPY = ../generic/gemm_ncopy_16.c -SHGEMMITCOPY = ../generic/gemm_tcopy_16.c -SHGEMMONCOPY = ../generic/gemm_ncopy_8.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = sbgemm_kernel_power10.c +SBGEMMINCOPY = ../generic/gemm_ncopy_16.c +SBGEMMITCOPY = ../generic/gemm_tcopy_16.c +SBGEMMONCOPY = ../generic/gemm_ncopy_8.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c From 756062afa5f3de899e6b8dea397c95c8bae848af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:56:17 +0200 Subject: [PATCH 546/593] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/generic/gemmkernel_2x2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index cc7bb8e48..bf1c3ae38 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,5 +1,5 @@ #include "common.h" -#if defined(HALF) && defined(HALFCONVERSION) 
+#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) static float bfloat16tof32 (bfloat16 f16) { From 3aecafad801b05d2606ba2cafa5deb6f6731e8c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:00:55 +0200 Subject: [PATCH 547/593] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/CMakeLists.txt | 28 ++++----- kernel/Makefile.L1 | 20 +++---- kernel/Makefile.L3 | 134 +++++++++++++++++++++--------------------- kernel/setparam-ref.c | 68 ++++++++++----------- 4 files changed, 125 insertions(+), 125 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 988b83338..6d8d759ad 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -41,8 +41,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) @@ -149,8 +149,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) @@ -208,13 +208,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) 
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE HALF) + foreach (float_type SINGLE DOUBLE BFLOAT16) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - if (NOT ${BUILD_HALF}) + if (${float_type} STREQUAL "BFLOAT16") + if (NOT ${BUILD_BFLOAT16}) continue () else () - set (float_char "SH") + set (float_char "SB") endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) @@ -254,8 +254,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) @@ -620,8 +620,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.LA if(NOT NO_LAPACK) foreach (float_type ${FLOAT_TYPES}) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (NOT DEFINED ${float_char}NEG_TCOPY) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") @@ -688,8 +688,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) 
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index c6576ee07..6fe6778d0 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -262,9 +262,9 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif -ifeq ($(BUILD_HALF),1) -ifndef SHDOTKERNEL -SHDOTKERNEL = ../x86_64/shdot.c +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBDOTKERNEL +SBDOTKERNEL = ../x86_64/sbdot.c endif ifndef TOBF16KERNEL @@ -530,11 +530,11 @@ XBLASOBJS += \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) -ifeq ($(BUILD_HALF),1) +ifeq ($(BUILD_BFLOAT16),1) SHBLASOBJS += \ - shdot_k$(TSUFFIX).$(SUFFIX) + sbdot_k$(TSUFFIX).$(SUFFIX) SHEXTOBJS += \ - shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX) + sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX) SHEXTOBJS += \ sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) endif @@ -757,12 +757,12 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ -ifeq ($(BUILD_HALF),1) -$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL) +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ -$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) +$(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ -$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) +$(KDIR)sbdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) $(CC) -c 
$(CFLAGS) -DDOUBLE -USINGLE $< -o $@ $(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index e03ed0fad..65d429012 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -80,24 +80,24 @@ SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c endif endif -ifeq ($(BUILD_HALF), 1) -ifndef SHGEMMKERNEL -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = ../generic/gemmkernel_2x2.c -SHGEMMINCOPY = ../generic/gemm_ncopy_2.c -SHGEMMITCOPY = ../generic/gemm_tcopy_2.c -SHGEMMONCOPY = ../generic/gemm_ncopy_2.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMMKERNEL +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = ../generic/gemmkernel_2x2.c +SBGEMMINCOPY = ../generic/gemm_ncopy_2.c +SBGEMMITCOPY = ../generic/gemm_tcopy_2.c +SBGEMMONCOPY = ../generic/gemm_ncopy_2.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) endif SHKERNELOBJS += \ - shgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ - $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) + sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ + $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" @@ -149,7 +149,7 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) -ifeq ($(BUILD_HALF),1) +ifeq ($(BUILD_BFLOAT16),1) SHBLASOBJS += $(SHKERNELOBJS) endif SBLASOBJS += $(SKERNELOBJS) @@ -159,8 +159,8 @@ CBLASOBJS += 
$(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" @@ -492,11 +492,11 @@ ZBLASOBJS += \ zgeadd_k$(TSUFFIX).$(SUFFIX) endif -ifeq ($(BUILD_HALF), 1) -SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) endif SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -524,9 +524,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ifeq ($(BUILD_HALF),1) -$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) @@ -548,35 +548,35 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF), 1) +ifeq ($(BUILD_BFLOAT16), 1) -$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)$(SHGEMMOTCOPYOBJ) 
: $(KERNELDIR)/$(SHGEMMOTCOPY) +$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s - m4 shgemmotcopy.s > shgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ - rm shgemmotcopy.s shgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s + m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ + rm sbgemmotcopy.s sbgemmotcopy_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) -$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) +$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s - m4 shgemmitcopy.s > shgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ - rm shgemmitcopy.s shgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s + m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ + rm sbgemmitcopy.s sbgemmitcopy_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -746,16 +746,16 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif -ifeq ($(BUILD_HALF), 1) +ifeq 
($(BUILD_BFLOAT16), 1) -$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s - m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s + m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -2375,9 +2375,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF),1) -$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) @@ -2396,19 +2396,19 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF), 1) -$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16), 1) +$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ 
-$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) -$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) +$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -2518,9 +2518,9 @@ endif endif -ifeq ($(BUILD_HALF), 1) -$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index dd49d8e4e..72fbf32bf 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,32 +53,32 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 0, 0, 0, - SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, -#ifdef SHGEMM_DEFAULT_UNROLL_MN - SHGEMM_DEFAULT_UNROLL_MN, + SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N, +#ifdef SBGEMM_DEFAULT_UNROLL_MN + SBGEMM_DEFAULT_UNROLL_MN, #else - MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), + MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif - shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, + sbstobf16_kTS, 
sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, - shgemm_kernelTS, shgemm_betaTS, -#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N - shgemm_incopyTS, shgemm_itcopyTS, + sbgemm_kernelTS, sbgemm_betaTS, +#if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N + sbgemm_incopyTS, sbgemm_itcopyTS, #else - shgemm_oncopyTS, shgemm_otcopyTS, + sbgemm_oncopyTS, sbgemm_otcopyTS, #endif - shgemm_oncopyTS, shgemm_otcopyTS, + sbgemm_oncopyTS, sbgemm_otcopyTS, strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -830,8 +830,8 @@ gotoblas_t TABLE_NAME = { #if (ARCH_ARM64) static void init_parameter(void) { -#if (BUILD_HALF) - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#if (BUILD_BFLOAT16) + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; @@ -846,8 +846,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif -#if (BUILD_HALF) - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#if (BUILD_BFLOAT16) + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif #if BUILD_SINGLE == 1 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; @@ -862,8 +862,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #endif -#if (BUILD_HALF) - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#if (BUILD_BFLOAT16) + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif #if BUILD_SINGLE == 1 TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; @@ -936,16 +936,16 @@ static void init_parameter(void) { #if (ARCH_POWER) static void init_parameter(void) { -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = 
SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -953,8 +953,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -965,16 +965,16 @@ static void init_parameter(void) { #if (ARCH_ZARCH) static void init_parameter(void) { -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -982,8 +982,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -1124,10 +1124,10 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = 
SBGEMM_DEFAULT_P; + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; From 052f31bc3c72abbe8b166d6a6aca1096769d6e16 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:02:16 +0200 Subject: [PATCH 548/593] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- interface/Makefile | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 71393aaba..a35d53270 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,10 +46,10 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) -ifeq ($(BUILD_HALF),1) -SHBLAS1OBJS = shdot.$(SUFFIX) -SHBLAS3OBJS = shgemm.$(SUFFIX) -SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif DBLAS1OBJS = \ @@ -282,10 +282,10 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) -ifeq ($(BUILD_HALF),1) -CSHBLAS1OBJS = cblas_shdot.$(SUFFIX) -CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) -CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +CBHBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CBHBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CBHEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -381,8 +381,8 @@ override CFLAGS += -I. 
SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) -SHBLAS1OBJS += $(CSHBLAS1OBJS) -SHBLAS3OBJS += $(CSHBLAS3OBJS) +SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) @@ -393,13 +393,13 @@ ZBLAS1OBJS += $(CZBLAS1OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) -SHEXTOBJS += $(CSHEXTOBJS) +SBEXTOBJS += $(CSBEXTOBJS) CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS) +SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -506,7 +506,7 @@ ifneq ($(BUILD_COMPLEX16),1) ZBLASOBJS= endif -FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(info FUNCOBJS = {[$(FUNCOBJS)]} ) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -772,8 +772,8 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) -ifeq ($(BUILD_HALF),1) -shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c +ifeq ($(BUILD_BFLOAT16),1) +sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -c $< -o $(@F) shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) @@ -1278,8 +1278,8 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) -ifeq ($(BUILD_HALF),1) -shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1523,8 +1523,8 @@ cblas_sdsdot.$(SUFFIX) 
cblas_sdsdot.$(PSUFFIX) : sdsdot.c cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -ifeq ($(BUILD_HALF),1) -cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) @@ -1857,8 +1857,8 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) -ifeq ($(BUILD_HALF),1) -cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif From ae1ab5bfdf866add26f25cce5c261705178e428e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:03:21 +0200 Subject: [PATCH 549/593] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- exports/Makefile | 18 +++++++++--------- exports/gensymbol | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 960150c86..3f1ffba11 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,8 +30,8 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif -ifndef BUILD_HALF -BUILD_HALF = 0 +ifndef BUILD_BFLOAT16 +BUILD_BFLOAT16 = 0 endif ifndef BUILD_SINGLE BUILD_SINGLE = 0 @@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol win2k 
$(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) ifeq ($(OSNAME), Darwin) INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib @@ -258,23 +258,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) 
$(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. 
rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 736fdc2cd..9ff8e10b1 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @cblasobjs = (lsame, xerbla); -@halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); +@halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, From 85154c2e18fbdcb8b45457dc2d8d51b8b69e71ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:05:05 +0200 Subject: [PATCH 550/593] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- driver/others/blas_server.c | 2 +- driver/others/blas_server_omp.c | 2 +- driver/others/blas_server_win32.c | 2 +- driver/others/parameter.c | 22 +++++++++++----------- 4 files changed, 14 
insertions(+), 14 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index acfaed75d..30e0cc6c2 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -227,7 +227,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index bfbe3a647..d546553c1 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index d2cc91757..4624085d5 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -112,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 5d312fa87..35fc0a253 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -62,10 +62,10 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; BLASLONG gemm_offset_b = GEMM_OFFSET_B; #endif -#if SHGEMM_P == shgemm_p -BLASLONG shgemm_p = DEFAULT_GEMM_P; +#if SBGEMM_P == sbgemm_p +BLASLONG sbgemm_p = 
DEFAULT_GEMM_P; #else -BLASLONG shgemm_p = SHGEMM_P; +BLASLONG sbgemm_p = SBGEMM_P; #endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; @@ -88,10 +88,10 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; BLASLONG zgemm_p = ZGEMM_P; #endif -#if SHGEMM_Q == shgemm_q -BLASLONG shgemm_q = DEFAULT_GEMM_Q; +#if SBGEMM_Q == sbgemm_q +BLASLONG sbgemm_q = DEFAULT_GEMM_Q; #else -BLASLONG shgemm_q = SHGEMM_Q; +BLASLONG sbgemm_q = SBGEMM_Q; #endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; @@ -114,10 +114,10 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; BLASLONG zgemm_q = ZGEMM_Q; #endif -#if SHGEMM_R == shgemm_r -BLASLONG shgemm_r = DEFAULT_GEMM_R; +#if SBGEMM_R == sbgemm_r +BLASLONG sbgemm_r = DEFAULT_GEMM_R; #else -BLASLONG shgemm_r = SHGEMM_R; +BLASLONG sbgemm_r = SBGEMM_R; #endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; @@ -615,7 +615,7 @@ void blas_set_parameter(void){ size = BITMASK(cpuid3, 16, 0xff); - shgemm_p = 192 * (size + 1); + sbgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); @@ -629,7 +629,7 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif - shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; From 006c7f6671895d36153e8a93cd6fd8c084aadfe0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:06:06 +0200 Subject: [PATCH 551/593] Change "HALF" and "sh" to 
"BFLOAT16" and "sb" --- driver/level3/Makefile | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index e3aa30256..b4f1e2b26 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,8 +19,8 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SHBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) endif SBLASOBJS += \ @@ -207,8 +207,8 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) @@ -343,16 +343,16 @@ endif all :: -shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tt.$(SUFFIX) : gemm.c level3.c 
../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h @@ -550,16 +550,16 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) -shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h @@ -2735,16 +2735,16 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) -shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) 
-shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h @@ -2943,16 +2943,16 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) -shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h From e3a29f6b58ffdf656ff9b05438f235646b59586a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:07:37 +0200 Subject: [PATCH 552/593] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- cmake/kernel.cmake | 32 ++++++++++++++++---------------- cmake/prebuild.cmake | 8 ++++---- cmake/system.cmake | 27 ++++++++++++++++----------- cmake/utils.cmake | 10 +++++----- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 79eeaae6f..7d7f5ffda 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,7 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) -if 
(BUILD_HALF) +if (BUILD_BFLOAT16) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -126,7 +126,7 @@ if (BUILD_HALF) set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL ../arm/copy.c) - set(SHDOTKERNEL ../x86_64/shdot.c) + set(SBDOTKERNEL ../x86_64/sbdot.c) set(SHROTKERNEL ../arm/rot.c) set(SHSCALKERNEL ../arm/scal.c) set(SHNRM2KERNEL ../arm/nrm2.c) @@ -183,9 +183,9 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) -if (BUILD_HALF) - set(SHGEMVNKERNEL ../arm/gemv_n.c) - set(SHGEMVTKERNEL ../arm/gemv_t.c) +if (BUILD_BFLOAT16) + set(SBGEMVNKERNEL ../arm/gemv_n.c) + set(SBGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () @@ -195,18 +195,18 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -if (BUILD_HALF) +if (BUILD_BFLOAT16) set(SHGEADD_KERNEL ../generic/geadd.c) - set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SHGEMM_BETA ../generic/gemm_beta.c) - set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SHGEMMINCOPYOBJ shgemm_incopy.o) - set(SHGEMMITCOPYOBJ shgemm_itcopy.o) - set(SHGEMMONCOPYOBJ shgemm_oncopy.o) - set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) + set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + set(SBGEMM_BETA ../generic/gemm_beta.c) + set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + set(SBGEMMINCOPYOBJ sbgemm_incopy.o) + set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) + set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () endmacro () diff --git a/cmake/prebuild.cmake 
b/cmake/prebuild.cmake index 3b2a9d6a2..f40304c09 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -16,8 +16,8 @@ # HAVE_SSE2 # HAVE_SSE3 # MAKE -# SHGEMM_UNROLL_M -# SHGEMM_UNROLL_N +# SBGEMM_UNROLL_M +# SBGEMM_UNROLL_N # SGEMM_UNROLL_M # SGEMM_UNROLL_N # DGEMM_UNROLL_M @@ -471,8 +471,8 @@ endif () set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) endif() - set(SHGEMM_UNROLL_M 8) - set(SHGEMM_UNROLL_N 4) + set(SBGEMM_UNROLL_M 8) + set(SBGEMM_UNROLL_N 4) # Or should this actually be NUM_CORES? if (${NUM_THREADS} GREATER 0) diff --git a/cmake/system.cmake b/cmake/system.cmake index a504530fb..b34d4a9a5 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,9 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() endif() if (DEFINED TARGET) @@ -323,7 +326,13 @@ else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () endif () - +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) +if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") +endif() +endif() +endif() if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -401,20 +410,16 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO set (BUILD_COMPLEX16 ON) endif() if (BUILD_SINGLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") endif() if (BUILD_DOUBLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") endif() if (BUILD_COMPLEX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1") + 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") endif() if (BUILD_COMPLEX16) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") @@ -588,8 +593,8 @@ endif () #export FUNCTION_PROFILE #export TARGET_CORE # -#export SHGEMM_UNROLL_M -#export SHGEMM_UNROLL_N +#export SBGEMM_UNROLL_M +#export SBGEMM_UNROLL_N #export SGEMM_UNROLL_M #export SGEMM_UNROLL_N #export DGEMM_UNROLL_M diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 1c21e776e..8f25c1b27 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -211,7 +211,7 @@ function(GenerateNamedObjects sources_in) if (complex_only) list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "DOUBLE") - list(REMOVE_ITEM float_list "HALF") + list(REMOVE_ITEM float_list "BFLOAT16") elseif (real_only) list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX") @@ -225,8 +225,8 @@ function(GenerateNamedObjects sources_in) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "sh") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "sb") endif () endif () @@ -262,8 +262,8 @@ function(GenerateNamedObjects sources_in) if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () - if (${float_type} STREQUAL "HALF") - list(APPEND obj_defines "HALF") + if (${float_type} STREQUAL "BFLOAT16") + list(APPEND obj_defines "BFLOAT16") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") From 7ae9e8960e85a1b0c0d163a1c5980b9e8cacb71e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:08:29 +0200 Subject: [PATCH 553/593] Change "HALF" and "sh" to 
"BFLOAT16" and "sb" --- benchmark/Makefile | 12 ++++++------ benchmark/gemm.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 2f70ceaf3..f2f3b354a 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,8 +49,8 @@ else GOTO_LAPACK_TARGETS= endif -ifeq ($(BUILD_HALF),1) -GOTO_HALF_TARGETS=shgemm.goto +ifeq ($(BUILD_BFLOAT16),1) +GOTO_HALF_TARGETS=sbgemm.goto else GOTO_HALF_TARGETS= endif @@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### -ifeq ($(BUILD_HALF),1) -shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm endif @@ -2927,8 +2927,8 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -ifeq ($(BUILD_HALF),1) -shgemm.$(SUFFIX) : gemm.c +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ endif diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 84dd292c5..8cd14bbed 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef DOUBLE #define GEMM BLASFUNC(dgemm) #elif defined(HALF) -#define GEMM BLASFUNC(shgemm) +#define GEMM BLASFUNC(sbgemm) #else #define GEMM BLASFUNC(sgemm) #endif From 2c552f1074743f968bbd53ac0d7353e15064ddbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:11:31 +0200 Subject: [PATCH 554/593] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- CMakeLists.txt | 36 ++++++++++++++++-------------------- Makefile.rule | 34 +++++++++++++++++++++++++--------- Makefile.system | 10 +++++----- Makefile.tail | 4 ++-- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f43e0e0fc..a6cf2ef83 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() -option(BUILD_SINGLE "Single precision" OFF) -option(BUILD_DOUBLE "Double precision" OFF) -option(BUILD_COMPLEX "Single precision" OFF) -option(BUILD_COMPLEX16 "Single precision" OFF) +option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. 
# Avoids conflicts with other BLAS libraries, especially when using @@ -91,13 +89,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () -if (NOT DEFINED BUILD_HALF) - set (BUILD_HALF false) +if (NOT DEFINED BUILD_BFLOAT16) + set (BUILD_BFLOAT16 false) endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_HALF true) +# set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -110,33 +108,28 @@ endif() set(FLOAT_TYPES "") if (BUILD_SINGLE) - message(STATUS "Building Songle Precision") - list(APPEND FLOAT_TYPES "SINGLE") - # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + message(STATUS "Building Single Precision") + list(APPEND FLOAT_TYPES "SINGLE") # defines nothing endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") - list(APPEND FLOAT_TYPES "DOUBLE") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") + list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") - list(APPEND FLOAT_TYPES "COMPLEX") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") -endif () + list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX +endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") - list(APPEND FLOAT_TYPES "ZCOMPLEX") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") + list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_HALF) +if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") + list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() 
add_subdirectory(lapack-netlib/TESTING) + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES diff --git a/Makefile.rule b/Makefile.rule index 09dfb0881..67d183936 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,17 +272,33 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 +# +# use this to run only the less memory-hungry GEMV test +# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support -# BUILD_HALF = 1 -# -# Select if you need to build only select types -# BUILD_SINGLE = 1 -# BUILD_DOUBLE = 1 -# BUILD_COMPLEX = 1 -# BUILD_COMPLEX16 = 1 -# -# +# BUILD_BFLOAT16 = 1 + + +# Set the thread number threshold beyond which the job array for the threaded level3 BLAS +# will be allocated on the heap rather than the stack. (This array alone requires +# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu +# counts, but obviously it is not the only item that ends up on the stack. +# The default value of 32 ensures that the overall requirement is compatible +# with the default 1MB stacksize imposed by having the Java VM loaded without use +# of its -Xss parameter. +# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible +# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java +# VM e.g. 
in Octave or with the java-based libhdfs in numpy or scipy code +# BLAS3_MEM_ALLOC_THRESHOLD = 160 + + + +# the below is not yet configurable, use cmake if you need to build only select types +BUILD_SINGLE = 1 +BUILD_DOUBLE = 1 +BUILD_COMPLEX = 1 +BUILD_COMPLEX16 = 1 # End of user configuration # diff --git a/Makefile.system b/Makefile.system index eb6e14a98..461f7370b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif -ifeq ($(BUILD_HALF), 1) -CCOMMON_OPT += -DBUILD_HALF +ifeq ($(BUILD_BFLOAT16), 1) +CCOMMON_OPT += -DBUILD_BFLOAT16 endif ifeq ($(BUILD_SINGLE), 1) CCOMMON_OPT += -DBUILD_SINGLE=1 @@ -1521,10 +1521,10 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 -export BUILD_HALF +export BUILD_BFLOAT16 -export SHGEMM_UNROLL_M -export SHGEMM_UNROLL_N +export SBGEMM_UNROLL_M +export SBGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 641082450..b14689fc7 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) 
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) From 629c497b6c34d63c5df133cb1ca74d1189a28652 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:27:11 +0200 Subject: [PATCH 555/593] common_sh.h renamed to common_sb.h --- common_macro.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_macro.h b/common_macro.h index 605d74ada..510813b0f 100644 --- a/common_macro.h +++ b/common_macro.h @@ -39,7 +39,7 @@ #ifndef COMMON_MACRO #define COMMON_MACRO -#include "common_sh.h" +#include "common_sb.h" #include "common_s.h" #include "common_d.h" #include "common_q.h" From bb74dd29db44b9d57770e8f27c7815aecc675611 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:42:05 +0200 Subject: [PATCH 556/593] Restore -msse3 --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index a504530fb..78544f661 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,9 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() endif() if (DEFINED TARGET) From 0ed1f07660b1836e530d5d9b0a140a36a8bca39d Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 12 Oct 2020 19:48:53 +0800 Subject: [PATCH 557/593] Optimize the performance of sum by using universal intrinsics --- kernel/arm/sum.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index 7b78ec61a..d4b3fbc83 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* trivial copy of asum.c with the ABS() removed * **************************************************************************************/ - #include "common.h" +#include "../simd/intrin.h" #include FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - + if (n <= 0 || inc_x <= 0) + return (sumf); n *= inc_x; - while(i < n) + if (inc_x == 1) + { +#if V_SIMD + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + v_f32 vsum0 = v_zero_f32(); + v_f32 vsum1 = v_zero_f32(); + v_f32 vsum2 = v_zero_f32(); + v_f32 vsum3 = v_zero_f32(); + while (i < unrollx4) + { + vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); + i += vstep * 4; + } + vsum0 = v_add_f32( + v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); + while (i < unrollx) + { + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + i += vstep; + } + sumf = v_sum_f32(vsum0); +#else + int n1 = n & -4; + for (; i < n1; i += 4) + { + sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3]; + } +#endif + } + while (i < n) { sumf += x[i]; i += inc_x; } - return(sumf); + return (sumf); } - - From cb839575ed71b959f1dbd32d82c8789ea0f54bce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 14:44:33 +0200 Subject: [PATCH 558/593] Convert the prototypes of the unimplemented BFLOAT16 functions to the new naming scheme --- common_param.h | 146 ++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/common_param.h b/common_param.h index 361523081..b50e4ff80 100644 --- a/common_param.h +++ b/common_param.h @@ -51,39 +51,39 @@ typedef struct { int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; - void (*shstobf16_k) (BLASLONG, float *, 
BLASLONG, bfloat16 *, BLASLONG); - void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); + void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); + void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); - float (*shamax_k) (BLASLONG, float *, BLASLONG); - float (*shamin_k) (BLASLONG, float *, BLASLONG); - float (*shmax_k) (BLASLONG, float *, BLASLONG); - float (*shmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); - - float (*shnrm2_k) (BLASLONG, float *, BLASLONG); - float (*shasum_k) (BLASLONG, float *, BLASLONG); - float (*shsum_k) (BLASLONG, float *, BLASLONG); - int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sbamax_k) (BLASLONG, float *, BLASLONG); + float (*sbamin_k) (BLASLONG, float *, BLASLONG); + float (*sbmax_k) (BLASLONG, float *, BLASLONG); + float (*sbmin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); + + float (*sbnrm2_k) (BLASLONG, float *, BLASLONG); + float (*sbasum_k) (BLASLONG, float *, BLASLONG); + float (*sbsum_k) (BLASLONG, float *, BLASLONG); + int (*sbcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - int (*shaxpy_k) 
(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); @@ -93,57 +93,57 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sbgemm_otcopy 
)(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_RT)(BLASLONG, 
BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int 
(*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + int (*sbtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*sbtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int 
(*sbtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*sbtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*sbtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float *); + int (*sbtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sbsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif From 403eb513a0616020e7238b531bad739f6baef43a Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Mon, 12 Oct 2020 18:15:01 +0300 Subject: [PATCH 559/593] use emms instead, add WIN guards --- kernel/x86_64/amax.S | 4 +++- kernel/x86_64/asum.S | 5 ++++- kernel/x86_64/dot.S | 5 ++++- kernel/x86_64/iamax.S | 5 ++++- kernel/x86_64/izamax.S | 5 ++++- kernel/x86_64/nrm2.S | 5 ++++- kernel/x86_64/qconjg.S | 5 ++++- kernel/x86_64/qdot.S | 4 +++- kernel/x86_64/qgemm_kernel_2x2.S | 4 +++- kernel/x86_64/qgemv_n.S | 4 +++- kernel/x86_64/qgemv_t.S | 5 ++++- kernel/x86_64/qtrsm_kernel_LN_2x2.S | 4 +++- kernel/x86_64/qtrsm_kernel_LT_2x2.S | 4 +++- kernel/x86_64/qtrsm_kernel_RT_2x2.S | 5 +++-- kernel/x86_64/sum.S | 4 +++- kernel/x86_64/xdot.S | 4 +++- kernel/x86_64/xgemm3m_kernel_2x2.S | 4 +++- kernel/x86_64/xgemm_kernel_1x1.S | 4 +++- kernel/x86_64/xgemv_n.S | 4 +++- kernel/x86_64/xgemv_t.S | 4 +++- kernel/x86_64/xtrsm_kernel_LT_1x1.S | 4 +++- kernel/x86_64/zamax.S | 4 +++- kernel/x86_64/zasum.S | 4 +++- kernel/x86_64/zdot.S | 4 ++-- kernel/x86_64/znrm2.S | 4 +++- kernel/x86_64/zscal.S | 4 +++- kernel/x86_64/zsum.S | 4 +++- 27 files changed, 87 insertions(+), 29 deletions(-) diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S index 257147dfb..1498bb226 100644 --- a/kernel/x86_64/amax.S +++ b/kernel/x86_64/amax.S @@ -55,7 +55,9 
@@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S index 24f57dd11..a2cbfd480 100644 --- a/kernel/x86_64/asum.S +++ b/kernel/x86_64/asum.S @@ -50,7 +50,10 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/dot.S b/kernel/x86_64/dot.S index 2319885f1..a11d25e5d 100644 --- a/kernel/x86_64/dot.S +++ b/kernel/x86_64/dot.S @@ -49,7 +49,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S index 0c666d623..00999e25f 100644 --- a/kernel/x86_64/iamax.S +++ b/kernel/x86_64/iamax.S @@ -59,7 +59,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S index e450c2cd2..b24b2e692 100644 --- a/kernel/x86_64/izamax.S +++ b/kernel/x86_64/izamax.S @@ -59,7 +59,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index 548e3b744..b79ac2adb 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -50,7 +50,10 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/qconjg.S b/kernel/x86_64/qconjg.S index bab541831..823a15a84 100644 --- a/kernel/x86_64/qconjg.S +++ b/kernel/x86_64/qconjg.S @@ -41,7 +41,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif fldz FLD 1 * SIZE(ARG1) diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S index e7d31360b..2243b6b6d 100644 --- a/kernel/x86_64/qdot.S +++ b/kernel/x86_64/qdot.S @@ -58,7 +58,9 @@ PROLOGUE - fninit +#ifdef WINDOWS_ABI + emms +#endif pushl %edi pushl %esi diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S index 7b5e7707d..c11f3a91d 100644 --- 
a/kernel/x86_64/qgemm_kernel_2x2.S +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -74,7 +74,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S index 1b65b03f0..c9d345cb1 100644 --- a/kernel/x86_64/qgemv_n.S +++ b/kernel/x86_64/qgemv_n.S @@ -76,7 +76,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S index 00188c257..32372ff15 100644 --- a/kernel/x86_64/qgemv_t.S +++ b/kernel/x86_64/qgemv_t.S @@ -75,7 +75,10 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S index 030eff893..0a545faf8 100644 --- a/kernel/x86_64/qtrsm_kernel_LN_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -74,7 +74,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S index d86972c72..16063fbcd 100644 --- a/kernel/x86_64/qtrsm_kernel_LT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -74,7 +74,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S index 2826a62c9..4c94ac02c 100644 --- a/kernel/x86_64/qtrsm_kernel_RT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -74,8 +74,9 @@ PROLOGUE PROFCODE - fninit - +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S index 3d5fa7cc2..9f2cdc1ec 100644 --- a/kernel/x86_64/sum.S +++ b/kernel/x86_64/sum.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M diff 
--git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S index ec89b799c..c4b473494 100644 --- a/kernel/x86_64/xdot.S +++ b/kernel/x86_64/xdot.S @@ -59,7 +59,9 @@ PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif #define N %ebx diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S index e8da78d82..1d0b23c40 100644 --- a/kernel/x86_64/xgemm3m_kernel_2x2.S +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -78,7 +78,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S index f04ab07f5..ee67d8d43 100644 --- a/kernel/x86_64/xgemm_kernel_1x1.S +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -97,7 +97,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S index 7d28c118a..b66f28d58 100644 --- a/kernel/x86_64/xgemv_n.S +++ b/kernel/x86_64/xgemv_n.S @@ -76,7 +76,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S index e79676088..d6d37010d 100644 --- a/kernel/x86_64/xgemv_t.S +++ b/kernel/x86_64/xgemv_t.S @@ -75,7 +75,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S index 54d41932f..875206363 100644 --- a/kernel/x86_64/xtrsm_kernel_LT_1x1.S +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -90,7 +90,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S index bfd836193..5cb2f6019 100644 --- a/kernel/x86_64/zamax.S +++ b/kernel/x86_64/zamax.S @@ -55,7 +55,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif salq $ZBASE_SHIFT, INCX 
diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S index 9ea2aadc0..3460fcea3 100644 --- a/kernel/x86_64/zasum.S +++ b/kernel/x86_64/zasum.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S index f7df919b7..87c08d7c8 100644 --- a/kernel/x86_64/zdot.S +++ b/kernel/x86_64/zdot.S @@ -54,9 +54,9 @@ PROLOGUE PROFCODE - fninit - #ifdef WINDOWS_ABI + emms + movq 40(%rsp), INCY #endif diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index cb02a5a9f..0d2aa3480 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M diff --git a/kernel/x86_64/zscal.S b/kernel/x86_64/zscal.S index 08c0831a4..5ed4c4576 100644 --- a/kernel/x86_64/zscal.S +++ b/kernel/x86_64/zscal.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S index 1c3904839..aa02637e4 100644 --- a/kernel/x86_64/zsum.S +++ b/kernel/x86_64/zsum.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M From 8d2df7d066dbe6988502b352a4594cc78f9d89c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 00:14:29 +0200 Subject: [PATCH 560/593] Revert special handling of Windows xNRM2 and enable C+intrinsics kernel for SSUM/DSUM --- kernel/x86_64/KERNEL | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index d75196974..cb98fd89a 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -259,12 +259,8 @@ SNRM2KERNEL = nrm2_sse.S endif ifndef DNRM2KERNEL -ifeq ($(OSNAME),WINNT) -DNRM2KERNEL = ../arm/nrm2.c -else DNRM2KERNEL = nrm2.S endif -endif ifndef QNRM2KERNEL QNRM2KERNEL = nrm2.S @@ -275,12 +271,8 @@ CNRM2KERNEL = znrm2_sse.S endif ifndef ZNRM2KERNEL -ifeq 
($(OSNAME),WINNT) -ZNRM2KERNEL = ../arm/znrm2.c -else ZNRM2KERNEL = znrm2.S endif -endif ifndef XNRM2KERNEL XNRM2KERNEL = znrm2.S @@ -486,3 +478,6 @@ XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S XGEMM3MKERNEL = xgemm3m_kernel_2x2.S + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c From e05af6575ee9fa12f2afea8c2c20e80b1529ba84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:05:04 +0200 Subject: [PATCH 561/593] Fix some overlooked "SHBLAS" entries --- Makefile.tail | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile.tail b/Makefile.tail index b14689fc7..54ba649db 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -1,18 +1,18 @@ -SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) -SHEXTOBJS_P = $(SHEXTOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SBEXTOBJS_P = $(SBEXTOBJS:.$(SUFFIX)=.$(PSUFFIX)) COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) +BLASOBJS = $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -24,23 +24,23 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SHBLASOBJS) $(SHBLASOBJS_P) : override 
CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX +$(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX +$(SBEXTOBJS) $(SBEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX -$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) -$(SHEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ From 2ae87856039e78cf736fb22efb9bc8020697cbe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:07:50 +0200 Subject: [PATCH 562/593] Add a POWER9 build with BFLOAT16 enabled --- .travis.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.travis.yml b/.travis.yml index 4bfdf485c..3f917ce72 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,6 +104,23 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux + arch: ppc64le + dist: bionic + compiler: gcc + before_script: + - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y + - sudo 
apt-get update + - sudo apt-get install gcc-9 gfortran-9 -y + script: + - make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux compiler: gcc addons: From 84949754a0d62fe70beb8d36285328eb446a5dcd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:11:36 +0200 Subject: [PATCH 563/593] Fix bfloat16 conditional --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 89eeb197d..a3ef99b59 100644 --- a/common.h +++ b/common.h @@ -257,7 +257,7 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#ifndef BFLOAT16 +#ifndef bfloat16 #include typedef uint16_t bfloat16; #define BFLOAT16CONVERSION 1 From 1e7eb7b7a91838ccba39b9183fb0a5a814c09b7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:17:15 +0200 Subject: [PATCH 564/593] Fix typos in currently unused sections --- interface/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index a35d53270..1905827f9 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -283,9 +283,9 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) -CBHBLAS1OBJS = cblas_sbdot.$(SUFFIX) -CBHBLAS3OBJS = cblas_sbgemm.$(SUFFIX) -CBHEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) +CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -535,19 +535,19 @@ endif clean :: @rm -f functable.h -level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) 
+level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ aux : $(CBAUXOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -$(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CSBBLASOBJS) $(CSBBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) $(CBAUXOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c From 6999086a2bc4be5796a5d091f491af3b32970a71 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 10:32:19 +0200 Subject: [PATCH 566/593] whitelist SANDYBRIDGE for SSE3 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 290fb2afe..6745a79dd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,7 +41,7 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) override CFLAGS += -msse3 endif ifeq ($(TARGET_CORE), COOPERLAKE) From 0eacbca85fa30657f749f7818e081952b9fb49f4 Mon Sep 17 00:00:00 2001 From: 
Martin Kroeker Date: Tue, 13 Oct 2020 11:42:39 +0200 Subject: [PATCH 567/593] Add Haswell and Zen to temporary sse3 whitelist --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 6745a79dd..e567485a6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,7 +41,7 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) override CFLAGS += -msse3 endif ifeq ($(TARGET_CORE), COOPERLAKE) From fecedc9c699527dfdb208bde4634374eca1ebbce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 11:55:41 +0200 Subject: [PATCH 568/593] Add -mssse3 --- kernel/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index e567485a6..c95c15f56 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,6 +8,9 @@ include $(TOPDIR)/Makefile.system ifdef HAVE_SSE3 CFLAGS += -msse3 endif +ifdef HAVE_SSSE3 +CFLAGS += -mssse3 +endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -42,7 +45,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse3 + override CFLAGS += -msse3 -mssse3 endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From 5f60a32cacc4e168202c7f8729d97b11e861e0c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 11:57:04 +0200 Subject: [PATCH 569/593] 
Add -mssse3 if supported by the hardware --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index e793a1c2f..f055828a9 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -12,6 +12,10 @@ ifdef HAVE_SSE3 ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 +ifdef HAVE_SSSE3 +CCOMMON_OPT += -mssse3 +FCOMMON_OPT += -mssse3 +endif endif endif From 9e3cff5cf2cf841e9a7a73b70b4465c87ac45643 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 14:41:25 +0200 Subject: [PATCH 570/593] Expressly enable -mavx2 on Zen, SkylakeX and Cooperlake as well --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f055828a9..9e75dc91c 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -64,7 +64,7 @@ endif endif endif -ifeq ($(CORE), HASWELL) +ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) From 137ae618dba8ddf2ee899cb2a7854b34f1100ed3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 15:02:17 +0200 Subject: [PATCH 571/593] Fix typo --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 9e75dc91c..8a3fc4eae 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -64,7 +64,7 @@ endif endif endif -ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE) +ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) From b5d30b390dd8d6aed4617c94e5b4fd94425c96d1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 13 Oct 2020 11:00:22 -0500 Subject: [PATCH 572/593] Fix build issues with bfloat16 This patch fixes compilation errors due to recent renaming from SH to SB with BUILD_BFLOAT16. 
--- cblas.h | 4 ++-- common_interface.h | 4 ++-- common_level1.h | 4 ++-- common_macro.h | 4 ++-- driver/level3/Makefile | 4 ++-- exports/gensymbol | 4 ++-- interface/Makefile | 8 ++++---- kernel/Makefile.L1 | 6 +++--- kernel/Makefile.L3 | 6 +++--- test/Makefile | 8 +++----- 10 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cblas.h b/cblas.h index 4fc6f8681..bf310bed2 100644 --- a/cblas.h +++ b/cblas.h @@ -384,9 +384,9 @@ void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint /*** BFLOAT16 and INT8 extensions ***/ /* convert float array to BFLOAT16 array by rounding */ -void cblas_shstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); /* convert double array to BFLOAT16 array by rounding */ -void cblas_shdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +void cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); /* convert BFLOAT16 array to float array */ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout); /* convert BFLOAT16 array to double array */ diff --git a/common_interface.h b/common_interface.h index bee09e894..032877fe1 100644 --- a/common_interface.h +++ b/common_interface.h @@ -55,8 +55,8 @@ double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); -void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); -void 
BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(sbstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(sbdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *); diff --git a/common_level1.h b/common_level1.h index 7b17962c4..d2ed47e56 100644 --- a/common_level1.h +++ b/common_level1.h @@ -48,8 +48,8 @@ double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); -void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); -void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); +void sbstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); +void sbdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); void sbf16tos_k (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); void dbf16tod_k (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 510813b0f..54deed57c 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,9 +646,9 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SHDTOBF16_K +#define D_TO_BF16_K SBDTOBF16_K #define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SHSTOBF16_K +#define S_TO_BF16_K SBSTOBF16_K #define S_BF16_TO_K SBF16TOS_K #define AMAX_K SAMAX_K diff --git a/driver/level3/Makefile b/driver/level3/Makefile index b4f1e2b26..b528dfa2d 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -20,7 +20,7 @@ USE_GEMM3M = 1 endif ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) +SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) endif SBLASOBJS += \ @@ -208,7 +208,7 
@@ COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) +SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) diff --git a/exports/gensymbol b/exports/gensymbol index 9ff8e10b1..8482ecb7e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @cblasobjs = (lsame, xerbla); -@halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); +@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/interface/Makefile b/interface/Makefile index 1905827f9..6b247b49f 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -775,9 +775,9 @@ dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c ifeq ($(BUILD_BFLOAT16),1) sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -c $< -o $(@F) -shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c +sbstobf16.$(SUFFIX) sbstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -shdtobf16.$(SUFFIX) shdtobf16.$(PSUFFIX) : 
tobf16.c +sbdtobf16.$(SUFFIX) sbdtobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) @@ -1526,9 +1526,9 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c ifeq ($(BUILD_BFLOAT16),1) cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c +cblas_sbstobf16.$(SUFFIX) cblas_sbstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -cblas_shdtobf16.$(SUFFIX) cblas_shdtobf16.$(PSUFFIX) : tobf16.c +cblas_sbdtobf16.$(SUFFIX) cblas_sbdtobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 6fe6778d0..7ad94118a 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -531,11 +531,11 @@ XBLASOBJS += \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += \ +SBBLASOBJS += \ sbdot_k$(TSUFFIX).$(SUFFIX) -SHEXTOBJS += \ +SBEXTOBJS += \ sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX) -SHEXTOBJS += \ +SBEXTOBJS += \ sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 65d429012..2ba593c2e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -94,7 +94,7 @@ SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -SHKERNELOBJS += \ +SBKERNELOBJS += \ sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) @@ -150,7 +150,7 @@ XKERNELOBJS += \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) ifeq 
($(BUILD_BFLOAT16),1) -SHBLASOBJS += $(SHKERNELOBJS) +SBBLASOBJS += $(SBKERNELOBJS) endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) @@ -160,7 +160,7 @@ ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) +SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" diff --git a/test/Makefile b/test/Makefile index 06fb7fe86..212343389 100644 --- a/test/Makefile +++ b/test/Makefile @@ -214,11 +214,9 @@ endif -#ifeq ($(BUILD_BFLOAT16),1) -#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 -#else -#level3 : sblat3 dblat3 cblat3 zblat3 -#endif +ifeq ($(BUILD_BFLOAT16),1) +level3 : test_sbgemm +endif ifndef CROSS rm -f ?BLAT3.SUMM From 437b7fe261f7026f0fcc517e0e3015cad29bb579 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 19:55:14 +0200 Subject: [PATCH 573/593] sh prefix renamed to sb --- ctest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 8d301c239..8aed9eb85 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -12,7 +12,7 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) string(TOLOWER ${float_char_upper} float_char) - if (${float_char} STREQUAL "h") + if (${float_char} STREQUAL "b") continue() endif() #level1 From bc5c7f95781adcea95b60e553ad785d8e25cead8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 19:56:09 +0200 Subject: [PATCH 574/593] Cleanup --- test/Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/Makefile b/test/Makefile index 212343389..06fb7fe86 100644 --- a/test/Makefile +++ b/test/Makefile @@ -214,9 +214,11 @@ endif -ifeq ($(BUILD_BFLOAT16),1) -level3 : test_sbgemm -endif +#ifeq ($(BUILD_BFLOAT16),1) +#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 
+#else +#level3 : sblat3 dblat3 cblat3 zblat3 +#endif ifndef CROSS rm -f ?BLAT3.SUMM From 4bb73c01713c43f28a3ab464399fb716516ffc70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 20:07:19 +0200 Subject: [PATCH 575/593] Rename "HALF" type to "BFLOAT16" --- lapack/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 778e6f8fa..fd4e57048 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) -list (REMOVE_ITEM FLOAT_TYPES "HALF") +list (REMOVE_ITEM FLOAT_TYPES "BFLOAT16") set(LAPACK_SOURCES potrf/potrf_U_single.c From 0826d68f93ef1fed021c426911c464728d60ccb3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 13 Oct 2020 16:05:10 -0500 Subject: [PATCH 576/593] POWER10: Change the packing format for bfloat16 As the new MMA instructions need the inputs in 4x2 order for bfloat16, changing the format in copy/packing code. This avoids permute instructions in the gemm kernel inner loop. 
--- kernel/power/KERNEL.POWER10 | 8 +- kernel/power/sbgemm_kernel_power10.c | 477 ++++++++---------- kernel/power/sbgemm_ncopy_16_power10.c | 437 ++++++++++++++++ kernel/power/sbgemm_ncopy_8_power10.c | 383 ++++++++++++++ kernel/power/sbgemm_tcopy_16_power10.c | 244 +++++++++ kernel/power/sbgemm_tcopy_8_power10.c | 659 +++++++++++++++++++++++++ 6 files changed, 1923 insertions(+), 285 deletions(-) create mode 100644 kernel/power/sbgemm_ncopy_16_power10.c create mode 100644 kernel/power/sbgemm_ncopy_8_power10.c create mode 100644 kernel/power/sbgemm_tcopy_16_power10.c create mode 100644 kernel/power/sbgemm_tcopy_8_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 5cf1660a2..031d96581 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -9,10 +9,10 @@ else SBGEMM_BETA = ../generic/gemm_beta.c SBGEMMKERNEL = sbgemm_kernel_power10.c -SBGEMMINCOPY = ../generic/gemm_ncopy_16.c -SBGEMMITCOPY = ../generic/gemm_tcopy_16.c -SBGEMMONCOPY = ../generic/gemm_ncopy_8.c -SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SBGEMMINCOPY = sbgemm_ncopy_16_power10.c +SBGEMMITCOPY = sbgemm_tcopy_16_power10.c +SBGEMMONCOPY = sbgemm_ncopy_8_power10.c +SBGEMMOTCOPY = sbgemm_tcopy_8_power10.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 46d82598a..d15586703 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -137,15 +137,13 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) { - BLASLONG N = n; BLASLONG i1; v4sf_t valpha = { alpha, alpha, alpha, alpha }; vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - N = n >> 3; /* Loop for n >= 8. 
*/ - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; @@ -153,9 +151,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; /* Loop for m >= 16. */ - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 4); j++) { IFLOAT *BO = B; v4sf_t *rowC; @@ -167,20 +164,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowB = (vec_t *) & (BO[l << 4]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); - vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); - vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); - vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); - vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); - MMA (&acc0, rowB_h, rowA_h); - MMA (&acc1, rowB_l, rowA_h); - MMA (&acc2, rowB_h, rowA_l); - MMA (&acc3, rowB_l, rowA_l); - MMA (&acc4, rowB_h, rowA2_h); - MMA (&acc5, rowB_l, rowA2_h); - MMA (&acc6, rowB_h, rowA2_l); - MMA (&acc7, rowB_l, rowA2_l); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[1], rowA[0]); + MMA (&acc2, rowB[0], rowA[1]); + MMA (&acc3, rowB[1], rowA[1]); + MMA (&acc4, rowB[0], rowA[2]); + MMA (&acc5, rowB[1], rowA[2]); + MMA (&acc6, rowB[0], rowA[3]); + MMA (&acc7, rowB[1], rowA[3]); } if (k % 2 == 1) { @@ -216,9 +207,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 4); BO += (k << 3); } - i = (m & 15) >> 3; - /* Loop for m >= 8. 
*/ - for (j = 0; j < i; j++) + if (m & 8) { IFLOAT *BO = B; v4sf_t *rowC; @@ -230,14 +219,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 4]); vec_t *rowB = (vec_t *) & (BO[l << 4]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); - vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); - vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); - MMA (&acc0, rowB_h, rowA_h); - MMA (&acc1, rowB_l, rowA_h); - MMA (&acc2, rowB_h, rowA_l); - MMA (&acc3, rowB_l, rowA_l); + + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[1], rowA[0]); + MMA (&acc2, rowB[0], rowA[1]); + MMA (&acc3, rowB[1], rowA[1]); } if (k % 2 == 1) { @@ -262,9 +248,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 3); BO += (k << 3); } - i = (m & 7) >> 2; - /* Loop for m >= 4. */ - for (j = 0; j < i; j++) + if (m & 4) { IFLOAT *BO = B; v4sf_t *rowC; @@ -277,9 +261,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowB = (vec_t *) & (BO[l << 4]); - vec_t rowA_mrg = MERGE_ROW (rowA[0]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[1], rowA[0]); } if (k % 2 == 1) { @@ -297,9 +280,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 2); BO += (k << 3); } - i = (m & 3) >> 1; - /* Loop for m >= 2. 
*/ - for (j = 0; j < i; j++) + if (m & 2) { IFLOAT *BO = B; v2sf_t *rowC; @@ -316,8 +297,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[l << 4]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, rowB[0], (vec_t) rowA); + MMA (&acc1, rowB[1], (vec_t) rowA); } if (k % 2 == 1) { @@ -334,64 +315,50 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 1); BO += (k << 3); } - i = (m & 1) >> 0; - /* Loop for m = 1. */ - for (j = 0; j < i; j++) + if (m & 1) { IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 } - , t1 = - { - 0, 0, 0, 0}; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowA = - { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), - BF16TOF32 (AO[l]) - }; - v4sf_t rowB = - { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), - BF16TOF32 (BO[(l << 3) + 2]), - BF16TOF32 (BO[(l << 3) + 3]) - }; - v4sf_t rowB1 = - { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), - BF16TOF32 (BO[(l << 3) + 6]), - BF16TOF32 (BO[(l << 3) + 7]) - }; - t += rowA * rowB; - t1 += rowA * rowB1; + vector short rowA = + { AO[(l << 1) + 0], AO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, rowB[0], (vec_t) rowA); + MMA (&acc1, rowB[1], (vec_t) rowA); } - t = t * valpha; - t1 = t1 * valpha; - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; - CO[2 * ldc] += t[2]; - CO[3 * ldc] += t[3]; - CO[4 * ldc] += t1[0]; - CO[5 * ldc] += t1[1]; - CO[6 * ldc] += t1[2]; - CO[7 * ldc] += t1[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 3)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + 
MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); CO += 1; AO += k; BO += (k << 3); } B += k << 3; } - N = (n & 7) >> 2; - /* Loop for n >= 4. */ - for (i1 = 0; i1 < N; i1++) + if (n & 4) { - BLASLONG i, j; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; C += ldc << 2; AO = A; - i = m >> 5; /* Loop for m >= 32. */ - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 5); j++) { IFLOAT *BO = B; IFLOAT *A1 = AO + (16 * k); @@ -405,15 +372,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowA1 = (vec_t *) & (A1[l << 5]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[0], rowA[1]); + MMA (&acc2, rowB[0], rowA[2]); + MMA (&acc3, rowB[0], rowA[3]); + MMA (&acc4, rowB[0], rowA1[0]); + MMA (&acc5, rowB[0], rowA1[1]); + MMA (&acc6, rowB[0], rowA1[2]); + MMA (&acc7, rowB[0], rowA1[3]); } if (k % 2 == 1) { @@ -448,9 +414,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 5; BO += k << 2; } - i = (m & 31) >> 4; - /* Loop for m >= 16. 
*/ - for (j = 0; j < i; j++) + if (m & 16) { IFLOAT *BO = B; v4sf_t *rowC; @@ -462,11 +426,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[0], rowA[1]); + MMA (&acc2, rowB[0], rowA[2]); + MMA (&acc3, rowB[0], rowA[3]); } if (k % 2 == 1) { @@ -490,9 +453,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 4; BO += k << 2; } - i = (m & 15) >> 3; - /* Loop for m >= 8. */ - for (j = 0; j < i; j++) + if (m & 8) { IFLOAT *BO = B; v4sf_t *rowC; @@ -505,9 +466,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 4]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[0], rowA[1]); } if (k % 2 == 1) { @@ -525,9 +485,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 3; BO += k << 2; } - i = (m & 7) >> 2; - /* Loop for m >= 4. */ - for (j = 0; j < i; j++) + if (m & 4) { IFLOAT *BO = B; v4sf_t *rowC; @@ -539,7 +497,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + MMA (&acc0, rowB[0], rowA[0]); } if (k % 2 == 1) { @@ -555,9 +513,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 2; BO += k << 2; } - i = (m & 3) >> 1; - /* Loop for m >= 2. 
*/ - for (j = 0; j < i; j++) + if (m & 2) { IFLOAT *BO = B; v2sf_t *rowC; @@ -573,7 +529,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[l << 3]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + MMA (&acc0, rowB[0], (vec_t) rowA); } if (k % 2 == 1) { @@ -588,31 +544,32 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 1; BO += k << 2; } - i = (m & 1) >> 0; - /* Loop for m = 1. */ - for (j = 0; j < i; j++) + if (m & 1) { IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) { - v4sf_t rowA = - { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), - BF16TOF32 (AO[l]) - }; - v4sf_t rowB = - { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), - BF16TOF32 (BO[(l << 2) + 2]), - BF16TOF32 (BO[(l << 2) + 3]) + vector short rowA = + { AO[(l << 1) + 0], AO[(l << 1) + 1], 0, + 0, 0, 0, 0 }; - t += rowA * rowB; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, rowB[0], (vec_t) rowA); } - t = t * valpha; - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; - CO[2 * ldc] += t[2]; - CO[3 * ldc] += t[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 2]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); AO += k; BO += (k << 2); CO += 1; @@ -620,19 +577,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, B += k << 2; } - N = (n & 3) >> 1; - /* Loop for n >= 2. */ - for (i1 = 0; i1 < N; i1++) + if (n & 2) { - BLASLONG i, j; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; C += ldc << 1; AO = A; - i = m >> 5; /* Loop for m >= 32. 
*/ - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 5); j++) { IFLOAT *BO = B; v4sf_t *rowC; @@ -650,14 +604,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, }; vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowA1 = (vec_t *) & (A1[l << 5]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); + MMA (&acc2, (vec_t) rowB, rowA[2]); + MMA (&acc3, (vec_t) rowB, rowA[3]); + MMA (&acc4, (vec_t) rowB, rowA1[0]); + MMA (&acc5, (vec_t) rowB, rowA1[1]); + MMA (&acc6, (vec_t) rowB, rowA1[2]); + MMA (&acc7, (vec_t) rowB, rowA1[3]); } if (k % 2 == 1) { @@ -688,9 +642,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 5; BO += k << 1; } - i = (m & 31) >> 4; - /* Loop for m >= 16. 
*/ - for (j = 0; j < i; j++) + if (m & 16) { IFLOAT *BO = B; v4sf_t *rowC; @@ -706,10 +658,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 5]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); + MMA (&acc2, (vec_t) rowB, rowA[2]); + MMA (&acc3, (vec_t) rowB, rowA[3]); } if (k % 2 == 1) { @@ -730,9 +682,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 4; BO += k << 1; } - i = (m & 15) >> 3; - /* Loop for m >= 8. */ - for (j = 0; j < i; j++) + if (m & 8) { IFLOAT *BO = B; v4sf_t *rowC; @@ -749,8 +699,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 4]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); } if (k % 2 == 1) { @@ -767,9 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 3; BO += k << 1; } - i = (m & 7) >> 2; - /* Loop for m >= 4. */ - for (j = 0; j < i; j++) + if (m & 4) { IFLOAT *BO = B; v4sf_t *rowC; @@ -785,7 +733,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + MMA (&acc0, (vec_t) rowB, rowA[0]); } if (k % 2 == 1) { @@ -800,9 +748,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 2; BO += k << 1; } - i = (m & 3) >> 1; - /* Loop for m >= 2. 
*/ - for (j = 0; j < i; j++) + if (m & 2) { IFLOAT *BO = B; BLASLONG l = 0; @@ -828,9 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 1; BO += k << 1; } - i = (m & 1) >> 0; - /* Loop for m = 1. */ - for (j = 0; j < i; j++) + if (m & 1) { IFLOAT *BO = B; BLASLONG l = 0; @@ -852,153 +796,126 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, } B += k << 1; } - N = (n & 1) >> 0; - /* Loop for n = 1. */ - for (i1 = 0; i1 < N; i1++) + if (n & 1) { - BLASLONG i; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; C += ldc; AO = A; - i = m; /* Loop for m >= 16. */ - while (i >= 16) + for (j = 0; j < (m >> 4); j++) { IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - v4sf_t t1 = { 0, 0, 0, 0 }; - v4sf_t t2 = { 0, 0, 0, 0 }; - v4sf_t t3 = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowB = - { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), - BF16TOF32 (BO[l]) - }; - v4sf_t rowA = - { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), - BF16TOF32 (AO[(l << 4) + 2]), - BF16TOF32 (AO[(l << 4) + 3]) - }; - v4sf_t rowA1 = - { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), - BF16TOF32 (AO[(l << 4) + 6]), - BF16TOF32 (AO[(l << 4) + 7]) - }; - v4sf_t rowA2 = - { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), - BF16TOF32 (AO[(l << 4) + 10]), - BF16TOF32 (AO[(l << 4) + 11]) - }; - v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), - BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), - BF16TOF32 (AO[(l << 4) + 15]) - }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; + vector short rowB = + { BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); + MMA (&acc2, (vec_t) rowB, rowA[2]); + MMA 
(&acc3, (vec_t) rowB, rowA[3]); } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t[2]; - CO[3] += t[3]; - CO[4] += t1[0]; - CO[5] += t1[1]; - CO[6] += t1[2]; - CO[7] += t1[3]; - CO[8] += t2[0]; - CO[9] += t2[1]; - CO[10] += t2[2]; - CO[11] += t2[3]; - CO[12] += t3[0]; - CO[13] += t3[1]; - CO[14] += t3[2]; - CO[15] += t3[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 4)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + rowC = (v4sf_t *) &CO[0]; + __builtin_mma_disassemble_acc ((void *)result, &acc0); + rowC[0] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + rowC[1] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + rowC[2] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + rowC[3] += result[0] * alpha; AO += k << 4; BO += k; CO += 16; - i -= 16; } /* Loop for m >= 8. 
*/ - while (i >= 8) + if (m & 8) { IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - v4sf_t t1 = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowB = - { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), - BF16TOF32 (BO[l]) - }; - v4sf_t rowA = - { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), - BF16TOF32 (AO[(l << 3) + 2]), - BF16TOF32 (AO[(l << 3) + 3]) - }; - v4sf_t rowA1 = - { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), - BF16TOF32 (AO[(l << 3) + 6]), - BF16TOF32 (AO[(l << 3) + 7]) - }; - t += rowA * rowB; - t1 += rowA1 * rowB; + vector short rowB = + { BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); } - t = t * valpha; - t1 = t1 * valpha; - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t[2]; - CO[3] += t[3]; - CO[4] += t1[0]; - CO[5] += t1[1]; - CO[6] += t1[2]; - CO[7] += t1[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 3)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + rowC = (v4sf_t *) &CO[0]; + __builtin_mma_disassemble_acc ((void *)result, &acc0); + rowC[0] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + rowC[1] += result[0] * alpha; AO += k << 3; BO += k; CO += 8; - i -= 8; } /* Loop for m >= 4. 
*/ - while (i >= 4) + if (m & 4) { IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowB = - { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), - BF16TOF32 (BO[l]) - }; - v4sf_t rowA = - { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), - BF16TOF32 (AO[(l << 2) + 2]), - BF16TOF32 (AO[(l << 2) + 3]) - }; - t += rowA * rowB; + vector short rowB = + { BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, rowA[0]); } - t = t * valpha; - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t[2]; - CO[3] += t[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + rowC = (v4sf_t *) &CO[0]; + __builtin_mma_disassemble_acc ((void *)result, &acc0); + rowC[0] += result[0] * alpha; AO += k << 2; BO += k; CO += 4; - i -= 4; } /* Loop for m >= 2. */ - while (i >= 2) + if (m & 2) { IFLOAT *BO = B; BLASLONG l = 0; @@ -1018,10 +935,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 1; BO += k; CO += 2; - i -= 2; } /* Loop for m = 1. */ - while (i >= 1) + if (m & 1) { IFLOAT *BO = B; BLASLONG l = 0; @@ -1034,7 +950,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, BO += k; CO[0] += t * alpha; CO += 1; - i -= 1; } B += k; diff --git a/kernel/power/sbgemm_ncopy_16_power10.c b/kernel/power/sbgemm_ncopy_16_power10.c new file mode 100644 index 000000000..c6b633011 --- /dev/null +++ b/kernel/power/sbgemm_ncopy_16_power10.c @@ -0,0 +1,437 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + ctemp17 = *(aoffset9 + 0); + ctemp18 = *(aoffset9 + 1); + ctemp19 = *(aoffset10 + 0); + ctemp20 = 
*(aoffset10 + 1); + + ctemp21 = *(aoffset11 + 0); + ctemp22 = *(aoffset11 + 1); + ctemp23 = *(aoffset12 + 0); + ctemp24 = *(aoffset12 + 1); + + ctemp25 = *(aoffset13 + 0); + ctemp26 = *(aoffset13 + 1); + ctemp27 = *(aoffset14 + 0); + ctemp28 = *(aoffset14 + 1); + + ctemp29 = *(aoffset15 + 0); + ctemp30 = *(aoffset15 + 1); + ctemp31 = *(aoffset16 + 0); + ctemp32 = *(aoffset16 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + aoffset9 += 2; + aoffset10 += 2; + aoffset11 += 2; + aoffset12 += 2; + aoffset13 += 2; + aoffset14 += 2; + aoffset15 += 2; + aoffset16 += 2; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = 
*(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + + boffset += 16; + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 
2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + 
aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/power/sbgemm_ncopy_8_power10.c b/kernel/power/sbgemm_ncopy_8_power10.c new file mode 100644 index 000000000..0e4a680fb --- /dev/null +++ b/kernel/power/sbgemm_ncopy_8_power10.c @@ -0,0 +1,383 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16))); +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset; + vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04; + vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08; + vec_bf16 vtemp09, vtemp10, vtemp11, vtemp12; + vector char mask = + { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + vector char mask1 = + { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 }; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17; + IFLOAT ctemp25; + IFLOAT ctemp33; + IFLOAT ctemp41; + IFLOAT ctemp49; + IFLOAT ctemp57; + + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + 
aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset2); + vtemp03 = *(vec_bf16 *)(aoffset3); + vtemp04 = *(vec_bf16 *)(aoffset4); + vtemp05 = *(vec_bf16 *)(aoffset5); + vtemp06 = *(vec_bf16 *)(aoffset6); + vtemp07 = *(vec_bf16 *)(aoffset7); + vtemp08 = *(vec_bf16 *)(aoffset8); + + vtemp09 = vec_perm(vtemp01, vtemp02, mask); + vtemp10 = vec_perm(vtemp03, vtemp04, mask); + vtemp11 = vec_perm(vtemp05, vtemp06, mask); + vtemp12 = vec_perm(vtemp07, vtemp08, mask); + + *(vec_bf16 *)(boffset + 0) = vec_xxpermdi(vtemp09, vtemp10, 0); + *(vec_bf16 *)(boffset + 8) = vec_xxpermdi(vtemp11, vtemp12, 0); + *(vec_bf16 *)(boffset + 16) = vec_xxpermdi(vtemp09, vtemp10, 3); + *(vec_bf16 *)(boffset + 24) = vec_xxpermdi(vtemp11, vtemp12, 3); + + vtemp09 = vec_perm(vtemp01, vtemp02, mask1); + vtemp10 = vec_perm(vtemp03, vtemp04, mask1); + vtemp11 = vec_perm(vtemp05, vtemp06, mask1); + vtemp12 = vec_perm(vtemp07, vtemp08, mask1); + + *(vec_bf16 *)(boffset + 32) = vec_xxpermdi(vtemp09, vtemp10, 0); + *(vec_bf16 *)(boffset + 40) = vec_xxpermdi(vtemp11, vtemp12, 0); + *(vec_bf16 *)(boffset + 48) = vec_xxpermdi(vtemp09, vtemp10, 3); + *(vec_bf16 *)(boffset + 56) = vec_xxpermdi(vtemp11, vtemp12, 3); + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i >= 2){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset1 + 1); + ctemp17 = *(aoffset2 + 0); + ctemp25 = *(aoffset2 + 1); + ctemp33 = *(aoffset3 + 0); + ctemp41 = *(aoffset3 + 1); + ctemp49 = *(aoffset4 + 0); + ctemp57 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) 
= ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + ctemp01 = *(aoffset5 + 0); + ctemp09 = *(aoffset5 + 1); + ctemp17 = *(aoffset6 + 0); + ctemp25 = *(aoffset6 + 1); + ctemp33 = *(aoffset7 + 0); + ctemp41 = *(aoffset7 + 1); + ctemp49 = *(aoffset8 + 0); + ctemp57 = *(aoffset8 + 1); + *(boffset + 8) = ctemp01; + *(boffset + 9) = ctemp09; + *(boffset + 10) = ctemp17; + *(boffset + 11) = ctemp25; + *(boffset + 12) = ctemp33; + *(boffset + 13) = ctemp41; + *(boffset + 14) = ctemp49; + *(boffset + 15) = ctemp57; + + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + i -= 2; + }while(i > 1); + } + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = 
*(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp06; + + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp10; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp07; + *(boffset + 11) = ctemp08; + + *(boffset + 12) = ctemp11; + *(boffset + 13) = ctemp12; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i >= 2){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset1 + 1); + ctemp17 = *(aoffset2 + 0); + ctemp25 = *(aoffset2 + 1); + ctemp33 = *(aoffset3 + 0); + ctemp41 = *(aoffset3 + 1); + ctemp49 = *(aoffset4 + 0); + ctemp57 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset += 8; + i -= 2; + }while(i > 1); + } + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 
2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/power/sbgemm_tcopy_16_power10.c b/kernel/power/sbgemm_tcopy_16_power10.c new file mode 100644 index 000000000..120c5ab7c --- /dev/null +++ b/kernel/power/sbgemm_tcopy_16_power10.c @@ -0,0 +1,244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16))); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; + + vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset1+8); + vtemp03 = *(vec_bf16 *)(aoffset2); + vtemp04 = *(vec_bf16 *)(aoffset2+8); + *(vec_bf16 *)(boffset + 0) = vec_mergeh(vtemp01, vtemp03); + *(vec_bf16 *)(boffset + 8) = vec_mergel(vtemp01, vtemp03); + *(vec_bf16 *)(boffset + 16) = vec_mergeh(vtemp02, vtemp04); + *(vec_bf16 *)(boffset + 24) = vec_mergel(vtemp02, vtemp04); + aoffset1 += 2 * lda; + 
aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset1+8); + *(vec_bf16 *)(boffset + 0) = vtemp01; + *(vec_bf16 *)(boffset + 8) = vtemp02; + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp03 = *(vec_bf16 *)(aoffset2); + *(vec_bf16 *)(boffset + 0) = vec_mergeh(vtemp01, vtemp03); + *(vec_bf16 *)(boffset + 8) = vec_mergel(vtemp01, vtemp03); + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + vtemp01 = *(vec_bf16 *)(aoffset1); + *(vec_bf16 *)(boffset + 0) = vtemp01; + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp06; + *(boffset + 4) = ctemp03; + *(boffset + 5) = ctemp07; + *(boffset + 6) = ctemp04; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + 
ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/power/sbgemm_tcopy_8_power10.c b/kernel/power/sbgemm_tcopy_8_power10.c new file mode 100644 index 000000000..aceb0c9d8 --- /dev/null +++ b/kernel/power/sbgemm_tcopy_8_power10.c @@ -0,0 +1,659 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16))); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04; + vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset2); + vtemp03 = *(vec_bf16 *)(aoffset3); + vtemp04 = *(vec_bf16 *)(aoffset4); + vtemp05 = *(vec_bf16 *)(aoffset5); + vtemp06 = *(vec_bf16 *)(aoffset6); + vtemp07 = *(vec_bf16 *)(aoffset7); + vtemp08 = *(vec_bf16 *)(aoffset8); + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + + *(vec_bf16 *)(boffset1 + 0) = vec_mergeh(vtemp01, vtemp02); + *(vec_bf16 *)(boffset1 + 8) 
= vec_mergel(vtemp01, vtemp02); + *(vec_bf16 *)(boffset1 + 16) = vec_mergeh(vtemp03, vtemp04); + *(vec_bf16 *)(boffset1 + 24) = vec_mergel(vtemp03, vtemp04); + *(vec_bf16 *)(boffset1 + 32) = vec_mergeh(vtemp05, vtemp06); + *(vec_bf16 *)(boffset1 + 40) = vec_mergel(vtemp05, vtemp06); + *(vec_bf16 *)(boffset1 + 48) = vec_mergeh(vtemp07, vtemp08); + *(vec_bf16 *)(boffset1 + 56) = vec_mergel(vtemp07, vtemp08); + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + aoffset5 += 4; + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + aoffset6 += 4; + + ctemp25 = *(aoffset7 + 0); + ctemp26 = *(aoffset7 + 1); + ctemp27 = *(aoffset7 + 2); + ctemp28 = *(aoffset7 + 3); + aoffset7 += 4; + + ctemp29 = *(aoffset8 + 0); + ctemp30 = *(aoffset8 + 1); + ctemp31 = *(aoffset8 + 2); + ctemp32 = *(aoffset8 + 3); + aoffset8 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp05; + *(boffset2 + 2) = ctemp02; + *(boffset2 + 3) = ctemp06; + *(boffset2 + 4) = ctemp03; + *(boffset2 + 5) = ctemp07; + *(boffset2 + 6) = ctemp04; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp13; + *(boffset2 + 10) = ctemp10; + *(boffset2 + 11) = ctemp14; + *(boffset2 + 12) = ctemp11; + *(boffset2 + 13) = ctemp15; + *(boffset2 + 14) = ctemp12; 
+ *(boffset2 + 15) = ctemp16; + + *(boffset2 + 16) = ctemp17; + *(boffset2 + 17) = ctemp21; + *(boffset2 + 18) = ctemp18; + *(boffset2 + 19) = ctemp22; + *(boffset2 + 20) = ctemp19; + *(boffset2 + 21) = ctemp23; + *(boffset2 + 22) = ctemp20; + *(boffset2 + 23) = ctemp24; + + *(boffset2 + 24) = ctemp25; + *(boffset2 + 25) = ctemp29; + *(boffset2 + 26) = ctemp26; + *(boffset2 + 27) = ctemp30; + *(boffset2 + 28) = ctemp27; + *(boffset2 + 29) = ctemp31; + *(boffset2 + 30) = ctemp28; + *(boffset2 + 31) = ctemp32; + + boffset2 += 32; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + aoffset5 += 2; + + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + aoffset6 += 2; + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + aoffset7 += 2; + + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + aoffset8 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + *(boffset3 + 8) = ctemp09; + *(boffset3 + 9) = ctemp10; + *(boffset3 + 10) = ctemp11; + *(boffset3 + 11) = ctemp12; + *(boffset3 + 12) = ctemp13; + *(boffset3 + 13) = ctemp14; + *(boffset3 + 14) = ctemp15; + *(boffset3 + 15) = ctemp16; + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = 
*(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp09; + *(boffset1 + 2) = ctemp02; + *(boffset1 + 3) = ctemp10; + *(boffset1 + 4) = ctemp03; + *(boffset1 + 5) = ctemp11; + *(boffset1 + 6) = ctemp04; + *(boffset1 + 7) = ctemp12; + + *(boffset1 + 8) = ctemp05; + *(boffset1 + 9) = ctemp13; + *(boffset1 + 10) = ctemp06; + *(boffset1 + 11) = ctemp14; + *(boffset1 + 12) = ctemp07; + *(boffset1 + 13) = ctemp15; + *(boffset1 + 14) = ctemp08; + 
*(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp25; + *(boffset1 + 18) = ctemp18; + *(boffset1 + 19) = ctemp26; + *(boffset1 + 20) = ctemp19; + *(boffset1 + 21) = ctemp27; + *(boffset1 + 22) = ctemp20; + *(boffset1 + 23) = ctemp28; + + *(boffset1 + 24) = ctemp21; + *(boffset1 + 25) = ctemp29; + *(boffset1 + 26) = ctemp22; + *(boffset1 + 27) = ctemp30; + *(boffset1 + 28) = ctemp23; + *(boffset1 + 29) = ctemp31; + *(boffset1 + 30) = ctemp24; + *(boffset1 + 31) = ctemp32; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp05; + *(boffset2 + 2) = ctemp02; + *(boffset2 + 3) = ctemp06; + *(boffset2 + 4) = ctemp03; + *(boffset2 + 5) = ctemp07; + *(boffset2 + 6) = ctemp04; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp13; + *(boffset2 + 10) = ctemp10; + *(boffset2 + 11) = ctemp14; + *(boffset2 + 12) = ctemp11; + *(boffset2 + 13) = ctemp15; + *(boffset2 + 14) = ctemp12; + *(boffset2 + 15) = ctemp16; + boffset2 += 16; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + 
*(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp09; + *(boffset1 + 2) = ctemp02; + *(boffset1 + 3) = ctemp10; + *(boffset1 + 4) = ctemp03; + *(boffset1 + 5) = ctemp11; + *(boffset1 + 6) = ctemp04; + *(boffset1 + 7) = ctemp12; + + *(boffset1 + 8) = ctemp05; + *(boffset1 + 9) = ctemp13; + *(boffset1 + 10) = ctemp06; + *(boffset1 + 11) = ctemp14; + *(boffset1 + 12) = ctemp07; + *(boffset1 + 13) = ctemp15; + *(boffset1 + 14) = ctemp08; + *(boffset1 + 15) = ctemp16; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 
+= 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp05; + *(boffset2 + 2) = ctemp02; + *(boffset2 + 3) = ctemp06; + *(boffset2 + 4) = ctemp03; + *(boffset2 + 5) = ctemp07; + *(boffset2 + 6) = ctemp04; + *(boffset2 + 7) = ctemp08; + boffset2 += 8; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + // aoffset += lda; + + boffset1 = boffset; + // boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + // boffset2 += 4; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + // boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 
++; + } + } + + return 0; +} From 75e3a92df6b4100c05d034c85a6076678b5cc6af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 01:01:58 +0200 Subject: [PATCH 578/593] Add express -mavx and -msse options (and fix a stray = for cooperlake) --- cmake/cc.cmake | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index c490dd9ab..9f5cc1bf7 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -109,10 +109,25 @@ if (${CORE} STREQUAL "COOPERLAKE") if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() endif () endif () endif () + +if (NOT DYNAMIC_ARCH) + if (HAVE_AVX2) + set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") + endif () + if (HAVE_AVX) + set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") + endif () + if (HAVE_SSE3) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse3") + endif () + if (HAVE_SSSE3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mssse3") + endif () +endif() From c1f4f5d4e790ec92effe8f0984e85706553f4b3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 01:08:50 +0200 Subject: [PATCH 579/593] Replace Makefile with simplified version again --- test/Makefile | 138 ++++++++++++++------------------------------------ 1 file changed, 39 insertions(+), 99 deletions(-) diff --git a/test/Makefile b/test/Makefile index 06fb7fe86..eb3bc3447 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,40 +7,22 @@ all :: else all :: level1 level2 level3 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) -level1: sblat1 dblat1 cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) -level1: dblat1 cblat1 zblat1 -endif 
-ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) -level1: sblat1 cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) -level1: cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) -level1: cblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) -level1: zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) -level1: sblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) -level1: sblat1 dblat1 zblat1 + +ifeq ($(BUILD_SINGLE),1) +S1=sblat1 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) -level1: sblat1 dblat1 +ifeq ($(BUILD_DOUBLE),1) +D1=dblat1 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) -level1: sblat1 +ifeq ($(BUILD_COMPLEX),1) +C1=cblat1 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) -level1: dblat1 +ifeq ($(BUILD_COMPLEX16),1) +Z1=zblat1 endif +level1: $(S1) $(D1) $(C1) $(Z1) + ifndef CROSS ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 @@ -85,41 +67,22 @@ endif endif endif -#level2: sblat2 dblat2 cblat2 zblat2 -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) -level2: sblat2 dblat2 cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) -level2: dblat2 cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) -level2: sblat2 cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) -level2: cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) -level2: cblat2 -endif -ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) -level2: zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) -level2: sblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) -level2: sblat2 dblat2 zblat2 +ifeq ($(BUILD_SINGLE),1) +S2=sblat2 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) -level2: sblat2 dblat2 +ifeq ($(BUILD_DOUBLE),1) +D2=dblat2 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) -level2: sblat2 +ifeq ($(BUILD_COMPLEX),1) +C2=cblat2 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) -level2: dblat2 +ifeq ($(BUILD_COMPLEX16),1) +Z2=zblat2 endif +level2: $(S2) $(D2) $(C2) $(Z2) + + ifndef CROSS rm -f ?BLAT2.SUMM ifeq ($(BUILD_SINGLE),1) @@ -178,53 +141,30 @@ endif endif endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) -level3: sblat3 dblat3 cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) -level3: dblat3 cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) -level3: sblat3 cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) -level3: cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) -level3: cblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) -level3: zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) -level3: sblat3 zblat3 +ifeq ($(BUILD_BFLOAT16),1) +B3= test_sbgemm endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) -level3: sblat3 dblat3 zblat3 +ifeq ($(BUILD_SINGLE),1) +S3=sblat3 endif -ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) -level3: sblat3 dblat3 +ifeq ($(BUILD_DOUBLE),1) +D3=dblat3 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) -level3: sblat3 +ifeq ($(BUILD_COMPLEX),1) +C3=cblat3 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) -level3: dblat3 +ifeq ($(BUILD_COMPLEX16),1) +Z3=zblat3 endif +level3: $(B3) $(S3) $(D3) $(C3) $(Z3) -#ifeq ($(BUILD_BFLOAT16),1) -#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 -#else -#level3 : sblat3 dblat3 cblat3 zblat3 -#endif - ifndef CROSS rm -f ?BLAT3.SUMM ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM - @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @@ -246,8 +186,8 @@ ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_BFLOAT16),1) - OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM - @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @@ -267,8 +207,8 @@ ifeq ($(BUILD_COMPLEX16),1) endif else ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM - @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat From c9c3ae07afaf7833f14025164360da1efe3eb4df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 18:10:45 +0200 Subject: [PATCH 580/593] Add double precision 
operations --- kernel/simd/intrin_sse.h | 48 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 9de7e1b27..7449a5a0b 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -3,25 +3,59 @@ /*************************** * Data Type ***************************/ +#ifdef DOUBLE +typedef __m128d v_f32; +#else typedef __m128 v_f32; +#endif + #define v_nlanes_f32 4 /*************************** * Arithmetic ***************************/ +#ifdef DOUBLE +#define v_add_f32 _mm_add_pd +#define v_mul_f32 _mm_mul_pd +#else #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps +#endif #ifdef HAVE_FMA3 // multiply and add, a*b + c - #define v_muladd_f32 _mm_fmadd_ps +#ifdef DOUBLE + #define v_muladd_f32 _mm_fmadd_pd +#else + #define v_muladd_f32 _mm_fmadd_ps +#endif #elif defined(HAVE_FMA4) // multiply and add, a*b + c - #define v_muladd_f32 _mm_macc_ps + #ifdef DOUBLE + #define v_muladd_f32 _mm_macc_pd + #else + #define v_muladd_f32 _mm_macc_ps + #endif #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 +// Horizontal add: Calculates the sum of all vector elements. +#ifdef DOUBLE +BLAS_FINLINE double v_sum_f32(__m128d a) +{ +#ifdef HAVE_SSE3 + __m128d sum_halves = _mm_hadd_pd(a, a); + return _mm_cvtsd_f64(_mm_hadd_pd(sum_halves, sum_halves)); +#else + __m128d t1 = _mm_movehl_pd(a, a); + __m128d t2 = _mm_add_pd(a, t1); + __m128d t3 = _mm_shuffle_pd(t2, t2, 1); + __m128d t4 = _mm_add_ss(t2, t3); + return _mm_cvtsd_f64(t4); +#endif +} +#else // Horizontal add: Calculates the sum of all vector elements. 
BLAS_FINLINE float v_sum_f32(__m128 a) { @@ -36,11 +70,19 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } +#endif /*************************** * memory ***************************/ // unaligned load +#ifdef DOUBLE +#define v_loadu_f32 _mm_loadu_pd +#define v_storeu_f32 _mm_storeu_pd +#define v_setall_f32(VAL) _mm_set1_pd(VAL) +#define v_zero_f32 _mm_setzero_pd +#else #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps \ No newline at end of file +#define v_zero_f32 _mm_setzero_ps +#endif From ca160bb4400a298f10ac358dce328eabb8c49a70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 19:18:07 +0200 Subject: [PATCH 581/593] Add -msse4.1 when SSE4.1 is supported --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 8a3fc4eae..27eb571ee 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -16,6 +16,10 @@ ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3 endif +ifdef HAVE_SSE4_1 +CCOMMON_OPT += -msse4.1 +FCOMMON_OPT += -msse4.1 +endif endif endif From ebf0470fc25fd902a923d743977804ae672d4d20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 20:34:33 +0200 Subject: [PATCH 582/593] add sse4.1 for DYNAMIC_ARCH kernels --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index c95c15f56..abe2e08d6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,7 +45,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse3 -mssse3 + override CFLAGS += -msse3 -mssse3 -msse4.1 endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From 
bfdf4b56dac690cdb03ea06b362cc178f4228d1a Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 15 Oct 2020 10:29:42 +0800 Subject: [PATCH 583/593] Add double precision universal intrinsics for X86/ARM --- kernel/arm/sum.c | 21 +++++++++++++++++++++ kernel/simd/intrin_avx.h | 21 ++++++++++++++++++++- kernel/simd/intrin_avx512.h | 21 ++++++++++++++++++++- kernel/simd/intrin_neon.h | 28 +++++++++++++++++++++++++++- kernel/simd/intrin_sse.h | 23 ++++++++++++++++++++++- kernel/x86_64/daxpy.c | 10 ++++++++++ 6 files changed, 120 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index d4b3fbc83..63584b95c 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -43,6 +43,26 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if V_SIMD +#ifdef DOUBLE + const int vstep = v_nlanes_f64; + const int unrollx2 = n & (-vstep * 2); + const int unrollx = n & -vstep; + v_f64 vsum0 = v_zero_f64(); + v_f64 vsum1 = v_zero_f64(); + while (i < unrollx2) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); + i += vstep * 2; + } + vsum0 = v_add_f64(vsum0, vsum1); + while (i < unrollx) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + i += vstep; + } + sumf = v_sum_f64(vsum0); +#else const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; @@ -66,6 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i += vstep; } sumf = v_sum_f32(vsum0); +#endif #else int n1 = n & -4; for (; i < n1; i += 4) diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index f36a3dbf0..3f79646e0 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -4,20 +4,27 @@ * Data Type ***************************/ typedef __m256 v_f32; +typedef __m256d v_f64; #define v_nlanes_f32 8 +#define v_nlanes_f64 4 /*************************** * Arithmetic ***************************/ #define v_add_f32 _mm256_add_ps +#define v_add_f64 _mm256_add_pd #define 
v_mul_f32 _mm256_mul_ps +#define v_mul_f64 _mm256_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps + #define v_muladd_f64 _mm256_fmadd_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_add_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. @@ -31,11 +38,23 @@ BLAS_FINLINE float v_sum_f32(__m256 a) return _mm_cvtss_f32(sum); } +BLAS_FINLINE double v_sum_f64(__m256d a) +{ + __m256d sum_halves = _mm256_hadd_pd(a, a); + __m128d lo = _mm256_castpd256_pd128(sum_halves); + __m128d hi = _mm256_extractf128_pd(sum_halves, 1); + __m128d sum = _mm_add_pd(lo, hi); + return _mm_cvtsd_f64(sum); +} /*************************** * memory ***************************/ // unaligned load #define v_loadu_f32 _mm256_loadu_ps +#define v_loadu_f64 _mm256_loadu_pd #define v_storeu_f32 _mm256_storeu_ps +#define v_storeu_f64 _mm256_storeu_pd #define v_setall_f32(VAL) _mm256_set1_ps(VAL) -#define v_zero_f32 _mm256_setzero_ps \ No newline at end of file +#define v_setall_f64(VAL) _mm256_set1_pd(VAL) +#define v_zero_f32 _mm256_setzero_ps +#define v_zero_f64 _mm256_setzero_pd \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index 70e5f72e3..f00af53e9 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -4,15 +4,19 @@ * Data Type ***************************/ typedef __m512 v_f32; +typedef __m512d v_f64; #define v_nlanes_f32 16 +#define v_nlanes_f64 8 /*************************** * Arithmetic ***************************/ #define v_add_f32 _mm512_add_ps +#define v_add_f64 _mm512_add_pd #define v_mul_f32 _mm512_mul_ps +#define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps - +#define v_muladd_f64 _mm512_fmadd_pd BLAS_FINLINE 
float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); @@ -25,11 +29,26 @@ BLAS_FINLINE float v_sum_f32(v_f32 a) __m512 sum4 = _mm512_add_ps(sum8, h4); return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); } + +BLAS_FINLINE double v_sum_f64(v_f64 a) +{ + __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512d sum32 = _mm512_add_pd(a, h64); + __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512d sum16 = _mm512_add_pd(sum32, h32); + __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); + __m512d sum8 = _mm512_add_pd(sum16, h16); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); +} /*************************** * memory ***************************/ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) +#define v_loadu_f64(PTR) _mm512_loadu_pd((const __m512*)(PTR)) #define v_storeu_f32 _mm512_storeu_ps +#define v_storeu_f64 _mm512_storeu_pd #define v_setall_f32(VAL) _mm512_set1_ps(VAL) +#define v_setall_f64(VAL) _mm512_set1_pd(VAL) #define v_zero_f32 _mm512_setzero_ps +#define v_zero_f64 _mm512_setzero_pd diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 5875c0e4e..6df41cdd0 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -8,12 +8,18 @@ * Data Type ***************************/ typedef float32x4_t v_f32; +#if NPY_SIMD_F64 + typedef float64x2_t v_f64; +#endif #define v_nlanes_f32 4 +#define v_nlanes_f64 2 /*************************** * Arithmetic ***************************/ #define v_add_f32 vaddq_f32 +#define v_add_f64 vaddq_f64 #define v_mul_f32 vmulq_f32 +#define v_mul_f64 vmulq_f64 // FUSED F32 #ifdef HAVE_VFPV4 // FMA @@ -26,12 +32,26 @@ typedef float32x4_t v_f32; { return vmlaq_f32(c, a, b); } #endif +// FUSED F64 +#if NPY_SIMD_F64 + BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(c, a, b); } +#endif + // Horizontal add: Calculates the sum of all vector elements. 
BLAS_FINLINE float v_sum_f32(float32x4_t a) { float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); return vget_lane_f32(vpadd_f32(r, r), 0); } + +#if NPY_SIMD_F64 + BLAS_FINLINE double v_sum_f64(float64x2_t a) + { + return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + } +#endif + /*************************** * memory ***************************/ @@ -39,4 +59,10 @@ BLAS_FINLINE float v_sum_f32(float32x4_t a) #define v_loadu_f32(a) vld1q_f32((const float*)a) #define v_storeu_f32 vst1q_f32 #define v_setall_f32(VAL) vdupq_n_f32(VAL) -#define v_zero_f32() vdupq_n_f32(0.0f) \ No newline at end of file +#define v_zero_f32() vdupq_n_f32(0.0f) +#if NPY_SIMD_F64 + #define v_loadu_f64(a) vld1q_f64((const double*)a) + #define v_storeu_f64 vst1q_f64 + #define v_setall_f64 vdupq_n_f64 + #define v_zero_f64() vdupq_n_f64(0.0) +#endif \ No newline at end of file diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 9de7e1b27..06a3fe78b 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -4,22 +4,30 @@ * Data Type ***************************/ typedef __m128 v_f32; +typedef __m128d v_f64; #define v_nlanes_f32 4 +#define v_nlanes_f64 2 /*************************** * Arithmetic ***************************/ #define v_add_f32 _mm_add_ps +#define v_add_f64 _mm_add_pd #define v_mul_f32 _mm_mul_ps +#define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps + #define v_muladd_f64 _mm_fmadd_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps + #define v_muladd_f64 _mm_macc_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_add_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. 
@@ -36,11 +44,24 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } + +BLAS_FINLINE double v_sum_f64(__m128d a) +{ +#ifdef HAVE_SSE3 + return _mm_cvtsd_f64(_mm_hadd_pd(a, a)); +#else + return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); +#endif +} /*************************** * memory ***************************/ // unaligned load #define v_loadu_f32 _mm_loadu_ps +#define v_loadu_f64 _mm_loadu_pd #define v_storeu_f32 _mm_storeu_ps +#define v_storeu_f64 _mm_storeu_pd #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps \ No newline at end of file +#define v_setall_f64(VAL) _mm_set1_pd(VAL) +#define v_zero_f32 _mm_setzero_ps +#define v_zero_f64 _mm_setzero_pd \ No newline at end of file diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index b62e3dcb3..26437012c 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -53,6 +53,15 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) BLASLONG register i = 0; FLOAT a = *alpha; #if V_SIMD +#ifdef DOUBLE + v_f64 __alpha, tmp; + __alpha = v_setall_f64(*alpha); + const int vstep = v_nlanes_f64; + for (; i < n; i += vstep) { + tmp = v_muladd_f64(__alpha, v_loadu_f64( x + i ), v_loadu_f64(y + i)); + v_storeu_f64(y + i, tmp); + } +#else v_f32 __alpha, tmp; __alpha = v_setall_f32(*alpha); const int vstep = v_nlanes_f32; @@ -60,6 +69,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); v_storeu_f32(y + i, tmp); } +#endif #else while(i < n) { From 4fac91ef37b37dc8979ac47d888320de3845acc3 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 15 Oct 2020 11:08:10 +0800 Subject: [PATCH 584/593] adapt arm platform --- kernel/simd/intrin_neon.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 6df41cdd0..22cef10ca 100644 --- a/kernel/simd/intrin_neon.h +++ 
b/kernel/simd/intrin_neon.h @@ -8,7 +8,7 @@ * Data Type ***************************/ typedef float32x4_t v_f32; -#if NPY_SIMD_F64 +#if V_SIMD_F64 typedef float64x2_t v_f64; #endif #define v_nlanes_f32 4 @@ -33,7 +33,7 @@ typedef float32x4_t v_f32; #endif // FUSED F64 -#if NPY_SIMD_F64 +#if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } #endif @@ -45,7 +45,7 @@ BLAS_FINLINE float v_sum_f32(float32x4_t a) return vget_lane_f32(vpadd_f32(r, r), 0); } -#if NPY_SIMD_F64 +#if V_SIMD_F64 BLAS_FINLINE double v_sum_f64(float64x2_t a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); @@ -60,7 +60,7 @@ BLAS_FINLINE float v_sum_f32(float32x4_t a) #define v_storeu_f32 vst1q_f32 #define v_setall_f32(VAL) vdupq_n_f32(VAL) #define v_zero_f32() vdupq_n_f32(0.0f) -#if NPY_SIMD_F64 +#if V_SIMD_F64 #define v_loadu_f64(a) vld1q_f64((const double*)a) #define v_storeu_f64 vst1q_f64 #define v_setall_f64 vdupq_n_f64 From ae6ac83991539d688095bcfc66bfb22f054860be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 08:37:02 +0200 Subject: [PATCH 585/593] Revert "add double precision SSE" --- kernel/simd/intrin_sse.h | 48 +++------------------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 7449a5a0b..9de7e1b27 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -3,59 +3,25 @@ /*************************** * Data Type ***************************/ -#ifdef DOUBLE -typedef __m128d v_f32; -#else typedef __m128 v_f32; -#endif - #define v_nlanes_f32 4 /*************************** * Arithmetic ***************************/ -#ifdef DOUBLE -#define v_add_f32 _mm_add_pd -#define v_mul_f32 _mm_mul_pd -#else #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps -#endif #ifdef HAVE_FMA3 // multiply and add, a*b + c -#ifdef DOUBLE - #define v_muladd_f32 _mm_fmadd_pd -#else - #define v_muladd_f32 _mm_fmadd_ps 
-#endif + #define v_muladd_f32 _mm_fmadd_ps #elif defined(HAVE_FMA4) // multiply and add, a*b + c - #ifdef DOUBLE - #define v_muladd_f32 _mm_macc_pd - #else - #define v_muladd_f32 _mm_macc_ps - #endif + #define v_muladd_f32 _mm_macc_ps #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 -// Horizontal add: Calculates the sum of all vector elements. -#ifdef DOUBLE -BLAS_FINLINE double v_sum_f32(__m128d a) -{ -#ifdef HAVE_SSE3 - __m128d sum_halves = _mm_hadd_pd(a, a); - return _mm_cvtsd_f64(_mm_hadd_pd(sum_halves, sum_halves)); -#else - __m128d t1 = _mm_movehl_pd(a, a); - __m128d t2 = _mm_add_pd(a, t1); - __m128d t3 = _mm_shuffle_pd(t2, t2, 1); - __m128d t4 = _mm_add_ss(t2, t3); - return _mm_cvtsd_f64(t4); -#endif -} -#else // Horizontal add: Calculates the sum of all vector elements. BLAS_FINLINE float v_sum_f32(__m128 a) { @@ -70,19 +36,11 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } -#endif /*************************** * memory ***************************/ // unaligned load -#ifdef DOUBLE -#define v_loadu_f32 _mm_loadu_pd -#define v_storeu_f32 _mm_storeu_pd -#define v_setall_f32(VAL) _mm_set1_pd(VAL) -#define v_zero_f32 _mm_setzero_pd -#else #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps -#endif +#define v_zero_f32 _mm_setzero_ps \ No newline at end of file From 10379fc83baced749a2e4f881daa923d9361df26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 19:05:37 +0200 Subject: [PATCH 586/593] Use ifdef instead of if --- kernel/setparam-ref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 72fbf32bf..849a4194a 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1164,7 +1164,7 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_q 
= QGEMM_DEFAULT_Q; #endif -#if (CORE_KATMAI) || (CORE_COPPERMINE) || (CORE_BANIAS) || (CORE_YONAH) || (CORE_ATHLON) +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) #ifdef DEBUG fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); From ac8af9cec6e9c391f9047992c15454db8ada1821 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 19:06:45 +0200 Subject: [PATCH 587/593] Add -msse where supported, apparently required for older gcc --- Makefile.x86 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Makefile.x86 b/Makefile.x86 index a6196d365..330690935 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -54,3 +54,19 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif + +ifdef HAVE_SSE3 +ifndef DYNAMIC_ARCH +CCOMMON_OPT += -msse3 +FCOMMON_OPT += -msse3 +ifdef HAVE_SSSE3 +CCOMMON_OPT += -mssse3 +FCOMMON_OPT += -mssse3 +endif +ifdef HAVE_SSE4_1 +CCOMMON_OPT += -msse4.1 +FCOMMON_OPT += -msse4.1 +endif +endif +endif + From c339c40c01c11046bd9886a00f16deb9a6d675a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 19:08:12 +0200 Subject: [PATCH 588/593] Silence a redefinition warning --- kernel/x86_64/iamax_sse.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index 4f62b9be2..14c7f43ec 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -51,6 +51,8 @@ #define MAXPS maxps #define MAXSS maxss #ifdef USE_MIN +#undef MAXPS +#undef MAXSS #define MAXPS minps #define MAXSS minss #endif From dc6cefd2f588c27847f2c4b5a8ad42cbf6331299 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 20:16:15 +0200 Subject: [PATCH 589/593] Expressly enable -msse for 32bit DYNAMIC_ARCH kernels --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/Makefile 
b/kernel/Makefile index abe2e08d6..65e2a0ad6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -46,6 +46,9 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) override CFLAGS += -msse3 -mssse3 -msse4.1 +endif + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE NEHALEM BARCELONA CORE2 PRESCOTT NORTHWOOD ATHLON)) + override CFLAGS += -msse endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From f071d1207ab2d25247bf6ba02a2f16bf02273a5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 22:10:32 +0200 Subject: [PATCH 590/593] add sse2 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 65e2a0ad6..495f3609f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,7 +48,7 @@ ifdef TARGET_CORE override CFLAGS += -msse3 -mssse3 -msse4.1 endif ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE NEHALEM BARCELONA CORE2 PRESCOTT NORTHWOOD ATHLON)) - override CFLAGS += -msse + override CFLAGS += -msse -msse2 endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From df706670430ef39aeb0a423e367560e452909139 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 09:55:48 +0200 Subject: [PATCH 591/593] fix core list for sse/sse2 --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index 495f3609f..43318d475 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,9 +45,9 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - 
override CFLAGS += -msse3 -mssse3 -msse4.1 + override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE NEHALEM BARCELONA CORE2 PRESCOTT NORTHWOOD ATHLON)) + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) override CFLAGS += -msse -msse2 endif ifeq ($(TARGET_CORE), COOPERLAKE) From 786c0a3ce80b4a3598d7a534470aa5f6b7e6b01c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 10:41:53 +0200 Subject: [PATCH 592/593] Add sse options for use of intrinics with older compilers --- cmake/cc.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 9f5cc1bf7..2f4d1c6d7 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,10 +124,19 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_SSE) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse") + endif () + if (HAVE_SSE2) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse2") + endif () if (HAVE_SSE3) set (CCOMMON_OPT "${CCOMMON_OPT} -msse3") endif () if (HAVE_SSSE3) set (CCOMMON_OPT "${CCOMMON_OPT} -mssse3") endif () + if (HAVE_SSE4_1) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse4.1") + endif () endif() From f64243ff57d79c6bd23d39c49648adfddbe018a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 10:47:06 +0200 Subject: [PATCH 593/593] Add compiler options for sse/sse2/ssse3/sse4.1 --- cmake/system.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index b34d4a9a5..4cc46236d 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,9 +70,21 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + endif() if (DEFINED HAVE_SSE3) set 
(KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() endif() if (DEFINED TARGET)