| @@ -171,3 +171,11 @@ In chronological order: | |||
| * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes | |||
| * [2019-03-14] power9 dgemm/dtrmm kernel | |||
| * [2019-04-29] power9 sgemm/strmm kernel | |||
| * Jiachen Wang <https://github.com/wjc404> | |||
| * [2019-07-29] optimize AVX2 DGEMM | |||
| * [2019-10-20] AVX512 DGEMM kernel (4x8) | |||
| * [2019-11-06] optimize AVX512 SGEMM | |||
| * [2019-11-12] AVX512 CGEMM & ZGEMM kernels | |||
| * [2019-12-23] optimize AVX2 CGEMM and ZGEMM | |||
| * [2019-12-27] AVX2 CGEMM3M kernel | |||
| @@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; | |||
| if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; | |||
| START_RPCC(); | |||
| @@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; | |||
| if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; | |||
| START_RPCC(); | |||
| @@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; | |||
| if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; | |||
| START_RPCC(); | |||
| @@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CTRMMKERNEL = cgemm_kernel_8x2_haswell.S | |||
| CGEMMKERNEL = cgemm_kernel_8x2_haswell.S | |||
| CGEMMKERNEL = cgemm_kernel_8x2_haswell.c | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -67,7 +67,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -97,6 +97,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||
| CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||
| @@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CTRMMKERNEL = cgemm_kernel_8x2_haswell.S | |||
| CGEMMKERNEL = cgemm_kernel_8x2_haswell.S | |||
| CGEMMKERNEL = cgemm_kernel_8x2_haswell.c | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S | |||
| ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -94,6 +94,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S | |||
| CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S | |||
| @@ -0,0 +1,279 @@ | |||
| /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ | |||
| /* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| //recommended settings: GEMM_P = 320, GEMM_Q = 320. | |||
| /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ | |||
| #define KERNEL_k1m8n1 \ | |||
| "vmovups (%0),%%ymm1; addq $32,%0;"\ | |||
| "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ | |||
| "addq $4,%1;" | |||
| #define KERNEL_h_k1m8n2 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" | |||
| #define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" | |||
| #define KERNEL_h_k1m8n4 \ | |||
| KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" | |||
| #define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" | |||
| #define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \ | |||
| "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ | |||
| "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" | |||
| #define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) | |||
| #define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" | |||
| #define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) | |||
| #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" | |||
| #define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" | |||
| #define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" | |||
| #define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" | |||
| #define unit_init_m8n4(c1,c2,c3,c4) \ | |||
| "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" | |||
| #define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) | |||
| #define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) | |||
| #define SAVE_m8n1 \ | |||
| "vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\ | |||
| "vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\ | |||
| "vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);" | |||
| #define unit_save_m8n2(c1,c2) \ | |||
| "vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\ | |||
| "vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\ | |||
| "vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\ | |||
| "vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\ | |||
| "vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\ | |||
| "vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\ | |||
| "leaq (%5,%3,2),%5;" | |||
| #define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) | |||
| #define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) | |||
| #define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11) | |||
| #define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) | |||
| #define COMPUTE_m8(ndim) \ | |||
| INIT_m8n##ndim\ | |||
| "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\ | |||
| "cmpq $24,%4; jb "#ndim"882f;"\ | |||
| #ndim"881:\n\t"\ | |||
| "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\ | |||
| "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ | |||
| "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ | |||
| "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ | |||
| "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ | |||
| "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ | |||
| "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ | |||
| "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ | |||
| "prefetcht1 (%8); addq $16,%8;"\ | |||
| "subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\ | |||
| "movq %2,%5;"\ | |||
| #ndim"882:\n\t"\ | |||
| "testq %4,%4; jz "#ndim"883f;"\ | |||
| "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ | |||
| KERNEL_k1m8n##ndim\ | |||
| "decq %4; jmp "#ndim"882b;"\ | |||
| #ndim"883:\n\t"\ | |||
| "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ | |||
| SAVE_m8n##ndim "addq $64,%2;" | |||
| /* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ | |||
| #define KERNEL_k1m4n1 \ | |||
| "vmovups (%0),%%xmm1; addq $16,%0;"\ | |||
| "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ | |||
| "addq $4,%1;" | |||
| #define KERNEL_h_k1m4n2 \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ | |||
| "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" | |||
| #define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" | |||
| #define KERNEL_h_k1m4n4 \ | |||
| KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" | |||
| #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" | |||
| #define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \ | |||
| "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ | |||
| "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" | |||
| #define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1) | |||
| #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" | |||
| #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2) | |||
| #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" | |||
| #define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" | |||
| #define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" | |||
| #define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" | |||
| #define unit_init_m4n4(c1,c2,c3,c4) \ | |||
| "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" | |||
| #define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) | |||
| #define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) | |||
| #define SAVE_m4n1 \ | |||
| "vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\ | |||
| "vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);" | |||
| #define unit_save_m4n2(c1,c2) \ | |||
| "vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\ | |||
| "vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\ | |||
| "vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\ | |||
| "vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\ | |||
| "vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\ | |||
| "leaq (%5,%3,2),%5;" | |||
| #define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) | |||
| #define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) | |||
| #define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) | |||
| #define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) | |||
| #define COMPUTE_m4(ndim) \ | |||
| INIT_m4n##ndim\ | |||
| "movq %%r13,%4; movq %%r14,%1;"\ | |||
| #ndim"442:\n\t"\ | |||
| "testq %4,%4; jz "#ndim"443f;"\ | |||
| KERNEL_k1m4n##ndim\ | |||
| "decq %4; jmp "#ndim"442b;"\ | |||
| #ndim"443:\n\t"\ | |||
| SAVE_m4n##ndim "addq $32,%2;" | |||
| /* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ | |||
| #define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" | |||
| #define KERNEL_k1m2n1 \ | |||
| "vmovsd (%0),%%xmm1; addq $8,%0;"\ | |||
| "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ | |||
| "addq $4,%1;" | |||
| #define SAVE_m2n1 \ | |||
| "vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);" | |||
| #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" | |||
| #define KERNEL_k1m2n2 \ | |||
| "vmovsd (%0),%%xmm1; addq $8,%0;"\ | |||
| "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ | |||
| "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ | |||
| "addq $8,%1;" | |||
| #define SAVE_m2n2 SAVE_m2n1 \ | |||
| "vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);" | |||
| #define INIT_m2n4 INIT_m2n2 | |||
| #define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" | |||
| #define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" | |||
| #define KERNEL_k1m2n4 \ | |||
| "vmovups (%1),%%xmm3; addq $16,%1;"\ | |||
| "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ | |||
| "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "addq $8,%0;" | |||
| #define KERNEL_k1m2n8 \ | |||
| "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ | |||
| "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ | |||
| "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ | |||
| "addq $8,%0;" | |||
| #define KERNEL_k1m2n12 \ | |||
| "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ | |||
| "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ | |||
| "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ | |||
| "addq $8,%0;" | |||
| #define unit_save_m2n4(c1,c2) \ | |||
| "vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\ | |||
| "vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ | |||
| "vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ | |||
| "leaq (%5,%3,2),%5;"\ | |||
| "vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ | |||
| "vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ | |||
| "leaq (%5,%3,2),%5;" | |||
| #define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) | |||
| #define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) | |||
| #define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) | |||
| #define COMPUTE_m2(ndim) \ | |||
| INIT_m2n##ndim\ | |||
| "movq %%r13,%4; movq %%r14,%1;"\ | |||
| #ndim"222:\n\t"\ | |||
| "testq %4,%4; jz "#ndim"223f;"\ | |||
| KERNEL_k1m2n##ndim\ | |||
| "decq %4; jmp "#ndim"222b;"\ | |||
| #ndim"223:\n\t"\ | |||
| SAVE_m2n##ndim "addq $16,%2;" | |||
| /* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ | |||
| #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" | |||
| #define KERNEL_k1m1n1 \ | |||
| "vmovss (%1),%%xmm3; addq $4,%1;"\ | |||
| "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ | |||
| "addq $4,%0;" | |||
| #define SAVE_m1n1 \ | |||
| "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" | |||
| #define INIT_m1n2 INIT_m1n1 | |||
| #define KERNEL_k1m1n2 \ | |||
| "vmovsd (%1),%%xmm3; addq $8,%1;"\ | |||
| "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ | |||
| "addq $4,%0;" | |||
| #define SAVE_m1n2 \ | |||
| "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ | |||
| "vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);" | |||
| #define INIT_m1n4 INIT_m1n2 | |||
| #define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" | |||
| #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" | |||
| #define KERNEL_k1m1n4 \ | |||
| "vmovups (%1),%%xmm3; addq $16,%1;"\ | |||
| "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ | |||
| "addq $4,%0;" | |||
| #define KERNEL_k1m1n8 \ | |||
| "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ | |||
| "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ | |||
| "addq $4,%0;" | |||
| #define KERNEL_k1m1n12 \ | |||
| "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ | |||
| "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ | |||
| "addq $4,%0;" | |||
| #define unit_save_m1n4(c1) \ | |||
| "vunpcklps "#c1","#c1",%%xmm1; vunpckhps "#c1","#c1",%%xmm2;"\ | |||
| "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ | |||
| "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ | |||
| "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ | |||
| "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;" | |||
| #define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) | |||
| #define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) | |||
| #define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) | |||
| #define COMPUTE_m1(ndim) \ | |||
| INIT_m1n##ndim\ | |||
| "movq %%r13,%4; movq %%r14,%1;"\ | |||
| #ndim"112:\n\t"\ | |||
| "testq %4,%4; jz "#ndim"113f;"\ | |||
| KERNEL_k1m1n##ndim\ | |||
| "decq %4; jmp "#ndim"112b;"\ | |||
| #ndim"113:\n\t"\ | |||
| SAVE_m1n##ndim "addq $8,%2;" | |||
| /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ | |||
| /* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */ | |||
| /* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */ | |||
| #define COMPUTE(ndim) {\ | |||
| next_b = b_pointer + ndim * K;\ | |||
| __asm__ __volatile__(\ | |||
| "vbroadcastsd (%6),%%ymm0;"\ | |||
| "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ | |||
| "cmpq $8,%7;jb 33101"#ndim"f;"\ | |||
| "33109"#ndim":\n\t"\ | |||
| COMPUTE_m8(ndim)\ | |||
| "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\ | |||
| "33101"#ndim":\n\t"\ | |||
| "cmpq $4,%7;jb 33103"#ndim"f;"\ | |||
| COMPUTE_m4(ndim)\ | |||
| "subq $4,%7;"\ | |||
| "33103"#ndim":\n\t"\ | |||
| "cmpq $2,%7;jb 33104"#ndim"f;"\ | |||
| COMPUTE_m2(ndim)\ | |||
| "subq $2,%7;"\ | |||
| "33104"#ndim":\n\t"\ | |||
| "testq %7,%7;jz 33105"#ndim"f;"\ | |||
| COMPUTE_m1(ndim)\ | |||
| "33105"#ndim":\n\t"\ | |||
| "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ | |||
| :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ | |||
| ::"r11","r12","r13","r14","r15",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ | |||
| a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ | |||
| } | |||
| int __attribute__ ((noinline)) | |||
| CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) | |||
| { | |||
| if(m==0||n==0||k==0) return 0; | |||
| int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; | |||
| float constval[2]; constval[0] = alphar; constval[1] = alphai; | |||
| float *const_val=constval; | |||
| int64_t M = (int64_t)m, K = (int64_t)k; | |||
| BLASLONG n_count = n; | |||
| float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) COMPUTE(2) | |||
| if(n_count>0) COMPUTE(1) | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,292 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| /* recommended settings: GEMM_P = 256, GEMM_Q = 256 */ | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define A_CONJ 0 | |||
| #define B_CONJ 0 | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define A_CONJ 1 | |||
| #define B_CONJ 0 | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define A_CONJ 0 | |||
| #define B_CONJ 1 | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| #define A_CONJ 1 | |||
| #define B_CONJ 1 | |||
| #endif | |||
| /* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ | |||
| /* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */ | |||
| /* m=8, ymm 0-3 temp, ymm 4-15 acc */ | |||
| #if A_CONJ == B_CONJ | |||
| #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" | |||
| #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" | |||
| #else | |||
| #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" | |||
| #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" | |||
| #endif | |||
| /* expanded accumulators for m8n1 and m8n2 */ | |||
| #define KERNEL_k1m8n1 \ | |||
| "vbroadcastsd (%1),%%ymm0; addq $8,%1;"\ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\ | |||
| "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\ | |||
| "addq $64,%0;" | |||
| #define KERNEL_k1m8n2 \ | |||
| "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\ | |||
| "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\ | |||
| "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\ | |||
| "addq $64,%0;" | |||
| /* contracted accumulators for m8n4 and m8n6 */ | |||
| #define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \ | |||
| "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\ | |||
| "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc) | |||
| #define KERNEL_1_k1m8n4 \ | |||
| "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ | |||
| acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) | |||
| #define KERNEL_2_k1m8n4 \ | |||
| "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ | |||
| acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) | |||
| #define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) | |||
| #define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) | |||
| #define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;" | |||
| #define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;" | |||
| #define zero_4ymm(no1,no2,no3,no4) \ | |||
| "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ | |||
| "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" | |||
| /* initialization and storage macros */ | |||
| #define INIT_m8n1 zero_4ymm(4,5,6,7) | |||
| #define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) | |||
| #define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) | |||
| #define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15) | |||
| #if A_CONJ == B_CONJ | |||
| #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" | |||
| #else | |||
| #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" | |||
| #endif | |||
| #if A_CONJ == 0 | |||
| #define save_1ymm(c,tmp,off,alpr,alpi,...) \ | |||
| "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ | |||
| "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");" | |||
| #else | |||
| #define save_1ymm(c,tmp,off,alpr,alpi,...) \ | |||
| "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ | |||
| "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");" | |||
| #endif | |||
| #define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" | |||
| #define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) | |||
| #define SAVE_m8n2 SAVE_m8n1\ | |||
| cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) | |||
| #define SAVE_m8n4 save_init_m8\ | |||
| save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ | |||
| save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) | |||
| #define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\ | |||
| save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) | |||
| #define COMPUTE_m8(ndim) \ | |||
| "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\ | |||
| "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\ | |||
| "movq $10,%5; movq $84,%%r15;"\ | |||
| #ndim"8881:\n\t"\ | |||
| "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ | |||
| "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ | |||
| KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ | |||
| "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ | |||
| KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ | |||
| "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\ | |||
| "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\ | |||
| #ndim"8882:\n\t"\ | |||
| "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ | |||
| KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ | |||
| #ndim"8883:\n\t"\ | |||
| "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim | |||
| /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ | |||
| #define KERNEL_k1m4n1 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ | |||
| "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;" | |||
| #define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \ | |||
| "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\ | |||
| "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r) | |||
| #define KERNEL_h_k1m4n2 \ | |||
| "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1) | |||
| #define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1) | |||
| #define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2) | |||
| #define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" | |||
| #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" | |||
| #define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;" | |||
| #define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" | |||
| #define INIT_m4n2 zero_4ymm(4,5,6,7) | |||
| #define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11) | |||
| #define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) | |||
| #define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" | |||
| #define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) | |||
| #define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) | |||
| #define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\ | |||
| cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) | |||
| #define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ | |||
| cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) | |||
| #define COMPUTE_m4(ndim) \ | |||
| "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\ | |||
| "testq %5,%5; jz "#ndim"4442f;"\ | |||
| #ndim"4441:\n\t"\ | |||
| KERNEL_k1m4n##ndim\ | |||
| "decq %5; jnz "#ndim"4441b;"\ | |||
| #ndim"4442:\n\t"\ | |||
| SAVE_m4n##ndim | |||
| /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ | |||
| #if A_CONJ == B_CONJ | |||
| #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" | |||
| #else | |||
| #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" | |||
| #endif | |||
| #define KERNEL_h_k1m2n1 \ | |||
| "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\ | |||
| "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5) | |||
| #define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\ | |||
| "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7) | |||
| #define acc_m2n2_exp(c1,c2,c3,c4,...)\ | |||
| "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\ | |||
| "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4) | |||
| #define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) | |||
| #define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) | |||
| #define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;" | |||
| #define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" | |||
| #define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" | |||
| #define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;" | |||
| #define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";" | |||
| #define INIT_m2n1 zero_2xmm(4,5) | |||
| #define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7) | |||
| #define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11) | |||
| #define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15) | |||
| #if A_CONJ == B_CONJ | |||
| #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" | |||
| #else | |||
| #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" | |||
| #endif | |||
| #if A_CONJ == 0 | |||
| #define save_1xmm(c,tmp,alpr,alpi) \ | |||
| "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\ | |||
| "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;" | |||
| #else | |||
| #define save_1xmm(c,tmp,alpr,alpi) \ | |||
| "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\ | |||
| "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;" | |||
| #endif | |||
| #define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" | |||
| #define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1) | |||
| #define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1) | |||
| #define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1) | |||
| #define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1) | |||
| #define COMPUTE_m2(ndim) \ | |||
| "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ | |||
| "testq %5,%5; jz "#ndim"2222f;"\ | |||
| #ndim"2221:\n\t"\ | |||
| KERNEL_k1m2n##ndim\ | |||
| "decq %5; jnz "#ndim"2221b;"\ | |||
| #ndim"2222:\n\t"\ | |||
| SAVE_m2n##ndim | |||
/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
#if A_CONJ == B_CONJ
/* accumulate one k-step: cl += a*b (left/real part), cr +/-= a*b
   (right/imag part); the fnmadd variant flips the imag sign when exactly
   one of A,B is conjugated */
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
#else
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
#endif
/* broadcast a_re/a_im from packed A (%0, 8 bytes per complex element),
   load 1 complex element of B for n1 / 2 elements for n2 */
#define KERNEL_k1m1n1 \
 "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
 "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5)
#define KERNEL_h_k1m1n2 \
 "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
 "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5)
/* wider n reuses the same broadcast A values against further B panels
   reached through r12 (= k<<4, the byte size of one packed 2-column panel) */
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7)
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9)
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;"
#define INIT_m1n1 zero_2xmm(4,5)
#define INIT_m1n2 zero_2xmm(4,5)
#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7)
#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9)
#if A_CONJ == 0
/* m=1 stores: one complex element per column, so C is read/written with
   vmovsd (low 8 bytes); save_m1n2 packs two columns into one xmm via
   vmovhpd and steps %3 over both columns at the end */
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
 "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\
 "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
 "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
 "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\
 "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
#else
/* conjugated-A variants use the fmaddsub forms and accumulate into tmp1 */
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
 "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\
 "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
 "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
 "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\
 "vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
/* %3 = tile's C pointer; step c_ptr past 1 complex float (8 bytes);
   broadcast alpha_r/alpha_i into xmm0/1 */
#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1)
#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1)
#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1)
#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1)
/* m=1 driver: rewind packed B from r14, zero the accumulators, run the
   k-loop (count reloaded from r13 into %5), then store the column strip */
#define COMPUTE_m1(ndim) \
 "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
 "testq %5,%5; jz "#ndim"1112f;"\
#ndim"1111:\n\t"\
 KERNEL_k1m1n##ndim\
 "decq %5; jnz "#ndim"1111b;"\
#ndim"1112:\n\t"\
 SAVE_m1n##ndim
/* Top-level asm driver for one ndim-column block (single precision).
   Saves b_ptr/k/m in r14/r13/r11; r12 = k<<4 = byte length of one packed
   2-column B panel (k * 2 complex floats * 8 bytes), used as the stride
   between B panels.  Sweeps the m dimension with the widest kernel first
   (8, then 4, 2, 1 rows); COMPUTE_m8/COMPUTE_m4 are defined earlier in
   this file.  After the asm, rewinds a_ptr and advances b_ptr/c_ptr to
   the next column block. */
#define COMPUTE(ndim) {\
 b_pref = b_ptr + ndim * K *2;\
 __asm__ __volatile__ (\
 "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\
 "cmpq $8,%7; jb "#ndim"9992f;"\
#ndim"9991:\n\t"\
 COMPUTE_m8(ndim)\
 "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\
#ndim"9992:\n\t"\
 "cmpq $4,%7; jb "#ndim"9993f;"\
 COMPUTE_m4(ndim) "subq $4,%7;"\
#ndim"9993:\n\t"\
 "cmpq $2,%7; jb "#ndim"9994f;"\
 COMPUTE_m2(ndim) "subq $2,%7;"\
#ndim"9994:\n\t"\
 "testq %7,%7; jz "#ndim"9995f;"\
 COMPUTE_m1(ndim)\
#ndim"9995:\n\t"\
 "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
 :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
 ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
 "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
 a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
}
| int __attribute__ ((noinline)) | |||
| CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) | |||
| { | |||
| if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; | |||
| int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; | |||
| #if A_CONJ == B_CONJ | |||
| float const_val[2] = {-alphar, -alphai}; | |||
| #else | |||
| float const_val[2] = {alphar, alphai}; | |||
| #endif | |||
| int64_t M = (int64_t)m, K = (int64_t)k; | |||
| BLASLONG n_count = n; | |||
| float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; | |||
| for(;n_count>5;n_count-=6) COMPUTE(6) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) COMPUTE(2) | |||
| if(n_count>0) COMPUTE(1) | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,240 @@ | |||
#include "common.h"
#include <stdint.h>
/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */
/* Collapse the 16 transpose/conjugation variants (NN, NT, ..., CC) into
   two flags consumed by the asm macros below: A_CONJ / B_CONJ tell
   whether the packed A / B panels carry conjugated data. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define A_CONJ 0
#define B_CONJ 0
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define A_CONJ 1
#define B_CONJ 0
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define A_CONJ 0
#define B_CONJ 1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define A_CONJ 1
#define B_CONJ 1
#endif
/* asm operand map, fixed for every COMPUTE_* macro in this file: */
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */
/* m=4, ymm 0-3 temp, ymm 4-15 acc */
/* m=4 micro-tile (double-precision complex, AVX2).  ymm 0-3 temp,
   ymm 4-15 acc.  Two accumulator styles are used: "expanded" pairs
   (separate re/im product registers, contracted at save time) for the
   narrow n=1/2 cases, and "contracted" fmaddsub accumulation for n=4/6. */
#if A_CONJ == B_CONJ
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
#define acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
#else
/* mixed conjugation flips the imag accumulation (fnmadd) / the add-sub
   lane pattern (fmsubadd) */
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
#define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
#endif
/* expanded accumulators for m4n1 and m4n2 */
#define KERNEL_k1m4n1 \
 "vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\
 "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\
 "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\
 "addq $64,%0;"
#define KERNEL_k1m4n2 \
 "vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\
 "vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\
 "vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\
 "addq $64,%0;"
/* contracted accumulators for m4n4 and m4n6 */
/* accumulate 4 rows x 2 columns: broadcast b from lboff/rboff past the
   variadic base address (B panel, optionally offset by r12 strides) */
#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
 "vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\
 "vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc)
/* each k-step is split in two halves: the second half reuses the loaded
   A values with re/im lanes swapped ($5 permute) against the imag parts
   of B (offsets 8/24) */
#define KERNEL_1_k1m4n4 \
 "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
 acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
#define KERNEL_2_k1m4n4 \
 "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
 acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;"
#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;"
/* clear four ymm accumulators via self-XOR */
#define zero_4ymm(no1,no2,no3,no4) \
 "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
 "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
/* initialization and storage macros */
#define INIT_m4n1 zero_4ymm(4,5,6,7)
#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
#if A_CONJ == B_CONJ
/* contract an expanded pair (cl,cr) -> dst: swap re/im lanes of cr, then
   vaddsubpd merges with cl; operand order picks the sign pattern for the
   conjugation mode */
#define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
#else
#define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
#endif
#if A_CONJ == 0
/* save_1ymm: C(mem at off past the variadic address) += alpha * c, the
   complex multiply done in two fused steps (alpr, then swapped-lane alpi) */
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
 "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
 "vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");"
#else
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
 "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
 "vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");"
#endif
/* %3 = tile's C pointer; step c_ptr past 4 complex doubles (64 bytes);
   broadcast alpha_r/alpha_i into ymm0/1 */
#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
/* n=1/2 contract the expanded pairs first; n=4/6 are already contracted */
#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
#define SAVE_m4n2 SAVE_m4n1\
 cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
#define SAVE_m4n4 save_init_m4\
 save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
 save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
 save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
/* m=4 driver.  For k >= 10 the loop runs 4 k-steps per iteration with
   interleaved prefetching of the next C tile (%3 walks ahead of C) and
   of the B panels; the r15 dance ("subq $63 / addq r15", cmovz) steers
   the C-prefetch stride.  The final "negq/leaq 10(...)" recomputes the
   leftover k count for the plain tail loop at 4442.  NOTE(review):
   prefetch distances/constants look tuned empirically — confirm before
   changing. */
#define COMPUTE_m4(ndim) \
 "movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\
 "testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\
 "movq $10,%5; movq $84,%%r15;"\
#ndim"4441:\n\t"\
 "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
 "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
 "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
 "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
 "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
 "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
#ndim"4442:\n\t"\
 "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
 KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\
#ndim"4443:\n\t"\
 "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim
/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
#define KERNEL_k1m2n1 \
 "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\
 "vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;"
/* two columns of B per step, loaded from the variadic B-panel address */
#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) \
 "vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\
 "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r)
#define KERNEL_h_k1m2n2 \
 "vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1)
/* wider n reuses the loaded A values against further B panels via r12
   (= k<<5, byte size of one packed 2-column panel) */
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;"
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;"
#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m2n2 zero_4ymm(4,5,6,7)
#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11)
#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15)
/* %3 = tile's C pointer; step c_ptr past 2 complex doubles (32 bytes) */
#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\
 cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\
 cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
/* m=2 driver: rewind B, init accumulators, k-loop, store */
#define COMPUTE_m2(ndim) \
 "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
 "testq %5,%5; jz "#ndim"2222f;"\
#ndim"2221:\n\t"\
 KERNEL_k1m2n##ndim\
 "decq %5; jnz "#ndim"2221b;"\
#ndim"2222:\n\t"\
 SAVE_m2n##ndim
/* m=1: xmm/ymm 0-3 temp, registers 4-9 acc, expanded accumulators
   (n=1 fits in xmm; n>=2 packs two columns per ymm) */
#if A_CONJ == B_CONJ
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
#else
/* mixed conjugation: imag products accumulate with fnmadd */
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
#endif
#define KERNEL_k1m1n1 \
 "vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\
 "vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5)
#define KERNEL_h_k1m1n2 \
 "vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\
 "vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5)
/* further B panels reached through r12 strides */
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7)
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9)
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;"
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;"
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;"
#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;"
#if A_CONJ == B_CONJ
/* xmm variant of the expanded-pair contraction, for the m1n1 case */
#define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
#else
#define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
#endif
#if A_CONJ == 0
/* m=1 stores: n1 writes one complex double (xmm); n2 gathers two C
   columns into one ymm (vinsertf128), scales, scatters them back
   (vextractf128) and steps %3 over both columns */
#define save_m1n1(c,tmp,alpr,alpi) \
 "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\
 "vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
 "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
 "vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\
 "vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
#else
/* conjugated-A variants use the fmaddsub forms and accumulate into tmp */
#define save_m1n1(c,tmp,alpr,alpi) \
 "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\
 "vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
 "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
 "vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\
 "vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
/* %3 = tile's C pointer; step c_ptr past 1 complex double (16 bytes) */
#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1)
#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1)
#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1)
#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1)
/* m=1 driver: rewind packed B from r14, zero the accumulators, run the
   k-loop (count reloaded from r13 into %5), then store the column strip */
#define COMPUTE_m1(ndim) \
 "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
 "testq %5,%5; jz "#ndim"1112f;"\
#ndim"1111:\n\t"\
 KERNEL_k1m1n##ndim\
 "decq %5; jnz "#ndim"1111b;"\
#ndim"1112:\n\t"\
 SAVE_m1n##ndim
/* Top-level asm driver for one ndim-column block (double precision).
   Saves b_ptr/k/m in r14/r13/r11; r12 = k<<5 = byte length of one packed
   2-column B panel (k * 2 complex doubles * 16 bytes).  Sweeps m with
   the widest kernel first (4, then 2, 1 rows).  After the asm, rewinds
   a_ptr and advances b_ptr/c_ptr to the next column block. */
#define COMPUTE(ndim) {\
 b_pref = b_ptr + ndim * K *2;\
 __asm__ __volatile__ (\
 "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\
 "cmpq $4,%7; jb "#ndim"9992f;"\
#ndim"9991:\n\t"\
 COMPUTE_m4(ndim)\
 "subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\
#ndim"9992:\n\t"\
 "cmpq $2,%7; jb "#ndim"9993f;"\
 COMPUTE_m2(ndim) "subq $2,%7;"\
#ndim"9993:\n\t"\
 "testq %7,%7; jz "#ndim"9994f;"\
 COMPUTE_m1(ndim)\
#ndim"9994:\n\t"\
 "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
 :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
 ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
 "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
 a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
}
| int __attribute__ ((noinline)) | |||
| CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC) | |||
| { | |||
| if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; | |||
| int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; | |||
| #if A_CONJ == B_CONJ | |||
| double const_val[2] = {-alphar, -alphai}; | |||
| #else | |||
| double const_val[2] = {alphar, alphai}; | |||
| #endif | |||
| int64_t M = (int64_t)m, K = (int64_t)k; | |||
| BLASLONG n_count = n; | |||
| double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; | |||
| for(;n_count>5;n_count-=6) COMPUTE(6) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) COMPUTE(2) | |||
| if(n_count>0) COMPUTE(1) | |||
| return 0; | |||
| } | |||
| @@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_P 768 | |||
| #define DGEMM_DEFAULT_P 512 | |||
| #define CGEMM_DEFAULT_P 384 | |||
| #define ZGEMM_DEFAULT_P 256 | |||
| #define CGEMM_DEFAULT_P 256 | |||
| #define ZGEMM_DEFAULT_P 192 | |||
| #ifdef WINDOWS_ABI | |||
| #define SGEMM_DEFAULT_Q 320 | |||
| @@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_Q 384 | |||
| #define DGEMM_DEFAULT_Q 256 | |||
| #endif | |||
| #define CGEMM_DEFAULT_Q 192 | |||
| #define ZGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 256 | |||
| #define ZGEMM_DEFAULT_Q 192 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| #define DGEMM_DEFAULT_R 13824 | |||
| @@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define XGEMM_DEFAULT_R xgemm_r | |||
| #define XGEMM_DEFAULT_Q 128 | |||
| #define CGEMM3M_DEFAULT_UNROLL_N 8 | |||
| #define CGEMM3M_DEFAULT_UNROLL_M 4 | |||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_N 8 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM3M_DEFAULT_P 448 | |||
| #define CGEMM3M_DEFAULT_P 320 | |||
| #define ZGEMM3M_DEFAULT_P 224 | |||
| #define XGEMM3M_DEFAULT_P 112 | |||
| #define CGEMM3M_DEFAULT_Q 224 | |||
| #define CGEMM3M_DEFAULT_Q 320 | |||
| #define ZGEMM3M_DEFAULT_Q 224 | |||
| #define XGEMM3M_DEFAULT_Q 224 | |||
| #define CGEMM3M_DEFAULT_R 12288 | |||
| @@ -1571,8 +1571,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_P 768 | |||
| #define DGEMM_DEFAULT_P 512 | |||
| #define CGEMM_DEFAULT_P 384 | |||
| #define ZGEMM_DEFAULT_P 256 | |||
| #define CGEMM_DEFAULT_P 256 | |||
| #define ZGEMM_DEFAULT_P 192 | |||
| #ifdef WINDOWS_ABI | |||
| #define SGEMM_DEFAULT_Q 320 | |||
| @@ -1581,8 +1581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_Q 384 | |||
| #define DGEMM_DEFAULT_Q 256 | |||
| #endif | |||
| #define CGEMM_DEFAULT_Q 192 | |||
| #define ZGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 256 | |||
| #define ZGEMM_DEFAULT_Q 192 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| #define DGEMM_DEFAULT_R 13824 | |||
| @@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define XGEMM_DEFAULT_R xgemm_r | |||
| #define XGEMM_DEFAULT_Q 128 | |||
| #define CGEMM3M_DEFAULT_UNROLL_N 8 | |||
| #define CGEMM3M_DEFAULT_UNROLL_M 4 | |||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_N 8 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM3M_DEFAULT_P 448 | |||
| #define CGEMM3M_DEFAULT_P 320 | |||
| #define ZGEMM3M_DEFAULT_P 224 | |||
| #define XGEMM3M_DEFAULT_P 112 | |||
| #define CGEMM3M_DEFAULT_Q 224 | |||
| #define CGEMM3M_DEFAULT_Q 320 | |||
| #define ZGEMM3M_DEFAULT_Q 224 | |||
| #define XGEMM3M_DEFAULT_Q 224 | |||
| #define CGEMM3M_DEFAULT_R 12288 | |||