workaround for sandybridge zgemm kernel (tags/v0.2.14^2)
| @@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system | |||||
| #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | ||||
| # ACML custom | # ACML custom | ||||
| ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib | |||||
| LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | |||||
| #ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib | |||||
| #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | |||||
| # ACML 6.1 custom | |||||
| ACML=/home/saar/acml6.1/gfortran64_mp/lib | |||||
| LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm | |||||
| # Atlas Ubuntu | # Atlas Ubuntu | ||||
| #ATLAS=/usr/lib/atlas-base | #ATLAS=/usr/lib/atlas-base | ||||
| @@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *x, *y; | FLOAT *x, *y; | ||||
| FLOAT alpha[2] = { 2.0, 2.0 }; | FLOAT alpha[2] = { 2.0, 2.0 }; | ||||
| @@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){ | |||||
| } | } | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| char *trans[] = {"T", "N"}; | char *trans[] = {"T", "N"}; | ||||
| @@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *x, *y; | FLOAT *x, *y; | ||||
| FLOAT result; | FLOAT result; | ||||
| @@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; | FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; | ||||
| FLOAT wkopt[4]; | FLOAT wkopt[4]; | ||||
| @@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b, *c; | FLOAT *a, *b, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| FLOAT beta [] = {1.0, 1.0}; | FLOAT beta [] = {1.0, 1.0}; | ||||
| char trans='N'; | char trans='N'; | ||||
| blasint m, i, j; | |||||
| blasint m, n, i, j; | |||||
| int loops = 1; | int loops = 1; | ||||
| int has_param_n=0; | |||||
| int l; | int l; | ||||
| char *p; | char *p; | ||||
| @@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){ | |||||
| if ( p != NULL ) | if ( p != NULL ) | ||||
| loops = atoi(p); | loops = atoi(p); | ||||
| if ((p = getenv("OPENBLAS_PARAM_N"))) { | |||||
| n = atoi(p); | |||||
| has_param_n=1; | |||||
| } | |||||
| #ifdef linux | #ifdef linux | ||||
| srandom(getpid()); | srandom(getpid()); | ||||
| @@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){ | |||||
| timeg=0; | timeg=0; | ||||
| fprintf(stderr, " %6d : ", (int)m); | |||||
| if ( has_param_n == 1 && n <= m ) | |||||
| n=n; | |||||
| else | |||||
| n=m; | |||||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)n); | |||||
| for (l=0; l<loops; l++) | for (l=0; l<loops; l++) | ||||
| { | { | ||||
| @@ -189,7 +202,7 @@ int MAIN__(int argc, char *argv[]){ | |||||
| gettimeofday( &start, (struct timezone *)0); | gettimeofday( &start, (struct timezone *)0); | ||||
| GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); | |||||
| GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m ); | |||||
| gettimeofday( &stop, (struct timezone *)0); | gettimeofday( &stop, (struct timezone *)0); | ||||
| @@ -202,11 +215,11 @@ int MAIN__(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, | fprintf(stderr, | ||||
| " %10.2f MFlops\n", | " %10.2f MFlops\n", | ||||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6); | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6); | |||||
| } | } | ||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b, *c; | FLOAT *a, *b, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a,*work; | FLOAT *a,*work; | ||||
| FLOAT wkopt[4]; | FLOAT wkopt[4]; | ||||
| @@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b, *c; | FLOAT *a, *b, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b, *c; | FLOAT *a, *b, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *c; | FLOAT *a, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b; | FLOAT *a, *b; | ||||
| blasint *ipiv; | blasint *ipiv; | ||||
| @@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| char *trans[] = {"T", "N"}; | char *trans[] = {"T", "N"}; | ||||
| @@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b, *c; | FLOAT *a, *b, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b, *c; | FLOAT *a, *b, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *c; | FLOAT *a, *c; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b; | FLOAT *a, *b; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||||
| #endif | #endif | ||||
| int MAIN__(int argc, char *argv[]){ | |||||
| int main(int argc, char *argv[]){ | |||||
| FLOAT *a, *b; | FLOAT *a, *b; | ||||
| FLOAT alpha[] = {1.0, 1.0}; | FLOAT alpha[] = {1.0, 1.0}; | ||||
| @@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){ | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||||
| @@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||||
| ZGEMMINCOPY = zgemm_ncopy_1.S | |||||
| ZGEMMITCOPY = zgemm_tcopy_1.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | ||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | ||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | ||||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | ||||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | ||||
| @@ -1092,18 +1092,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro INIT4x1 | .macro INIT4x1 | ||||
| vxorpd %xmm4 , %xmm4 , %xmm4 | |||||
| vxorpd %xmm5 , %xmm5 , %xmm5 | |||||
| vxorpd %ymm4 , %ymm4 , %ymm4 | |||||
| vxorpd %ymm5 , %ymm5 , %ymm5 | |||||
| vxorpd %ymm6 , %ymm6 , %ymm6 | |||||
| vxorpd %ymm7 , %ymm7 , %ymm7 | |||||
| .endm | |||||
| .macro KERNEL4x1 | |||||
| vbroadcastsd -12 * SIZE(BO), %ymm0 | |||||
| vbroadcastsd -11 * SIZE(BO), %ymm1 | |||||
| vbroadcastsd -10 * SIZE(BO), %ymm2 | |||||
| vbroadcastsd -9 * SIZE(BO), %ymm3 | |||||
| vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 | |||||
| vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 | |||||
| vbroadcastsd -8 * SIZE(BO), %ymm0 | |||||
| vbroadcastsd -7 * SIZE(BO), %ymm1 | |||||
| vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 | |||||
| vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 | |||||
| vbroadcastsd -6 * SIZE(BO), %ymm2 | |||||
| vbroadcastsd -5 * SIZE(BO), %ymm3 | |||||
| vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 | |||||
| vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 | |||||
| vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 | |||||
| vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 | |||||
| addq $ 8 *SIZE, BO | |||||
| addq $ 32*SIZE, AO | |||||
| .endm | .endm | ||||
| .macro KERNEL4x1_SUB | .macro KERNEL4x1_SUB | ||||
| vmovddup -12 * SIZE(BO), %xmm2 | |||||
| vmovups -16 * SIZE(AO), %xmm0 | |||||
| vmovups -14 * SIZE(AO), %xmm1 | |||||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | |||||
| vfmadd231pd %xmm1 ,%xmm2 , %xmm5 | |||||
| vbroadcastsd -12 * SIZE(BO), %ymm2 | |||||
| vmovups -16 * SIZE(AO), %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm4 | |||||
| addq $ 1*SIZE, BO | addq $ 1*SIZE, BO | ||||
| addq $ 4*SIZE, AO | addq $ 4*SIZE, AO | ||||
| @@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| vmovddup ALPHA, %xmm0 | |||||
| vbroadcastsd ALPHA, %ymm0 | |||||
| vmulpd %xmm0 , %xmm4 , %xmm4 | |||||
| vmulpd %xmm0 , %xmm5 , %xmm5 | |||||
| vaddpd %ymm4,%ymm5, %ymm4 | |||||
| vaddpd %ymm6,%ymm7, %ymm6 | |||||
| vaddpd %ymm4,%ymm6, %ymm4 | |||||
| vmulpd %ymm0 , %ymm4 , %ymm4 | |||||
| #if !defined(TRMMKERNEL) | #if !defined(TRMMKERNEL) | ||||
| vaddpd (CO1) , %xmm4, %xmm4 | |||||
| vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 | |||||
| vaddpd (CO1) , %ymm4, %ymm4 | |||||
| #endif | #endif | ||||
| vmovups %xmm4 , (CO1) | |||||
| vmovups %xmm5 , 2 * SIZE(CO1) | |||||
| vmovups %ymm4 , (CO1) | |||||
| addq $ 4*SIZE, CO1 | addq $ 4*SIZE, CO1 | ||||
| .endm | .endm | ||||
| @@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L1_12: | .L1_12: | ||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1 | |||||
| dec %rax | dec %rax | ||||
| jne .L1_12 | jne .L1_12 | ||||
| @@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L1_12: | .L1_12: | ||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1_SUB | |||||
| KERNEL4x1 | |||||
| dec %rax | dec %rax | ||||
| jne .L1_12 | jne .L1_12 | ||||
| @@ -120,7 +120,7 @@ | |||||
| REAL RZERO | REAL RZERO | ||||
| PARAMETER ( RZERO = 0.0 ) | PARAMETER ( RZERO = 0.0 ) | ||||
| INTEGER NMAX, INCMAX | INTEGER NMAX, INCMAX | ||||
| PARAMETER ( NMAX = 65, INCMAX = 2 ) | |||||
| PARAMETER ( NMAX = 128, INCMAX = 2 ) | |||||
| INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | ||||
| $ NALMAX = 7, NBEMAX = 7 ) | $ NALMAX = 7, NBEMAX = 7 ) | ||||
| @@ -102,7 +102,7 @@ | |||||
| REAL RZERO | REAL RZERO | ||||
| PARAMETER ( RZERO = 0.0 ) | PARAMETER ( RZERO = 0.0 ) | ||||
| INTEGER NMAX | INTEGER NMAX | ||||
| PARAMETER ( NMAX = 65 ) | |||||
| PARAMETER ( NMAX = 128 ) | |||||
| INTEGER NIDMAX, NALMAX, NBEMAX | INTEGER NIDMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | ||||
| * .. Local Scalars .. | * .. Local Scalars .. | ||||
| @@ -117,7 +117,7 @@ | |||||
| DOUBLE PRECISION ZERO, ONE | DOUBLE PRECISION ZERO, ONE | ||||
| PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) | PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) | ||||
| INTEGER NMAX, INCMAX | INTEGER NMAX, INCMAX | ||||
| PARAMETER ( NMAX = 65, INCMAX = 2 ) | |||||
| PARAMETER ( NMAX = 128, INCMAX = 2 ) | |||||
| INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | ||||
| $ NALMAX = 7, NBEMAX = 7 ) | $ NALMAX = 7, NBEMAX = 7 ) | ||||
| @@ -97,7 +97,7 @@ | |||||
| DOUBLE PRECISION ZERO, ONE | DOUBLE PRECISION ZERO, ONE | ||||
| PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) | PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) | ||||
| INTEGER NMAX | INTEGER NMAX | ||||
| PARAMETER ( NMAX = 65 ) | |||||
| PARAMETER ( NMAX = 128 ) | |||||
| INTEGER NIDMAX, NALMAX, NBEMAX | INTEGER NIDMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | ||||
| * .. Local Scalars .. | * .. Local Scalars .. | ||||
| @@ -117,7 +117,7 @@ | |||||
| REAL ZERO, ONE | REAL ZERO, ONE | ||||
| PARAMETER ( ZERO = 0.0, ONE = 1.0 ) | PARAMETER ( ZERO = 0.0, ONE = 1.0 ) | ||||
| INTEGER NMAX, INCMAX | INTEGER NMAX, INCMAX | ||||
| PARAMETER ( NMAX = 65, INCMAX = 2 ) | |||||
| PARAMETER ( NMAX = 128, INCMAX = 2 ) | |||||
| INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | ||||
| $ NALMAX = 7, NBEMAX = 7 ) | $ NALMAX = 7, NBEMAX = 7 ) | ||||
| @@ -97,7 +97,7 @@ | |||||
| REAL ZERO, ONE | REAL ZERO, ONE | ||||
| PARAMETER ( ZERO = 0.0, ONE = 1.0 ) | PARAMETER ( ZERO = 0.0, ONE = 1.0 ) | ||||
| INTEGER NMAX | INTEGER NMAX | ||||
| PARAMETER ( NMAX = 65 ) | |||||
| PARAMETER ( NMAX = 128 ) | |||||
| INTEGER NIDMAX, NALMAX, NBEMAX | INTEGER NIDMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | ||||
| * .. Local Scalars .. | * .. Local Scalars .. | ||||
| @@ -121,7 +121,7 @@ | |||||
| DOUBLE PRECISION RZERO | DOUBLE PRECISION RZERO | ||||
| PARAMETER ( RZERO = 0.0D0 ) | PARAMETER ( RZERO = 0.0D0 ) | ||||
| INTEGER NMAX, INCMAX | INTEGER NMAX, INCMAX | ||||
| PARAMETER ( NMAX = 65, INCMAX = 2 ) | |||||
| PARAMETER ( NMAX = 128, INCMAX = 2 ) | |||||
| INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, | ||||
| $ NALMAX = 7, NBEMAX = 7 ) | $ NALMAX = 7, NBEMAX = 7 ) | ||||
| @@ -104,7 +104,7 @@ | |||||
| DOUBLE PRECISION RZERO | DOUBLE PRECISION RZERO | ||||
| PARAMETER ( RZERO = 0.0D0 ) | PARAMETER ( RZERO = 0.0D0 ) | ||||
| INTEGER NMAX | INTEGER NMAX | ||||
| PARAMETER ( NMAX = 65 ) | |||||
| PARAMETER ( NMAX = 128 ) | |||||
| INTEGER NIDMAX, NALMAX, NBEMAX | INTEGER NIDMAX, NALMAX, NBEMAX | ||||
| PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) | ||||
| * .. Local Scalars .. | * .. Local Scalars .. | ||||
| @@ -1,11 +1,11 @@ | |||||
| SEP: Data file for testing Symmetric Eigenvalue Problem routines | SEP: Data file for testing Symmetric Eigenvalue Problem routines | ||||
| 6 Number of values of N | |||||
| 0 1 2 3 5 20 Values of N (dimension) | |||||
| 8 Number of values of N | |||||
| 0 1 2 3 5 19 20 21 Values of N (dimension) | |||||
| 5 Number of values of NB | 5 Number of values of NB | ||||
| 1 3 3 3 10 Values of NB (blocksize) | 1 3 3 3 10 Values of NB (blocksize) | ||||
| 2 2 2 2 2 Values of NBMIN (minimum blocksize) | 2 2 2 2 2 Values of NBMIN (minimum blocksize) | ||||
| 1 0 5 9 1 Values of NX (crossover point) | 1 0 5 9 1 Values of NX (crossover point) | ||||
| 60.0 Threshold value | |||||
| 160.0 Threshold value | |||||
| T Put T to test the LAPACK routines | T Put T to test the LAPACK routines | ||||
| T Put T to test the driver routines | T Put T to test the driver routines | ||||
| T Put T to test the error exits | T Put T to test the error exits | ||||
| @@ -1129,7 +1129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | #define DGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | #define QGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | #define CGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | #define XGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||