| @@ -3442,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) | |||
| # Delete files matching *.goto, *.mkl, *.acml, *.atlas, *.veclib and *.essl, plus smallscaling; -f keeps rm from failing when nothing matches. | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling | |||
| include $(TOPDIR)/Makefile.tail | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -1,133 +1,133 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind AMAX to the routine name wrapped by BLASFUNC, chosen by whether
   COMPLEX and/or DOUBLE are defined for this build. */
#undef AMAX

#if defined(COMPLEX) && defined(DOUBLE)
#define AMAX BLASFUNC(dzamax)
#elif defined(COMPLEX)
#define AMAX BLASFUNC(scamax)
#elif defined(DOUBLE)
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMAX(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind AMAX to the routine name wrapped by BLASFUNC, chosen by whether
   COMPLEX and/or DOUBLE are defined for this build. */
#undef AMAX

#if defined(COMPLEX) && defined(DOUBLE)
#define AMAX BLASFUNC(dzamax)
#elif defined(COMPLEX)
#define AMAX BLASFUNC(scamax)
#elif defined(DOUBLE)
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMAX(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,137 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind AMIN to the routine name wrapped by BLASFUNC, chosen by whether
   COMPLEX and/or DOUBLE are defined for this build. */
#undef AMIN

#if defined(COMPLEX) && defined(DOUBLE)
#define AMIN BLASFUNC(dzamin)
#elif defined(COMPLEX)
#define AMIN BLASFUNC(scamin)
#elif defined(DOUBLE)
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMIN(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind AMIN to the routine name wrapped by BLASFUNC, chosen by whether
   COMPLEX and/or DOUBLE are defined for this build. */
#undef AMIN

#if defined(COMPLEX) && defined(DOUBLE)
#define AMIN BLASFUNC(dzamin)
#elif defined(COMPLEX)
#define AMIN BLASFUNC(scamin)
#elif defined(DOUBLE)
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMIN(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,134 +1,134 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind HBMV to the routine name wrapped by BLASFUNC: zhbmv when DOUBLE is
   defined, chbmv otherwise. */
#undef HBMV

#ifndef DOUBLE
#define HBMV BLASFUNC(chbmv)
#else
#define HBMV BLASFUNC(zhbmv)
#endif
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| blasint k = 1; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_K"))) k = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, uplo, k, inc_x, inc_y, loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind HBMV to the routine name wrapped by BLASFUNC: zhbmv when DOUBLE is
   defined, chbmv otherwise. */
#undef HBMV

#ifndef DOUBLE
#define HBMV BLASFUNC(chbmv)
#else
#define HBMV BLASFUNC(zhbmv)
#endif
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| blasint k = 1; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_K"))) k = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, uplo, k, inc_x, inc_y, loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,133 +1,133 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
/* Bind HPMV to the routine name wrapped by BLASFUNC: zhpmv when DOUBLE is
   defined, chpmv otherwise. */
#undef HPMV

#ifndef DOUBLE
#define HPMV BLASFUNC(chpmv)
#else
#define HPMV BLASFUNC(zhpmv)
#endif
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HPMV | |||
| #ifdef DOUBLE | |||
| #define HPMV BLASFUNC(zhpmv) | |||
| #else | |||
| #define HPMV BLASFUNC(chpmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,120 +1,120 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IAMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(izamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(icamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(idamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(isamin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IAMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(izamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(icamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(idamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(isamin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,114 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMAX BLASFUNC(idmax) | |||
| #else | |||
| #define IMAX BLASFUNC(ismax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMAX BLASFUNC(idmax) | |||
| #else | |||
| #define IMAX BLASFUNC(ismax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,114 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMIN BLASFUNC(idmin) | |||
| #else | |||
| #define IMIN BLASFUNC(ismin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMIN BLASFUNC(idmin) | |||
| #else | |||
| #define IMIN BLASFUNC(ismin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,113 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMAX BLASFUNC(dmax) | |||
| #else | |||
| #define NAMAX BLASFUNC(smax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMAX BLASFUNC(dmax) | |||
| #else | |||
| #define NAMAX BLASFUNC(smax) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for NAMAX. | |||
|  * | |||
|  * For every vector length m in [from, to] (advancing by step) the vector is | |||
|  * refilled with pseudo-random values in [-0.5, 0.5] before each of the | |||
|  * `loops` timed calls, and the averaged result is written to stderr. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS sets the repetition count, OPENBLAS_INCX the stride. | |||
|  * Returns 0; exits with status 1 when the vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x is a blasint, which need not be an int; cast it to match %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   /* Sized for the largest length so one buffer serves every iteration. */ | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       /* Only the call itself sits between begin() and end(). */ | |||
|       begin(); | |||
|       NAMAX (&m, x, &inc_x); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     /* The first figure is COMPSIZE * sizeof(FLOAT) * m / time * 1e-6, printed under the MFlops label. */ | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops %10.6f sec\n", | |||
|       COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(x); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,113 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMIN BLASFUNC(dmin) | |||
| #else | |||
| #define NAMIN BLASFUNC(smin) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for NAMIN. | |||
|  * | |||
|  * For every vector length m in [from, to] (advancing by step) the vector is | |||
|  * refilled with pseudo-random values in [-0.5, 0.5] before each of the | |||
|  * `loops` timed calls, and the averaged result is written to stderr. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS sets the repetition count, OPENBLAS_INCX the stride. | |||
|  * Returns 0; exits with status 1 when the vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x is a blasint, which need not be an int; cast it to match %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   /* Sized for the largest length so one buffer serves every iteration. */ | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       /* Only the call itself sits between begin() and end(). */ | |||
|       begin(); | |||
|       NAMIN (&m, x, &inc_x); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     /* The first figure is COMPSIZE * sizeof(FLOAT) * m / time * 1e-6, printed under the MFlops label. */ | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops %10.6f sec\n", | |||
|       COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(x); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMIN BLASFUNC(dmin) | |||
| #else | |||
| #define NAMIN BLASFUNC(smin) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for NAMIN. | |||
|  * | |||
|  * For every vector length m in [from, to] (advancing by step) the vector is | |||
|  * refilled with pseudo-random values in [-0.5, 0.5] before each of the | |||
|  * `loops` timed calls, and the averaged result is written to stderr. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS sets the repetition count, OPENBLAS_INCX the stride. | |||
|  * Returns 0; exits with status 1 when the vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x is a blasint, which need not be an int; cast it to match %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   /* Sized for the largest length so one buffer serves every iteration. */ | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       /* Only the call itself sits between begin() and end(). */ | |||
|       begin(); | |||
|       NAMIN (&m, x, &inc_x); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     /* The first figure is COMPSIZE * sizeof(FLOAT) * m / time * 1e-6, printed under the MFlops label. */ | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops %10.6f sec\n", | |||
|       COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(x); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,138 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef ROTM | |||
| #ifdef DOUBLE | |||
| #define ROTM BLASFUNC(drotm) | |||
| #else | |||
| #define ROTM BLASFUNC(srotm) | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for ROTM. | |||
|  * | |||
|  * For every vector length m in [from, to] (advancing by step) both vectors | |||
|  * are filled once with pseudo-random values in [-0.5, 0.5], ROTM is timed | |||
|  * `loops` times on them, and the averaged result is written to stderr. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS sets the repetition count, OPENBLAS_INCX and | |||
|  *       OPENBLAS_INCY the strides of x and y. | |||
|  * Returns 0; exits with status 1 when a vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]) | |||
| { | |||
|   FLOAT *x, *y; | |||
|   blasint m, i; | |||
|   blasint inc_x = 1, inc_y = 1; | |||
|   FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1, timeg; | |||
|   argc--; | |||
|   argv++; | |||
|   if (argc > 0) { | |||
|     from = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     to = MAX(atol(*argv), from); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     step = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) | |||
|     loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) | |||
|     inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) | |||
|     inc_y = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) | |||
|     step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) | |||
|     loops = 1; | |||
|   /* inc_x and inc_y are blasint, which need not be int; cast them to match %d. */ | |||
|   fprintf( | |||
|     stderr, | |||
|     "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
|     from, to, step, (int)inc_x, (int)inc_y, loops); | |||
|   /* Both buffers are sized for the largest length and reused for every m. */ | |||
|   if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
|   if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for (m = from; m <= to; m += step) { | |||
|     timeg = 0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
|       x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
|       y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     /* x and y are not refilled between repetitions; each call works on the previous call's output. */ | |||
|     for (l = 0; l < loops; l++) { | |||
|       begin(); | |||
|       ROTM(&m, x, &inc_x, y, &inc_y, param); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, " %10.2f MFlops %10.6f sec\n", | |||
|             COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(y); | |||
|   free(x); | |||
|   return 0; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef ROTM | |||
| #ifdef DOUBLE | |||
| #define ROTM BLASFUNC(drotm) | |||
| #else | |||
| #define ROTM BLASFUNC(srotm) | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for ROTM. | |||
|  * | |||
|  * For every vector length m in [from, to] (advancing by step) both vectors | |||
|  * are filled once with pseudo-random values in [-0.5, 0.5], ROTM is timed | |||
|  * `loops` times on them, and the averaged result is written to stderr. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS sets the repetition count, OPENBLAS_INCX and | |||
|  *       OPENBLAS_INCY the strides of x and y. | |||
|  * Returns 0; exits with status 1 when a vector cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]) | |||
| { | |||
|   FLOAT *x, *y; | |||
|   blasint m, i; | |||
|   blasint inc_x = 1, inc_y = 1; | |||
|   FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1, timeg; | |||
|   argc--; | |||
|   argv++; | |||
|   if (argc > 0) { | |||
|     from = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     to = MAX(atol(*argv), from); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     step = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) | |||
|     loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) | |||
|     inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) | |||
|     inc_y = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) | |||
|     step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) | |||
|     loops = 1; | |||
|   /* inc_x and inc_y are blasint, which need not be int; cast them to match %d. */ | |||
|   fprintf( | |||
|     stderr, | |||
|     "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
|     from, to, step, (int)inc_x, (int)inc_y, loops); | |||
|   /* Both buffers are sized for the largest length and reused for every m. */ | |||
|   if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
|   if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for (m = from; m <= to; m += step) { | |||
|     timeg = 0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
|       x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
|       y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     /* x and y are not refilled between repetitions; each call works on the previous call's output. */ | |||
|     for (l = 0; l < loops; l++) { | |||
|       begin(); | |||
|       ROTM(&m, x, &inc_x, y, &inc_y, param); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, " %10.2f MFlops %10.6f sec\n", | |||
|             COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(y); | |||
|   free(x); | |||
|   return 0; | |||
| } | |||
| @@ -1,146 +1,146 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef SPMV | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(dspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(sspmv) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(zspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(cspmv) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for SPMV. | |||
|  * | |||
|  * For every order m in [from, to] (advancing by step) the matrix buffer is | |||
|  * filled once, the vectors x and y are refilled before each of the `loops` | |||
|  * timed calls, and the averaged rate is written to stderr. All values are | |||
|  * pseudo-random in [-0.5, 0.5]. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS (repetitions), OPENBLAS_INCX / OPENBLAS_INCY (strides), | |||
|  *       OPENBLAS_UPLO (first character is passed as the uplo argument). | |||
|  * Returns 0; exits with status 1 when a buffer cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *a, *x, *y; | |||
|   FLOAT alpha[] = {1.0, 1.0}; | |||
|   FLOAT beta [] = {1.0, 1.0}; | |||
|   char uplo='L'; | |||
|   blasint m, i, j; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x and inc_y are blasint, which need not be int; cast them to match %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,(int)inc_x,(int)inc_y,loops); | |||
|   /* All three buffers are sized for the largest order and reused for every m. */ | |||
|   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
|     /* Fills m * m * COMPSIZE entries; the index is widened to long before multiplying. */ | |||
|     for(j = 0; j < m; j++){ | |||
|       for(i = 0; i < m * COMPSIZE; i++){ | |||
|         a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|     } | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       /* Only the call itself sits between begin() and end(). */ | |||
|       begin(); | |||
|       SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops\n", | |||
|       COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   free(y); | |||
|   free(x); | |||
|   free(a); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef SPMV | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(dspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(sspmv) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(zspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(cspmv) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for SPMV. | |||
|  * | |||
|  * For every order m in [from, to] (advancing by step) the matrix buffer is | |||
|  * filled once, the vectors x and y are refilled before each of the `loops` | |||
|  * timed calls, and the averaged rate is written to stderr. All values are | |||
|  * pseudo-random in [-0.5, 0.5]. | |||
|  * | |||
|  * argv: optional positional arguments from, to, step (defaults 1, 200, 1). | |||
|  * env:  OPENBLAS_LOOPS (repetitions), OPENBLAS_INCX / OPENBLAS_INCY (strides), | |||
|  *       OPENBLAS_UPLO (first character is passed as the uplo argument). | |||
|  * Returns 0; exits with status 1 when a buffer cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *a, *x, *y; | |||
|   FLOAT alpha[] = {1.0, 1.0}; | |||
|   FLOAT beta [] = {1.0, 1.0}; | |||
|   char uplo='L'; | |||
|   blasint m, i, j; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
|   /* A step below 1 would keep the size loop from ever passing `to`. */ | |||
|   if (step < 1) step = 1; | |||
|   /* The accumulated time is divided by loops, so it must be at least 1. */ | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x and inc_y are blasint, which need not be int; cast them to match %d. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,(int)inc_x,(int)inc_y,loops); | |||
|   /* All three buffers are sized for the largest order and reused for every m. */ | |||
|   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
|     /* Fills m * m * COMPSIZE entries; the index is widened to long before multiplying. */ | |||
|     for(j = 0; j < m; j++){ | |||
|       for(i = 0; i < m * COMPSIZE; i++){ | |||
|         a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|     } | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       /* Only the call itself sits between begin() and end(). */ | |||
|       begin(); | |||
|       SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
|       end(); | |||
|       /* NOTE(review): getsec() presumably reports the begin()/end() interval in seconds - confirm in bench.h. */ | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops\n", | |||
|       COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   free(y); | |||
|   free(x); | |||
|   free(a); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -318,6 +318,8 @@ set(CSRC | |||
| lapacke_clacn2.c | |||
| lapacke_clag2z.c | |||
| lapacke_clag2z_work.c | |||
| lapacke_clangb.c | |||
| lapacke_clangb_work.c | |||
| lapacke_clange.c | |||
| lapacke_clange_work.c | |||
| lapacke_clanhe.c | |||
| @@ -803,6 +805,8 @@ set(DSRC | |||
| lapacke_dlag2s_work.c | |||
| lapacke_dlamch.c | |||
| lapacke_dlamch_work.c | |||
| lapacke_dlangb.c | |||
| lapacke_dlangb_work.c | |||
| lapacke_dlange.c | |||
| lapacke_dlange_work.c | |||
| lapacke_dlansy.c | |||
| @@ -1381,6 +1385,8 @@ set(SSRC | |||
| lapacke_slag2d_work.c | |||
| lapacke_slamch.c | |||
| lapacke_slamch_work.c | |||
| lapacke_slangb.c | |||
| lapacke_slangb_work.c | |||
| lapacke_slange.c | |||
| lapacke_slange_work.c | |||
| lapacke_slansy.c | |||
| @@ -2089,6 +2095,8 @@ set(ZSRC | |||
| lapacke_zlacrm_work.c | |||
| lapacke_zlag2c.c | |||
| lapacke_zlag2c_work.c | |||
| lapacke_zlangb.c | |||
| lapacke_zlangb_work.c | |||
| lapacke_zlange.c | |||
| lapacke_zlange_work.c | |||
| lapacke_zlanhe.c | |||
| @@ -1,333 +1,333 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 /* the inner loop count is derived from it: I = M / 4 */ | |||
| #define N x1 /* the outer loop count is derived from it: J = N / 4 */ | |||
| #define A00 x2 /* source base pointer; copied to A01 and then moved past A04 by one LDA */ | |||
| #define LDA x3 /* distance between A01..A04; shifted left by 2 on entry to become a byte stride */ | |||
| #define B00 x4 /* destination pointer, advanced after every store */ | |||
| #define A01 x5 /* A01..A04: four source pointers spaced LDA bytes apart */ | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define I x9 /* inner loop counter */ | |||
| #define J x10 /* outer loop counter */ | |||
| #define TEMP1 x11 /* NOTE(review): TEMP1/TEMP2 are not referenced in the part of the file visible here */ | |||
| #define TEMP2 x12 | |||
| #define A_PREFETCH 2560 /* byte offset added to the source pointers in the prfm prefetches */ | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS /* Reserves 11 * 16 bytes below sp and stores d8-d17 and x18-x28 there. */ | |||
| add sp, sp, #-(11 * 16) /* move sp down by 176 bytes */ | |||
| stp d8, d9, [sp, #(0 * 16)] /* each stp fills one 16-byte slot with a register pair */ | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] /* single register: only the first 8 bytes of the last slot are written */ | |||
| .endm | |||
| .macro RESTORE_REGS /* Reloads d8-d17 and x18-x28 from the slots SAVE_REGS uses, then releases the 11 * 16 bytes. */ | |||
| ldp d8, d9, [sp, #(0 * 16)] /* same slot offsets as the stores in SAVE_REGS */ | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) /* move sp back up by 176 bytes; must come after the loads */ | |||
| .endm | |||
| .macro COPY4x4 /* Reads four 32-bit elements from each of A01..A04 and writes them to B00 transposed (64 bytes). */ | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] /* prefetch A_PREFETCH bytes ahead on every source pointer */ | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 /* 16 bytes from A01, pointer post-incremented */ | |||
| ins v8.s[0], v0.s[0] /* element k of this load goes to lane 0 of v(8+k) */ | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 /* A02 elements fill lane 1 of v8..v11 */ | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 /* A03 elements fill lane 2 of v8..v11 */ | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| ldr q3, [A04], #16 /* A04 elements fill lane 3 of v8..v11 */ | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] /* v8 now holds element 0 of all four sources, v9 element 1, ... */ | |||
| add B00, B00, #64 /* st1 has no writeback here, so advance B00 explicitly */ | |||
| .endm | |||
| .macro COPY1x4 /* Reads one 32-bit element from each of A01..A04 and writes the four to B00 in that order (16 bytes). */ | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01], #4 /* each source pointer is post-incremented by 4 bytes */ | |||
| ldr s1, [A02], #4 | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| stp s0, s1, [B00] /* first pair: the A01 and A02 elements */ | |||
| add B00, B00, #8 | |||
| stp s2, s3, [B00] /* second pair: the A03 and A04 elements */ | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x2: interleaves 4 32-bit elements from A01 with 4 from A02. | |||
| // Element k of A01 lands in lane 0 of v(8+k) and element k of A02 in lane 1; | |||
| // only the low two lanes of v8..v11 are stored (32 bytes), so lanes 2-3 are | |||
| // left untouched and never written out. Both sources advance by 16 bytes, | |||
| // B00 by 32. | |||
| .macro COPY4x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 // 4 elements from A01 -> lane 0 of v8..v11 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 // 4 elements from A02 -> lane 1 of v8..v11 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] // 4 registers x 8 bytes | |||
| add B00, B00, #32 | |||
| .endm | |||
| // COPY1x2: copies one 32-bit element from A01 and one from A02 (both pointers | |||
| // post-incremented by 4) to B00 as an adjacent pair, then advances B00 by 8. | |||
| .macro COPY1x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x1: straight 16-byte copy from A01 to B00; both pointers are | |||
| // post-incremented by 16. No reordering is needed with a single source. | |||
| .macro COPY4x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B00], #16 | |||
| .endm | |||
| // COPY1x1: copies a single 32-bit element from A01 to B00; both pointers are | |||
| // post-incremented by 4. | |||
| .macro COPY1x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Entry point. Inputs are M, N, A00, LDA and B00. The source is walked as N | |||
| // vectors spaced LDA elements apart, M elements each, and packed into B00: | |||
| // first in groups of four vectors, then one group of two (if N & 2), then a | |||
| // single vector (if N & 1). Within a group, M is consumed four elements at a | |||
| // time and the M & 3 leftovers one at a time. Always returns 0 in x0. | |||
| // NOTE(review): the local labels carry a dgemm_ prefix although every element | |||
| // handled here is 4 bytes wide (lsl #2, ldr s/q with .s lanes). | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| // Set up one read cursor per vector of this group; A00 moves on to the | |||
| // first vector of the next group. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 // leftover elements (M mod 4) | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| // tst (ANDS) clears V, and with these small masks the result is never | |||
| // negative, so each ble below is taken exactly when the tested bits are zero. | |||
| tst N, #3 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA // A00 -> vector after this pair | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 // leftover elements (M mod 4) | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Ldgemm_ncopy_L999 | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 // leftover elements (M mod 4) | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 // return value is always 0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| // Register assignments. M, N, A00, LDA and B00 are read before being written, | |||
| // so they are the inputs of the routine; the rest are scratch. | |||
| #define M x0 // element count per vector (drives the I loops) | |||
| #define N x1 // number of LDA-strided vectors (drives the J loop and the N & 3 tail) | |||
| #define A00 x2 // source cursor: start of the next group of vectors | |||
| #define LDA x3 // distance between vectors; scaled by 4 to a byte stride on entry | |||
| #define B00 x4 // destination write cursor | |||
| #define A01 x5 // read cursors for up to four vectors of the current group | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define I x9 // inner loop counter | |||
| #define J x10 // outer loop counter (groups of four vectors) | |||
| #define TEMP1 x11 // not referenced anywhere in this file | |||
| #define TEMP2 x12 // not referenced anywhere in this file | |||
| #define A_PREFETCH 2560 // byte offset used by every prfm in the COPY macros | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // SAVE_REGS: reserves an 11*16-byte area below sp and stores d8-d17 and | |||
| // x18-x28 into it; RESTORE_REGS reads the same slots back. | |||
| // NOTE(review): the code in this file only touches x0-x10, v0-v3 and v8-v11, | |||
| // so most of these saves look unnecessary - confirm before trimming. | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| // RESTORE_REGS: reloads d8-d17 and x18-x28 from the 11*16-byte save area at | |||
| // sp (the same slots SAVE_REGS stores to) and then pops that area. | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) // release the save area | |||
| .endm | |||
| // COPY4x4: transposes a 4x4 tile of 32-bit elements. | |||
| // Reads 16 bytes from each of A01..A04 (post-incrementing each pointer by 16), | |||
| // gathers element k of every source into lane order A01,A02,A03,A04 of v(8+k), | |||
| // then writes v8..v11 as 64 contiguous bytes at B00 and advances B00 by 64. | |||
| .macro COPY4x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] // prefetch A_PREFETCH bytes ahead of each source | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 // 4 elements from A01 -> lane 0 of v8..v11 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 // 4 elements from A02 -> lane 1 of v8..v11 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 // 4 elements from A03 -> lane 2 of v8..v11 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| ldr q3, [A04], #16 // 4 elements from A04 -> lane 3 of v8..v11 | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] // 4 registers x 16 bytes | |||
| add B00, B00, #64 | |||
| .endm | |||
| // COPY1x4: copies one 32-bit element from each of A01..A04 (each pointer is | |||
| // post-incremented by 4) and stores them back to back at B00 in the order | |||
| // A01, A02, A03, A04; B00 ends up advanced by 16 bytes in total. | |||
| .macro COPY1x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| stp s0, s1, [B00] // first pair (8 bytes) | |||
| add B00, B00, #8 | |||
| stp s2, s3, [B00] // second pair (8 bytes) | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x2: interleaves 4 32-bit elements from A01 with 4 from A02. | |||
| // Element k of A01 lands in lane 0 of v(8+k) and element k of A02 in lane 1; | |||
| // only the low two lanes of v8..v11 are stored (32 bytes), so lanes 2-3 are | |||
| // left untouched and never written out. Both sources advance by 16 bytes, | |||
| // B00 by 32. | |||
| .macro COPY4x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 // 4 elements from A01 -> lane 0 of v8..v11 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 // 4 elements from A02 -> lane 1 of v8..v11 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] // 4 registers x 8 bytes | |||
| add B00, B00, #32 | |||
| .endm | |||
| // COPY1x2: copies one 32-bit element from A01 and one from A02 (both pointers | |||
| // post-incremented by 4) to B00 as an adjacent pair, then advances B00 by 8. | |||
| .macro COPY1x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x1: straight 16-byte copy from A01 to B00; both pointers are | |||
| // post-incremented by 16. No reordering is needed with a single source. | |||
| .macro COPY4x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B00], #16 | |||
| .endm | |||
| // COPY1x1: copies a single 32-bit element from A01 to B00; both pointers are | |||
| // post-incremented by 4. | |||
| .macro COPY1x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Entry point. Inputs are M, N, A00, LDA and B00. The source is walked as N | |||
| // vectors spaced LDA elements apart, M elements each, and packed into B00: | |||
| // first in groups of four vectors, then one group of two (if N & 2), then a | |||
| // single vector (if N & 1). Within a group, M is consumed four elements at a | |||
| // time and the M & 3 leftovers one at a time. Always returns 0 in x0. | |||
| // NOTE(review): the local labels carry a dgemm_ prefix although every element | |||
| // handled here is 4 bytes wide (lsl #2, ldr s/q with .s lanes). | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| // Set up one read cursor per vector of this group; A00 moves on to the | |||
| // first vector of the next group. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 // leftover elements (M mod 4) | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| // tst (ANDS) clears V, and with these small masks the result is never | |||
| // negative, so each ble below is taken exactly when the tested bits are zero. | |||
| tst N, #3 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA // A00 -> vector after this pair | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 // leftover elements (M mod 4) | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Ldgemm_ncopy_L999 | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 // leftover elements (M mod 4) | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 // return value is always 0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -1,293 +1,293 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * Abdelrauf(quickwritereader@gmail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define alpha_r vs19 | |||
| #define alpha_i vs20 | |||
| #define save_permute_1 vs21 | |||
| #define permute_mask vs22 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| /* Kernel driver: builds a STACKSIZE-byte frame, saves f14-f31, r14-r31, | |||
| vs52-vs63 and LR, prepares alpha and the permute masks, runs the body pulled | |||
| in from cgemm_logic_power9.S, then restores everything at .L999 and returns. */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| /* f14-f31 -> offsets 0..136 */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* r31-r14 -> offsets 144..280 */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* vs52-vs63 -> offsets 288..464, 16 bytes each */ | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| /* LR (copied to r0 above) goes 16 bytes above the SP value seen on entry */ | |||
| std r0, FLINK_SAVE(SP) | |||
| /* LDC (and OFFSET for TRMM) are read from above this frame; FRAMESLOT is | |||
| defined outside this file - presumably the caller parameter area, confirm */ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| /* NOTE(review): slwi is a 32-bit shift that zeroes the upper word of LDC; | |||
| confirm the scaled LDC always fits in 32 bits */ | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| /* alpha_r is taken from vs1 and alpha_i from vs2; word 0 is then splatted */ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| /* Four 64-bit constants are built 16 bits at a time (lis/ori for the top | |||
| half, rldicr to shift it up by 32, oris/ori for the bottom half), with the | |||
| four sequences interleaved */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| /* mtvsrdd packs two GPRs into one 128-bit VSR (first operand = doubleword 0) */ | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| /* xxpermdi with the same source twice and selector 2 swaps the two doublewords */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| #include "cgemm_logic_power9.S" | |||
| .L999: | |||
| /* Common exit: reload everything saved in the prologue from the same offsets */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| /* saved return address back into LR, placed in the middle of the vector reloads */ | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * Abdelrauf(quickwritereader@gmail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define alpha_r vs19 | |||
| #define alpha_i vs20 | |||
| #define save_permute_1 vs21 | |||
| #define permute_mask vs22 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| /* Kernel driver: builds a STACKSIZE-byte frame, saves f14-f31, r14-r31, | |||
| vs52-vs63 and LR, prepares alpha and the permute masks, runs the body pulled | |||
| in from cgemm_logic_power9.S, then restores everything at .L999 and returns. */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| /* f14-f31 -> offsets 0..136 */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* r31-r14 -> offsets 144..280 */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* vs52-vs63 -> offsets 288..464, 16 bytes each */ | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| /* LR (copied to r0 above) goes 16 bytes above the SP value seen on entry */ | |||
| std r0, FLINK_SAVE(SP) | |||
| /* LDC (and OFFSET for TRMM) are read from above this frame; FRAMESLOT is | |||
| defined outside this file - presumably the caller parameter area, confirm */ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| /* NOTE(review): slwi is a 32-bit shift that zeroes the upper word of LDC; | |||
| confirm the scaled LDC always fits in 32 bits */ | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| /* alpha_r is taken from vs1 and alpha_i from vs2; word 0 is then splatted */ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| /* Four 64-bit constants are built 16 bits at a time (lis/ori for the top | |||
| half, rldicr to shift it up by 32, oris/ori for the bottom half), with the | |||
| four sequences interleaved */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| /* mtvsrdd packs two GPRs into one 128-bit VSR (first operand = doubleword 0) */ | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| /* xxpermdi with the same source twice and selector 2 swaps the two doublewords */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| #include "cgemm_logic_power9.S" | |||
| .L999: | |||
| /* Common exit: reload everything saved in the prologue from the same offsets */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| /* saved return address back into LR, placed in the middle of the vector reloads */ | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,233 +1,233 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /* | |||
| * crot_kernel_8: inline-assembly kernel that rotates two unit-stride float | |||
| * arrays in place, 16 floats (four 16-byte vectors) of each per pass: | |||
| * x = c * x + s * y | |||
| * y = c * y - s * x | |||
| * | |||
| * n - element count, reduced by 8 per pass; each array is accessed as 2*n | |||
| * floats. The first group is loaded before n is tested, and the visible | |||
| * caller only passes a positive multiple of 8 (n & -8). | |||
| * x, y - arrays that are both read and overwritten | |||
| * c, s - coefficients, each broadcast to all words of one vector register | |||
| * | |||
| * The loop is software pipelined: label "one" computes the current group while | |||
| * loading the next, and label "two" finishes the last group without any | |||
| * further loads. | |||
| */ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| __vector float t0; /* t0..t7: scratch vectors bound to the asm output operands x0..x7 */ | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // broadcast c to all words of vs36 | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // broadcast s to all words of vs37 | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load the first group of x into vs32..vs35 | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load the first group of y into vs48..vs51 | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" /* pointers now address the next group */ | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" /* account for the group just loaded */ | |||
| "ble two%= \n\t" /* nothing left to preload: go straight to the tail */ | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x for the next pass (vs32/vs33 are no longer needed) | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y for the next pass | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the group whose results are stored below */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" /* skip the stored group and the already preloaded one */ | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt one%= \n\t" | |||
| "two%=: \n\t" /* tail: same arithmetic on the last loaded group, no loads */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the group being stored */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : | |||
| [mem_x] "+m" (*(float (*)[2*n])x), /* declares 2*n floats of x as read and written by the asm */ | |||
| [mem_y] "+m" (*(float (*)[2*n])y), /* likewise for y */ | |||
| [temp_n] "+r" (n), | |||
| [x_ptr] "+&b" (x), | |||
| [y_ptr] "+&b" (y), | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), /* x1 and x2 are bound to t2 and t1; each operand still has its own variable */ | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), /* byte offsets used as index registers by the loads and stores */ | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| #endif | |||
| /* | |||
| * Rotate n two-FLOAT elements of x and y in place: | |||
| * x = c*x + s*y | |||
| * y = c*y - s*x | |||
| * Element k of x starts at x[2*k*inc_x]; likewise for y with inc_y. | |||
| * Does nothing when n <= 0. Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG done = 0; /* elements already rotated */ | |||
| BLASLONG px = 0, py = 0; /* FLOAT offsets of the current element in x and y */ | |||
| BLASLONG step_x, step_y; /* FLOAT distance between consecutive elements */ | |||
| FLOAT first, second; /* new x values, held until y has been updated */ | |||
| if ( n <= 0 ) | |||
| { | |||
| return 0; | |||
| } | |||
| if ( inc_x == 1 && inc_y == 1 ) | |||
| { | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /* Hand the leading multiple-of-8 part to the vector kernel. */ | |||
| BLASLONG bulk = n & -8; | |||
| if ( bulk > 0 ) | |||
| { | |||
| crot_kernel_8(bulk, x, y, c, s); | |||
| done = bulk; | |||
| px = 2*bulk; | |||
| py = px; | |||
| } | |||
| #endif | |||
| step_x = 2; | |||
| step_y = 2; | |||
| } | |||
| else | |||
| { | |||
| step_x = 2 * inc_x; | |||
| step_y = 2 * inc_y; | |||
| } | |||
| /* Scalar loop: the whole vector for general strides, the remainder for unit strides. */ | |||
| for ( ; done < n; done++ ) | |||
| { | |||
| first = c*x[px] + s*y[py] ; | |||
| second = c*x[px+1] + s*y[py+1] ; | |||
| y[py] = c*y[py] - s*x[px] ; | |||
| y[py+1] = c*y[py+1] - s*x[px+1] ; | |||
| x[px] = first ; | |||
| x[px+1] = second ; | |||
| px += step_x ; | |||
| py += step_y ; | |||
| } | |||
| return 0; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /* | |||
| * crot_kernel_8: inline-assembly kernel that rotates two unit-stride float | |||
| * arrays in place, 16 floats (four 16-byte vectors) of each per pass: | |||
| * x = c * x + s * y | |||
| * y = c * y - s * x | |||
| * | |||
| * n - element count, reduced by 8 per pass; each array is accessed as 2*n | |||
| * floats. The first group is loaded before n is tested, and the visible | |||
| * caller only passes a positive multiple of 8 (n & -8). | |||
| * x, y - arrays that are both read and overwritten | |||
| * c, s - coefficients, each broadcast to all words of one vector register | |||
| * | |||
| * The loop is software pipelined: label "one" computes the current group while | |||
| * loading the next, and label "two" finishes the last group without any | |||
| * further loads. | |||
| */ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| __vector float t0; /* t0..t7: scratch vectors bound to the asm output operands x0..x7 */ | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // broadcast c to all words of vs36 | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // broadcast s to all words of vs37 | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load the first group of x into vs32..vs35 | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load the first group of y into vs48..vs51 | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" /* pointers now address the next group */ | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" /* account for the group just loaded */ | |||
| "ble two%= \n\t" /* nothing left to preload: go straight to the tail */ | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x for the next pass (vs32/vs33 are no longer needed) | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y for the next pass | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the group whose results are stored below */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" /* skip the stored group and the already preloaded one */ | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt one%= \n\t" | |||
| "two%=: \n\t" /* tail: same arithmetic on the last loaded group, no loads */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the group being stored */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : | |||
| [mem_x] "+m" (*(float (*)[2*n])x), /* declares 2*n floats of x as read and written by the asm */ | |||
| [mem_y] "+m" (*(float (*)[2*n])y), /* likewise for y */ | |||
| [temp_n] "+r" (n), | |||
| [x_ptr] "+&b" (x), | |||
| [y_ptr] "+&b" (y), | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), /* x1 and x2 are bound to t2 and t1; each operand still has its own variable */ | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), /* byte offsets used as index registers by the loads and stores */ | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| #endif | |||
| /* | |||
| * Rotate n two-FLOAT elements of x and y in place: | |||
| * x = c*x + s*y | |||
| * y = c*y - s*x | |||
| * | |||
| * n - number of elements; n <= 0 returns immediately | |||
| * x, inc_x - first vector; consecutive elements are 2*inc_x FLOATs apart | |||
| * y, inc_y - second vector; consecutive elements are 2*inc_y FLOATs apart | |||
| * c, s - rotation coefficients applied to both FLOATs of every element | |||
| * | |||
| * Always returns 0. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| /* Use exactly the condition under which crot_kernel_8 is defined earlier in | |||
| * this file, so the call below can never name an undeclared function. */ | |||
| #if (defined(POWER8) || defined(POWER9) || defined(POWER10)) && (defined(__VEC__) || defined(__ALTIVEC__)) | |||
| BLASLONG n1 = n & -8; /* largest multiple of 8 that does not exceed n */ | |||
| if ( n1 > 0 ) | |||
| { | |||
| crot_kernel_8(n1, x, y, c, s); | |||
| i=n1; | |||
| ix=2*n1; | |||
| } | |||
| #endif | |||
| /* Scalar remainder; both strides are 1, so ix indexes x and y alike. */ | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[ix] ; | |||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||
| y[ix] = c*y[ix] - s*x[ix] ; | |||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += 2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| /* General strides: each element is two FLOATs, hence the factor 2. */ | |||
| inc_x2 = 2 * inc_x ; | |||
| inc_y2 = 2 * inc_y ; | |||
| while(i < n) | |||
| { | |||
| /* The new x values wait in temp until y has been computed from the old x. */ | |||
| temp[0] = c*x[ix] + s*y[iy] ; | |||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -1,249 +1,249 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* Aliases for the kernel code below; names it never mentions are presumably used by the included dgemm_macros_power9.S / dgemm_logic_power9.S */ | |||
| #define STACKSIZE (512 ) /* bytes subtracted from SP at entry and added back at exit */ | |||
| #define ALPHA_SP (296+192)(SP) /* frame slot that receives f1 at entry; alpha_r is later loaded from this address */ | |||
| #define FZERO (304+192)(SP) /* frame slot that receives a zero word at entry */ | |||
| #define M r3 /* M, N and K are each compared with 0 at entry */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 /* shifted left by BASE_SHIFT at entry */ | |||
| #define OFFSET r6 /* loaded from the stack at entry, TRMMKERNEL builds only */ | |||
| #define alpha_r vs18 /* loaded from the ALPHA_SP slot before the included logic runs */ | |||
| #define o0 0 | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define C4 r14 | |||
| #define o8 r15 /* o8, o16, o24, o32, o48: set to the constants 8, 16, 24, 32, 48 at entry */ | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 /* holds the address of the ALPHA_SP slot while alpha_r is loaded */ | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* set to 384 at entry */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM /* | |||
| * Kernel entry and exit code; the compute loops come from the included | |||
| * dgemm_logic_power9.S, which is not part of this file. | |||
| * Entry: allocate a STACKSIZE-byte frame, save f14-f31, r14-r31 and vs52-vs63 | |||
| * into it, store f1 and a zero word into the ALPHA_SP and FZERO slots, shift | |||
| * LDC, branch to .L999_H1 (not defined in this file's visible code) when M, N | |||
| * or K is <= 0, then set up the offset constants and alpha_r. | |||
| * Exit (.L999): set r3 to 0, restore every saved register, release the frame. | |||
| */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the frame */ | |||
| li r0, 0 /* zero, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* save r31 down to r14 at 144..280(SP) */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* save vs52-vs63 at 288..464(SP), 16 bytes apart */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| stfd f1, ALPHA_SP /* keep f1 in the frame so alpha_r can be loaded from memory below */ | |||
| stw r0, FZERO | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (word shift) */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* read from beyond this frame: STACKSIZE undoes the allocation above */ | |||
| #endif | |||
| cmpwi cr0, M, 0 /* word compares: leave via .L999_H1 when M, N or K is <= 0 */ | |||
| ble .L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 /* T1 = address of the ALPHA_SP slot */ | |||
| li PRE, 384 | |||
| li o8 , 8 | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 /* load the value saved from f1 into alpha_r */ | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: | |||
| addi r3, 0, 0 /* r3 = 0, presumably the return value */ | |||
| lfd f14, 0(SP) /* restore everything in the order it was saved */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* Aliases for the kernel code below; names it never mentions are presumably used by the included dgemm_macros_power9.S / dgemm_logic_power9.S */ | |||
| #define STACKSIZE (512 ) /* bytes subtracted from SP at entry and added back at exit */ | |||
| #define ALPHA_SP (296+192)(SP) /* frame slot that receives f1 at entry; alpha_r is later loaded from this address */ | |||
| #define FZERO (304+192)(SP) /* frame slot that receives a zero word at entry */ | |||
| #define M r3 /* M, N and K are each compared with 0 at entry */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 /* shifted left by BASE_SHIFT at entry */ | |||
| #define OFFSET r6 /* loaded from the stack at entry, TRMMKERNEL builds only */ | |||
| #define alpha_r vs18 /* loaded from the ALPHA_SP slot before the included logic runs */ | |||
| #define o0 0 | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define C4 r14 | |||
| #define o8 r15 /* o8, o16, o24, o32, o48: set to the constants 8, 16, 24, 32, 48 at entry */ | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 /* holds the address of the ALPHA_SP slot while alpha_r is loaded */ | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* set to 384 at entry */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM /* | |||
| * Kernel entry and exit code; the compute loops come from the included | |||
| * dgemm_logic_power9.S, which is not part of this file. | |||
| * Entry: allocate a STACKSIZE-byte frame, save f14-f31, r14-r31 and vs52-vs63 | |||
| * into it, store f1 and a zero word into the ALPHA_SP and FZERO slots, shift | |||
| * LDC, branch to .L999_H1 (not defined in this file's visible code) when M, N | |||
| * or K is <= 0, then set up the offset constants and alpha_r. | |||
| * Exit (.L999): set r3 to 0, restore every saved register, release the frame. | |||
| */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the frame */ | |||
| li r0, 0 /* zero, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* save r31 down to r14 at 144..280(SP) */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* save vs52-vs63 at 288..464(SP), 16 bytes apart */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| stfd f1, ALPHA_SP /* keep f1 in the frame so alpha_r can be loaded from memory below */ | |||
| stw r0, FZERO | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (word shift) */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* read from beyond this frame: STACKSIZE undoes the allocation above */ | |||
| #endif | |||
| cmpwi cr0, M, 0 /* word compares: leave via .L999_H1 when M, N or K is <= 0 */ | |||
| ble .L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 /* T1 = address of the ALPHA_SP slot */ | |||
| li PRE, 384 | |||
| li o8 , 8 | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 /* load the value saved from f1 into alpha_r */ | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: | |||
| addi r3, 0, 0 /* r3 = 0, presumably the return value */ | |||
| lfd f14, 0(SP) /* restore everything in the order it was saved */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,328 +1,328 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| /* ABS: scalar absolute value for the build precision (fabs when DOUBLE is defined, fabsf otherwise). */ | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /* CABS1(x,i): ABS(x[i]) + ABS(x[i+1]). The arguments and the whole expansion are | |||
| * parenthesized so the macro keeps its meaning inside larger expressions | |||
| * (e.g. 2*CABS1(x,i)) and with expression arguments. */ | |||
| #define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1])) | |||
| #define USE_MASK_PERMUTATIONS 1 // with this type of permutation gcc outputs slightly faster code | |||
| #if !defined(USE_MASK_PERMUTATIONS) | |||
| /* Runs one vmrgew instruction on a and b (vector-register operands) and returns | |||
| * the register it writes. Compiled only when USE_MASK_PERMUTATIONS is not defined. */ | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ | |||
| __vector float merged; | |||
| __asm__ ( "vmrgew %0,%1,%2;\n" : "=v" (merged) : "v" (a), "v" (b) ); | |||
| return merged; | |||
| } | |||
| /* Runs one vmrgow instruction on a and b (vector-register operands) and returns | |||
| * the register it writes. Compiled only when USE_MASK_PERMUTATIONS is not defined. */ | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ | |||
| __vector float merged; | |||
| __asm__ ( "vmrgow %0,%1,%2;\n" : "=v" (merged) : "v" (a), "v" (b) ); | |||
| return merged; | |||
| } | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Warning: requires n > 0 and n % 32 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value (output only) | |||
| * @return index | |||
| */ | |||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| #else | |||
| register __vector unsigned int static_index0 = {2,0,3,1}; | |||
| #endif | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0; | |||
| register __vector unsigned int static_index2=static_index0 +temp1; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
| #endif | |||
| for(; i<n; i+=32 ){ | |||
| //absolute temporary complex vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| register __vector float t1=mvec_mergee(v0,v1); | |||
| register __vector float ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2= mvec_mergee(v2,v3); | |||
| register __vector float ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements . lets compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute temporary complex vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| t1=mvec_mergee(v0,v1); | |||
| ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=mvec_mergee(v2,v3); | |||
| ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get asbolute index | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0; | |||
| BLASLONG max = 0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = ciamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
/* Scalar absolute value matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * |re|+|im| of the complex element whose real part is x[i] and whose
 * imaginary part is x[i+1]. The expansion and both arguments are
 * parenthesized so the macro can be used inside larger expressions.
 */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))
#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code
| #if !defined(USE_MASK_PERMUTATIONS) | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ | |||
| __vector float result; | |||
| __asm__ ( | |||
| "vmrgew %0,%1,%2;\n" | |||
| : "=v" (result) | |||
| : "v" (a), | |||
| "v" (b) | |||
| : ); | |||
| return result; | |||
| } | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ | |||
| __vector float result; | |||
| __asm__ ( | |||
| "vmrgow %0,%1,%2;\n" | |||
| : "=v" (result) | |||
| : "v" (a), | |||
| "v" (b) | |||
| : ); | |||
| return result; | |||
| } | |||
| #endif | |||
| /** | |||
| * Find the index of the complex element with the largest |re|+|im|. | |||
| * Every loop iteration consumes 16 vectors (64 floats, i.e. 32 complex elements) | |||
| * and keeps one running maximum plus its index per vector lane; the four lanes | |||
| * are reduced with scalar code at the end, equal values resolving to the smaller index. | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n number of complex elements to scan | |||
| * @param x pointer to the vector (real and imaginary parts interleaved) | |||
| * NOTE(review): x is read through a __vector float pointer - confirm alignment expectations | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return zero-based index of the maximum (the caller in this file adds 1) | |||
| */ | |||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| #else | |||
| register __vector unsigned int static_index0 = {2,0,3,1}; /* presumably the lane order produced by the vmrgew/vmrgow path - TODO confirm */ | |||
| #endif | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0; /* static_index0 + 4 */ | |||
| register __vector unsigned int static_index2=static_index0 +temp1; /* static_index0 + 8 */ | |||
| register __vector unsigned int static_index3=static_index1 +temp1; /* static_index0 + 12 */ | |||
| temp0=vec_xor(temp0,temp0); /* zero temp0; from here on it is the base index of the current 32-element block */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; /* running per-lane maxima */ | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; /* bytes of words 0 and 2 of each input vector */ | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; /* bytes of words 1 and 3 of each input vector */ | |||
| #endif | |||
| for(; i<n; i+=32 ){ | |||
| //absolute values of 8 vectors (16 complex elements) | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack real parts and imaginary parts into separate vectors, then add them to get |re|+|im| | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum of four real parts with the four matching imaginary parts | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| register __vector float t1=mvec_mergee(v0,v1); | |||
| register __vector float ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum of four real parts with the four matching imaginary parts | |||
| register __vector float t2= mvec_mergee(v2,v3); | |||
| register __vector float ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // v0..v3 now hold 16 summed elements; compare them pairwise | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); /* strict >: on a tie the v0 value and its index are kept */ | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute values of the next 8 vectors (second 16 complex elements) | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack real parts and imaginary parts into separate vectors, then add them to get |re|+|im| | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum of four real parts with the four matching imaginary parts | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| t1=mvec_mergee(v0,v1); | |||
| ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum of four real parts with the four matching imaginary parts | |||
| t2=mvec_mergee(v2,v3); | |||
| ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get absolute index: add the base index of this 32-element block | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; /* base index of the next 32-element block */ | |||
| } | |||
| //now we have to choose among 4 values and 4 different indices | |||
| // we compare pairwise: if a pair is exactly equal we take the smaller of the two indices, | |||
| // otherwise we take the index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ /* winner of lanes 2 and 3: value ends up in a3, index in i1 (reused) */ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ /* final round between the two pair winners */ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0; | |||
| BLASLONG max = 0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = ciamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| @@ -1,266 +1,266 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
/* Scalar absolute value matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * |re|+|im| of the complex element whose real part is x[i] and whose
 * imaginary part is x[i+1]. The expansion and both arguments are
 * parenthesized so the macro can be used inside larger expressions.
 */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))
| /** | |||
| * Find the index of the complex element with the smallest |re|+|im|. | |||
| * Every loop iteration consumes 16 vectors (64 floats, i.e. 32 complex elements) | |||
| * and keeps one running minimum plus its index per vector lane; the four lanes | |||
| * are reduced with scalar code at the end, equal values resolving to the smaller index. | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n number of complex elements to scan | |||
| * @param x pointer to the vector (real and imaginary parts interleaved) | |||
| * NOTE(review): x is read through a __vector float pointer - confirm alignment expectations | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return zero-based index of the minimum (the caller in this file adds 1) | |||
| */ | |||
| static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); /* zero temp0; from here on it is the base index of the current 32-element block */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| float first_min=CABS1(x,0); /* every lane starts at element 0's value with index 0 */ | |||
| register __vector float quadruple_values={first_min,first_min,first_min,first_min}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; /* bytes of words 0 and 2 of each input vector */ | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; /* bytes of words 1 and 3 of each input vector */ | |||
| for(; i<n; i+=32){ | |||
| //absolute values of 8 vectors (16 complex elements) | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack real parts and imaginary parts into separate vectors, then add them to get |re|+|im| | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum of four real parts with the four matching imaginary parts | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // v0..v3 now hold 16 summed elements; compare them pairwise | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); /* strict >: on a tie the v0 value and its index are kept */ | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute values of the next 8 vectors (second 16 complex elements) | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack real parts and imaginary parts into separate vectors, then add them to get |re|+|im| | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum of four real parts with the four matching imaginary parts | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get absolute index: add the base index of this 32-element block | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; /* base index of the next 32-element block */ | |||
| } | |||
| //now we have to choose among 4 values and 4 different indices | |||
| // we compare pairwise: if a pair is exactly equal we take the smaller of the two indices, | |||
| // otherwise we take the index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ /* winner of lanes 2 and 3: value ends up in a3, index in i1 (reused) */ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ /* final round between the two pair winners */ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf; | |||
| BLASLONG min=0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(min); | |||
| if (inc_x == 1) { | |||
| minf = CABS1(x,0); //index will not be incremented | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| min = ciamin_kernel_32(n1, x, &minf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
/* Scalar absolute value matching the precision selected by DOUBLE. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * |re|+|im| of the complex element whose real part is x[i] and whose
 * imaginary part is x[i+1]. The expansion and both arguments are
 * parenthesized so the macro can be used inside larger expressions.
 */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))
| /** | |||
| * Find minimum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| float first_min=CABS1(x,0); | |||
| register __vector float quadruple_values={first_min,first_min,first_min,first_min}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
| for(; i<n; i+=32){ | |||
| //absolute temporary complex vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements . lets compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute temporary complex vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get absolute index | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; | |||
| } | |||
| //now we have to choose among 4 values and their 4 indices | |||
| // we compare pairwise: if the two values are exactly equal we choose the smaller index, | |||
| // otherwise we take the index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| /* | |||
| * Returns the 1-based position of the element of x whose CABS1 value is | |||
| * smallest, or 0 when n or inc_x is not positive. | |||
| * Every element occupies two consecutive FLOATs, so offsets advance by 2 | |||
| * (unit stride) or by 2 * inc_x. | |||
| * Unit stride: the leading multiple of 32 elements is handed to | |||
| * ciamin_kernel_32 and the scalar loop below finishes the remainder. | |||
| * Other strides: scalar scan that starts at the second element, because | |||
| * the first one already seeds minf. | |||
| */ | |||
| BLASLONG best = 0; /* zero-based index of the current minimum */ | |||
| BLASLONG pos; /* element number examined next */ | |||
| BLASLONG off; /* offset of element pos inside x */ | |||
| BLASLONG step; /* distance in FLOATs between two elements */ | |||
| FLOAT minf; | |||
| if (n <= 0 || inc_x <= 0) { | |||
| return best; | |||
| } | |||
| minf = CABS1(x,0); /* seed with the first element */ | |||
| if (inc_x == 1) { | |||
| BLASLONG blocked = n & -32; | |||
| pos = 0; | |||
| off = 0; | |||
| step = 2; | |||
| if (blocked > 0) { | |||
| best = ciamin_kernel_32(blocked, x, &minf); | |||
| pos = blocked; | |||
| off = blocked << 1; | |||
| } | |||
| } else { | |||
| step = 2 * inc_x; | |||
| pos = 1; | |||
| off = step; | |||
| } | |||
| /* strict comparison: on equal values the earlier position is kept */ | |||
| for (; pos < n; pos++, off += step) { | |||
| if (CABS1(x,off) < minf) { | |||
| best = pos; | |||
| minf = CABS1(x,off); | |||
| } | |||
| } | |||
| return best + 1; | |||
| } | |||
| @@ -1,288 +1,288 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find the index of the element with the largest absolute value, | |||
| * processing 64 floats per loop iteration. | |||
| * | |||
| * Every iteration loads two halves of 8 vectors (4 floats each), takes their | |||
| * absolute values and reduces each half lane by lane with vec_cmpgt/vec_sel, | |||
| * carrying the matching index along with every value. The per-lane winner | |||
| * of the iteration is then merged into quadruple_values/quadruple_indices. | |||
| * After the loop the 4 lanes are reduced with scalar code; when two values | |||
| * are exactly equal the smaller index is chosen. | |||
| * | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * NOTE(review): x is read through a __vector float pointer; the alignment | |||
| * required of x is not visible here -- confirm against the callers. | |||
| * NOTE(review): indices are kept in unsigned int lanes, so they presumably | |||
| * wrap for very large n -- confirm. | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return zero-based index of the selected element | |||
| */ | |||
| static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //{4,4,4,4}; zeroed below and then reused as the running base index | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| //absolute values of the first 8 vectors (32 floats) of this iteration | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //compare neighbouring vectors lane by lane; strict > keeps the earlier vector on ties | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector bool int r3=vec_cmpgt(v5,v4); | |||
| register __vector bool int r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of every pair together with its local index (0..15) | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the selected candidates | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above: four candidate vectors are reduced to two | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //indices taken from v4..v7 must lie within [16,31], so add 16 to ind2 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0 has now advanced by 32 (2 x 16) | |||
| //second half: the next 32 values | |||
| // absolute values of the next 8 vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //compare neighbouring vectors lane by lane, as for the first half | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| r3=vec_cmpgt(v5,v4); | |||
| r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of every pair together with its local index (0..15) | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the selected candidates | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above: four candidate vectors are reduced to two | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //indices taken from v4..v7 must lie within [16,31], so add 16 to ind2 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0 advanced by another 32: base index of the next iteration | |||
| } | |||
| //now we have to choose among 4 values and their 4 indices | |||
| // we compare pairwise: if the two values are exactly equal we choose the smaller index, | |||
| // otherwise we take the index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| /* | |||
| * Returns the 1-based position of the element of x with the largest | |||
| * absolute value, or 0 when n or inc_x is not positive. | |||
| * The scalar loops use a strict comparison, so on equal values the | |||
| * earlier position is kept. | |||
| */ | |||
| FLOAT maxf = 0.0; | |||
| BLASLONG best = 0; /* zero-based index of the current maximum */ | |||
| BLASLONG pos = 0; /* element number examined next */ | |||
| BLASLONG off = 0; /* offset of element pos inside x (strided path) */ | |||
| if (n <= 0 || inc_x <= 0) { | |||
| return best; | |||
| } | |||
| if (inc_x == 1) { | |||
| /* contiguous data: vector kernel on the leading multiple of 64 */ | |||
| BLASLONG blocked = n & -64; | |||
| if (blocked > 0) { | |||
| best = siamax_kernel_64(blocked, x, &maxf); | |||
| pos = blocked; | |||
| } | |||
| /* scalar clean-up for the remaining elements */ | |||
| for (; pos < n; pos++) { | |||
| if (ABS(x[pos]) > maxf) { | |||
| best = pos; | |||
| maxf = ABS(x[pos]); | |||
| } | |||
| } | |||
| } else { | |||
| /* strided data: groups of four elements first, then the leftovers */ | |||
| BLASLONG quads = n & -4; | |||
| BLASLONG lane; | |||
| for (; pos < quads; pos += 4, off += inc_x * 4) { | |||
| for (lane = 0; lane < 4; lane++) { | |||
| if (ABS(x[off + lane * inc_x]) > maxf) { | |||
| best = pos + lane; | |||
| maxf = ABS(x[off + lane * inc_x]); | |||
| } | |||
| } | |||
| } | |||
| for (; pos < n; pos++, off += inc_x) { | |||
| if (ABS(x[off]) > maxf) { | |||
| best = pos; | |||
| maxf = ABS(x[off]); | |||
| } | |||
| } | |||
| } | |||
| return best + 1; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find the index of the element with the largest absolute value, | |||
| * processing 64 floats per loop iteration. | |||
| * | |||
| * Every iteration loads two halves of 8 vectors (4 floats each), takes their | |||
| * absolute values and reduces each half lane by lane with vec_cmpgt/vec_sel, | |||
| * carrying the matching index along with every value. The per-lane winner | |||
| * of the iteration is then merged into quadruple_values/quadruple_indices. | |||
| * After the loop the 4 lanes are reduced with scalar code; when two values | |||
| * are exactly equal the smaller index is chosen. | |||
| * | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * NOTE(review): x is read through a __vector float pointer; the alignment | |||
| * required of x is not visible here -- confirm against the callers. | |||
| * NOTE(review): indices are kept in unsigned int lanes, so they presumably | |||
| * wrap for very large n -- confirm. | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return zero-based index of the selected element | |||
| */ | |||
| static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //{4,4,4,4}; zeroed below and then reused as the running base index | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| //absolute values of the first 8 vectors (32 floats) of this iteration | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //compare neighbouring vectors lane by lane; strict > keeps the earlier vector on ties | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector bool int r3=vec_cmpgt(v5,v4); | |||
| register __vector bool int r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of every pair together with its local index (0..15) | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the selected candidates | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above: four candidate vectors are reduced to two | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //indices taken from v4..v7 must lie within [16,31], so add 16 to ind2 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0 has now advanced by 32 (2 x 16) | |||
| //second half: the next 32 values | |||
| // absolute values of the next 8 vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //compare neighbouring vectors lane by lane, as for the first half | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| r3=vec_cmpgt(v5,v4); | |||
| r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of every pair together with its local index (0..15) | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the selected candidates | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above: four candidate vectors are reduced to two | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //indices taken from v4..v7 must lie within [16,31], so add 16 to ind2 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0 advanced by another 32: base index of the next iteration | |||
| } | |||
| //now we have to choose among 4 values and their 4 indices | |||
| // we compare pairwise: if the two values are exactly equal we choose the smaller index, | |||
| // otherwise we take the index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| /* | |||
| * Returns the 1-based position of the element of x with the largest | |||
| * absolute value, or 0 when n or inc_x is not positive. | |||
| * The scalar loops use a strict comparison, so on equal values the | |||
| * earlier position is kept. | |||
| */ | |||
| FLOAT maxf = 0.0; | |||
| BLASLONG best = 0; /* zero-based index of the current maximum */ | |||
| BLASLONG pos = 0; /* element number examined next */ | |||
| BLASLONG off = 0; /* offset of element pos inside x (strided path) */ | |||
| if (n <= 0 || inc_x <= 0) { | |||
| return best; | |||
| } | |||
| if (inc_x == 1) { | |||
| /* contiguous data: vector kernel on the leading multiple of 64 */ | |||
| BLASLONG blocked = n & -64; | |||
| if (blocked > 0) { | |||
| best = siamax_kernel_64(blocked, x, &maxf); | |||
| pos = blocked; | |||
| } | |||
| /* scalar clean-up for the remaining elements */ | |||
| for (; pos < n; pos++) { | |||
| if (ABS(x[pos]) > maxf) { | |||
| best = pos; | |||
| maxf = ABS(x[pos]); | |||
| } | |||
| } | |||
| } else { | |||
| /* strided data: groups of four elements first, then the leftovers */ | |||
| BLASLONG quads = n & -4; | |||
| BLASLONG lane; | |||
| for (; pos < quads; pos += 4, off += inc_x * 4) { | |||
| for (lane = 0; lane < 4; lane++) { | |||
| if (ABS(x[off + lane * inc_x]) > maxf) { | |||
| best = pos + lane; | |||
| maxf = ABS(x[off + lane * inc_x]); | |||
| } | |||
| } | |||
| } | |||
| for (; pos < n; pos++, off += inc_x) { | |||
| if (ABS(x[off]) > maxf) { | |||
| best = pos; | |||
| maxf = ABS(x[off]); | |||
| } | |||
| } | |||
| } | |||
| return best + 1; | |||
| } | |||
| @@ -1,288 +1,288 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find the index of the element with the smallest absolute value, | |||
| * processing 64 floats per loop iteration. | |||
| * | |||
| * The running per-lane minimum starts from the absolute values of the first | |||
| * vector (indices 0..3). Every iteration loads two halves of 8 vectors | |||
| * (4 floats each), takes their absolute values and reduces each half lane | |||
| * by lane with vec_cmpgt/vec_sel, carrying the matching index along with | |||
| * every value; the per-lane winner is then merged into | |||
| * quadruple_values/quadruple_indices. After the loop the 4 lanes are | |||
| * reduced with scalar code; when two values are exactly equal the smaller | |||
| * index is chosen. | |||
| * | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * NOTE(review): x is read through a __vector float pointer; the alignment | |||
| * required of x is not visible here -- confirm against the callers. | |||
| * NOTE(review): indices are kept in unsigned int lanes, so they presumably | |||
| * wrap for very large n -- confirm. | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return zero-based index of the selected element | |||
| */ | |||
| static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //{4,4,4,4}; zeroed below and then reused as the running base index | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector float quadruple_values=vec_abs(v_ptrx[0]); | |||
| for(; i<n; i+=64){ | |||
| //absolute values of the first 8 vectors (32 floats) of this iteration | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //compare neighbouring vectors lane by lane; strict > keeps the earlier vector on ties | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector bool int r3=vec_cmpgt(v4,v5); | |||
| register __vector bool int r4=vec_cmpgt(v6,v7); | |||
| //select the smaller value of every pair together with its local index (0..15) | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the selected candidates | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above: four candidate vectors are reduced to two | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //indices taken from v4..v7 must lie within [16,31], so add 16 to ind2 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0 has now advanced by 32 (2 x 16) | |||
| //second half: the next 32 values | |||
| // absolute values of the next 8 vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //compare neighbouring vectors lane by lane, as for the first half | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| r3=vec_cmpgt(v4,v5); | |||
| r4=vec_cmpgt(v6,v7); | |||
| //select the smaller value of every pair together with its local index (0..15) | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the selected candidates | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above: four candidate vectors are reduced to two | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //indices taken from v4..v7 must lie within [16,31], so add 16 to ind2 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt( quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0 advanced by another 32: base index of the next iteration | |||
| } | |||
| //now we have to choose among 4 values and their 4 indices | |||
| // we compare pairwise: if the two values are exactly equal we choose the smaller index, | |||
| // otherwise we take the index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| /* | |||
| * Returns the 1-based position of the element of x with the smallest | |||
| * absolute value, or 0 when n or inc_x is not positive. | |||
| * The scalar loops use a strict comparison, so on equal values the | |||
| * earlier position is kept. | |||
| */ | |||
| FLOAT minf; | |||
| BLASLONG best = 0; /* zero-based index of the current minimum */ | |||
| BLASLONG pos = 0; /* element number examined next */ | |||
| BLASLONG off = 0; /* offset of element pos inside x (strided path) */ | |||
| if (n <= 0 || inc_x <= 0) { | |||
| return best; | |||
| } | |||
| minf = ABS(x[0]); /* seed with the first element; best stays 0 */ | |||
| if (inc_x == 1) { | |||
| /* contiguous data: vector kernel on the leading multiple of 64 */ | |||
| BLASLONG blocked = n & -64; | |||
| if (blocked > 0) { | |||
| best = siamin_kernel_64(blocked, x, &minf); | |||
| pos = blocked; | |||
| } | |||
| /* scalar clean-up for the remaining elements */ | |||
| for (; pos < n; pos++) { | |||
| if (ABS(x[pos]) < minf) { | |||
| best = pos; | |||
| minf = ABS(x[pos]); | |||
| } | |||
| } | |||
| } else { | |||
| /* strided data: groups of four elements first, then the leftovers */ | |||
| BLASLONG quads = n & -4; | |||
| BLASLONG lane; | |||
| for (; pos < quads; pos += 4, off += inc_x * 4) { | |||
| for (lane = 0; lane < 4; lane++) { | |||
| if (ABS(x[off + lane * inc_x]) < minf) { | |||
| best = pos + lane; | |||
| minf = ABS(x[off + lane * inc_x]); | |||
| } | |||
| } | |||
| } | |||
| for (; pos < n; pos++, off += inc_x) { | |||
| if (ABS(x[off]) < minf) { | |||
| best = pos; | |||
| minf = ABS(x[off]); | |||
| } | |||
| } | |||
| } | |||
| return best + 1; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
/*
 * Reduce 32 consecutive floats (eight vectors starting at src) to one vector
 * holding, per lane, the smallest absolute value seen in that lane.
 * *pos receives the matching element positions: the position inside the
 * 32-element window plus the per-lane offset passed in base.
 * Comparisons are strict, so on equal values the earlier vector is kept.
 */
static inline __vector float siamin_min_of_32(__vector float *src,
                                              __vector unsigned int base,
                                              __vector unsigned int *pos) {
    /* Positions of the four vectors that make up a 16-element group. */
    __vector unsigned int lane_pos0 = {0, 1, 2, 3};
    __vector unsigned int lane_pos1 = {4, 5, 6, 7};
    __vector unsigned int lane_pos2 = {8, 9, 10, 11};
    __vector unsigned int lane_pos3 = {12, 13, 14, 15};
    __vector unsigned int step16 = {16, 16, 16, 16};

    __vector float a0 = vec_abs(src[0]);
    __vector float a1 = vec_abs(src[1]);
    __vector float a2 = vec_abs(src[2]);
    __vector float a3 = vec_abs(src[3]);
    __vector float a4 = vec_abs(src[4]);
    __vector float a5 = vec_abs(src[5]);
    __vector float a6 = vec_abs(src[6]);
    __vector float a7 = vec_abs(src[7]);

    /* Round 1: neighbouring vectors. */
    __vector bool int gt01 = vec_cmpgt(a0, a1);
    __vector bool int gt23 = vec_cmpgt(a2, a3);
    __vector bool int gt45 = vec_cmpgt(a4, a5);
    __vector bool int gt67 = vec_cmpgt(a6, a7);

    __vector unsigned int p01 = vec_sel(lane_pos0, lane_pos1, gt01);
    __vector float m01 = vec_sel(a0, a1, gt01);
    __vector unsigned int p23 = vec_sel(lane_pos2, lane_pos3, gt23);
    __vector float m23 = vec_sel(a2, a3, gt23);
    __vector unsigned int p45 = vec_sel(lane_pos0, lane_pos1, gt45);
    __vector float m45 = vec_sel(a4, a5, gt45);
    __vector unsigned int p67 = vec_sel(lane_pos2, lane_pos3, gt67);
    __vector float m67 = vec_sel(a6, a7, gt67);

    /* Round 2: winners inside each 16-element group. */
    __vector bool int gt_low = vec_cmpgt(m01, m23);
    __vector bool int gt_high = vec_cmpgt(m45, m67);

    __vector unsigned int p_low = vec_sel(p01, p23, gt_low);
    __vector float m_low = vec_sel(m01, m23, gt_low);
    __vector unsigned int p_high = vec_sel(p45, p67, gt_high);
    __vector float m_high = vec_sel(m45, m67, gt_high);

    /* The second group covers elements 16..31 of the window. */
    p_high += step16;

    /* Round 3: one winner per lane for the whole window. */
    __vector bool int gt_final = vec_cmpgt(m_low, m_high);
    __vector unsigned int p_final = vec_sel(p_low, p_high, gt_final);

    *pos = p_final + base;
    return vec_sel(m_low, m_high, gt_final);
}

/**
 * Locate the smallest absolute value among the first n entries of x.
 * Warning: the caller must guarantee n > 0 and n % 64 == 0 (not checked).
 * @param n     number of elements to scan
 * @param x     pointer to the vector (read with unit stride)
 * @param minf  (out) smallest absolute value found; written only, never read
 * @return      zero-based position of that value
 */
static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) {
    __vector unsigned int step32 = {32, 32, 32, 32};
    __vector unsigned int base = {0, 0, 0, 0};      /* offset of the current 64-element chunk */
    __vector float *src = (__vector float *)x;

    /* Running per-lane minimum, seeded with elements 0..3. */
    __vector unsigned int best_pos = {0, 1, 2, 3};
    __vector float best_val = vec_abs(src[0]);
    BLASLONG done;

    for (done = 0; done < n; done += 64) {
        __vector unsigned int pos_a;
        __vector unsigned int pos_b;

        /* Two 32-element windows per iteration. */
        __vector float val_a = siamin_min_of_32(src, base, &pos_a);
        base += step32;
        __vector float val_b = siamin_min_of_32(src + 8, base, &pos_b);
        base += step32;
        src += 16;

        /* Best of the 64 elements just read. */
        __vector bool int a_gt_b = vec_cmpgt(val_a, val_b);
        __vector unsigned int pos_ab = vec_sel(pos_a, pos_b, a_gt_b);
        __vector float val_ab = vec_sel(val_a, val_b, a_gt_b);

        /* Replace the running minimum only where the new value is strictly smaller. */
        __vector bool int improved = vec_cmpgt(best_val, val_ab);
        best_pos = vec_sel(best_pos, pos_ab, improved);
        best_val = vec_sel(best_val, val_ab, improved);
    }

    /*
     * Four candidates remain, one per lane. Merge them pairwise: equal values
     * keep the lower position, otherwise the smaller value wins.
     */
    float low_val = vec_extract(best_val, 0);
    float low_alt = vec_extract(best_val, 1);
    float high_val = vec_extract(best_val, 2);
    float high_alt = vec_extract(best_val, 3);
    unsigned int low_pos = vec_extract(best_pos, 0);
    unsigned int low_alt_pos = vec_extract(best_pos, 1);
    unsigned int high_pos = vec_extract(best_pos, 2);
    unsigned int high_alt_pos = vec_extract(best_pos, 3);

    /* Lanes 0 and 1. */
    if (low_val == low_alt) {
        if (low_alt_pos < low_pos) {
            low_pos = low_alt_pos;
        }
    } else if (low_alt < low_val) {
        low_pos = low_alt_pos;
        low_val = low_alt;
    }

    /* Lanes 2 and 3. */
    if (high_alt == high_val) {
        if (high_alt_pos < high_pos) {
            high_pos = high_alt_pos;
        }
    } else if (high_alt < high_val) {
        high_pos = high_alt_pos;
        high_val = high_alt;
    }

    /* Winners of the two pairs. */
    if (low_val == high_val) {
        if (high_pos < low_pos) {
            low_pos = high_pos;
        }
    } else if (high_val < low_val) {
        low_pos = high_pos;
        low_val = high_val;
    }

    *minf = low_val;
    return (BLASLONG)low_pos;
}
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG min = 0; | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return (min); | |||
| minf = ABS(x[0]); //index's not incremented | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -64; | |||
| if (n1 > 0) { | |||
| min = siamin_kernel_64(n1, x, &minf); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = i; | |||
| minf = ABS(x[i]); | |||
| } | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = j; | |||
| minf = ABS(x[i]); | |||
| } | |||
| if (ABS(x[i + inc_x]) < minf) { | |||
| min = j + 1; | |||
| minf = ABS(x[i + inc_x]); | |||
| } | |||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||
| min = j + 2; | |||
| minf = ABS(x[i + 2 * inc_x]); | |||
| } | |||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||
| min = j + 3; | |||
| minf = ABS(x[i + 3 * inc_x]); | |||
| } | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| while (j < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = j; | |||
| minf = ABS(x[i]); | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| @@ -1,272 +1,272 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/* ---- Stack frame ------------------------------------------------------ */
#define LOAD            ld
#define STACKSIZE       (512)
/* Slot 16 bytes above the area this kernel allocates; the prologue stores LR here. */
#define FLINK_SAVE      (STACKSIZE+16)

/* ---- Incoming arguments ----------------------------------------------- */
#define M               r3
#define N               r4
#define K               r5
#define OFFSET          r6
#define A               r7
#define B               r8
#define C               r9
#define LDC             r10

/* ---- Vector registers with a fixed role ------------------------------- */
#define alpha_r         vs20
#define save_permute_1  vs21
#define save_permute_2  vs22
#define permute_mask    vs23

#define o0              0

/* ---- Loop counters and matrix cursors --------------------------------- */
#define L               r18
#define I               r22
#define J               r23
#define AO              r24
#define BO              r25
#define CO              r26

/* ---- Scratch general-purpose registers -------------------------------- */
#define T1              r11
#define T2              r12
#define T3              r14
#define T4              r15
#define T5              r16
#define T6              r17
#define T7              r19
#define T8              r20
#define TEMP_REG        r21
#define T9              r27
#define T10             r28
#define T11             r29
#define T12             r30
#define T13             r31

/* Must follow the definitions above: the macro file is preprocessed with them in effect. */
#include "sgemm_macros_power9.S"

/* ---- 64-bit halves of the permute-control vectors built in the prologue ---- */
.equ    perm_const1,     0x0405060700010203
.equ    perm_const2,     0x0c0d0e0f08090a0b
.equ    save_permute_11, 0x1415161718191a1b
.equ    save_permute_12, 0x0405060708090a0b
.equ    save_permute_21, 0x101112131c1d1e1f
.equ    save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM

    PROLOGUE
    PROFCODE

    /* Carve out the save area and stash the return address. */
    addi    SP, SP, -STACKSIZE
    mflr    r0
    std     r0, FLINK_SAVE(SP)

    /* Save f14-f31 (restored at .L999). */
    stfd    f14,   0(SP)
    stfd    f15,   8(SP)
    stfd    f16,  16(SP)
    stfd    f17,  24(SP)
    stfd    f18,  32(SP)
    stfd    f19,  40(SP)
    stfd    f20,  48(SP)
    stfd    f21,  56(SP)
    stfd    f22,  64(SP)
    stfd    f23,  72(SP)
    stfd    f24,  80(SP)
    stfd    f25,  88(SP)
    stfd    f26,  96(SP)
    stfd    f27, 104(SP)
    stfd    f28, 112(SP)
    stfd    f29, 120(SP)
    stfd    f30, 128(SP)
    stfd    f31, 136(SP)

    /* Save r14-r31. */
    std     r14, 280(SP)
    std     r15, 272(SP)
    std     r16, 264(SP)
    std     r17, 256(SP)
    std     r18, 248(SP)
    std     r19, 240(SP)
    std     r20, 232(SP)
    std     r21, 224(SP)
    std     r22, 216(SP)
    std     r23, 208(SP)
    std     r24, 200(SP)
    std     r25, 192(SP)
    std     r26, 184(SP)
    std     r27, 176(SP)
    std     r28, 168(SP)
    std     r29, 160(SP)
    std     r30, 152(SP)
    std     r31, 144(SP)

    /* Save vs52-vs63. */
    stxv    vs52, 288(SP)
    stxv    vs53, 304(SP)
    stxv    vs54, 320(SP)
    stxv    vs55, 336(SP)
    stxv    vs56, 352(SP)
    stxv    vs57, 368(SP)
    stxv    vs58, 384(SP)
    stxv    vs59, 400(SP)
    stxv    vs60, 416(SP)
    stxv    vs61, 432(SP)
    stxv    vs62, 448(SP)
    stxv    vs63, 464(SP)

#ifdef TRMMKERNEL
    /* OFFSET is read from the caller's frame, just above our save area. */
    ld      OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

    /* LDC *= 4 (presumably element count -> byte stride for 4-byte floats). */
    slwi    LDC, LDC, 2

    /* alpha arrives in f1: convert it to single precision and splat it. */
    xscvdpspn alpha_r, vs1
    xxspltw alpha_r, alpha_r, 0

    /*
     * Build the six 64-bit permute constants, 16 bits at a time:
     * highest/higher fill the low word, the rotate moves them to the upper
     * word, then h/l fill the low word again.
     * permute_mask is the reverse permute mask for big endian:
     *   uint128 = 0xc0d0e0f08090a0b0405060700010203
     */
    lis     T1, perm_const1@highest
    ori     T1, T1, perm_const1@higher
    rldicr  T1, T1, 32, 31
    oris    T1, T1, perm_const1@h
    ori     T1, T1, perm_const1@l

    lis     T2, perm_const2@highest
    ori     T2, T2, perm_const2@higher
    rldicr  T2, T2, 32, 31
    oris    T2, T2, perm_const2@h
    ori     T2, T2, perm_const2@l

    lis     T3, save_permute_12@highest
    ori     T3, T3, save_permute_12@higher
    rldicr  T3, T3, 32, 31
    oris    T3, T3, save_permute_12@h
    ori     T3, T3, save_permute_12@l

    lis     T4, save_permute_11@highest
    ori     T4, T4, save_permute_11@higher
    rldicr  T4, T4, 32, 31
    oris    T4, T4, save_permute_11@h
    ori     T4, T4, save_permute_11@l

    lis     T5, save_permute_22@highest
    ori     T5, T5, save_permute_22@higher
    rldicr  T5, T5, 32, 31
    oris    T5, T5, save_permute_22@h
    ori     T5, T5, save_permute_22@l

    lis     T6, save_permute_21@highest
    ori     T6, T6, save_permute_21@higher
    rldicr  T6, T6, 32, 31
    oris    T6, T6, save_permute_21@h
    ori     T6, T6, save_permute_21@l

    li      r0, 0

    /* Pair the halves into 128-bit vectors (first GPR = upper doubleword). */
    mtvsrdd permute_mask, T2, T1
    mtvsrdd save_permute_1, T3, T4
    mtvsrdd save_permute_2, T5, T6

#include "sgemm_logic_power9.S"

.L999:
    /* Common exit: restore everything the prologue saved. */
    ld      r0, FLINK_SAVE(SP)
    mtlr    r0

    lfd     f14,   0(SP)
    lfd     f15,   8(SP)
    lfd     f16,  16(SP)
    lfd     f17,  24(SP)
    lfd     f18,  32(SP)
    lfd     f19,  40(SP)
    lfd     f20,  48(SP)
    lfd     f21,  56(SP)
    lfd     f22,  64(SP)
    lfd     f23,  72(SP)
    lfd     f24,  80(SP)
    lfd     f25,  88(SP)
    lfd     f26,  96(SP)
    lfd     f27, 104(SP)
    lfd     f28, 112(SP)
    lfd     f29, 120(SP)
    lfd     f30, 128(SP)
    lfd     f31, 136(SP)

    ld      r14, 280(SP)
    ld      r15, 272(SP)
    ld      r16, 264(SP)
    ld      r17, 256(SP)
    ld      r18, 248(SP)
    ld      r19, 240(SP)
    ld      r20, 232(SP)
    ld      r21, 224(SP)
    ld      r22, 216(SP)
    ld      r23, 208(SP)
    ld      r24, 200(SP)
    ld      r25, 192(SP)
    ld      r26, 184(SP)
    ld      r27, 176(SP)
    ld      r28, 168(SP)
    ld      r29, 160(SP)
    ld      r30, 152(SP)
    ld      r31, 144(SP)

    lxv     vs52, 288(SP)
    lxv     vs53, 304(SP)
    lxv     vs54, 320(SP)
    lxv     vs55, 336(SP)
    lxv     vs56, 352(SP)
    lxv     vs57, 368(SP)
    lxv     vs58, 384(SP)
    lxv     vs59, 400(SP)
    lxv     vs60, 416(SP)
    lxv     vs61, 432(SP)
    lxv     vs62, 448(SP)
    lxv     vs63, 464(SP)

    /* Release the save area and return. */
    addi    SP, SP, STACKSIZE
    blr

    EPILOGUE
#endif
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/* ---- Stack frame ------------------------------------------------------ */
#define LOAD            ld
#define STACKSIZE       (512)
/* Slot 16 bytes above the area this kernel allocates; the prologue stores LR here. */
#define FLINK_SAVE      (STACKSIZE+16)

/* ---- Incoming arguments ----------------------------------------------- */
#define M               r3
#define N               r4
#define K               r5
#define OFFSET          r6
#define A               r7
#define B               r8
#define C               r9
#define LDC             r10

/* ---- Vector registers with a fixed role ------------------------------- */
#define alpha_r         vs20
#define save_permute_1  vs21
#define save_permute_2  vs22
#define permute_mask    vs23

#define o0              0

/* ---- Loop counters and matrix cursors --------------------------------- */
#define L               r18
#define I               r22
#define J               r23
#define AO              r24
#define BO              r25
#define CO              r26

/* ---- Scratch general-purpose registers -------------------------------- */
#define T1              r11
#define T2              r12
#define T3              r14
#define T4              r15
#define T5              r16
#define T6              r17
#define T7              r19
#define T8              r20
#define TEMP_REG        r21
#define T9              r27
#define T10             r28
#define T11             r29
#define T12             r30
#define T13             r31

/* Must follow the definitions above: the macro file is preprocessed with them in effect. */
#include "sgemm_macros_power9.S"

/* ---- 64-bit halves of the permute-control vectors built in the prologue ---- */
.equ    perm_const1,     0x0405060700010203
.equ    perm_const2,     0x0c0d0e0f08090a0b
.equ    save_permute_11, 0x1415161718191a1b
.equ    save_permute_12, 0x0405060708090a0b
.equ    save_permute_21, 0x101112131c1d1e1f
.equ    save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM

    PROLOGUE
    PROFCODE

    /* Carve out the save area and stash the return address. */
    addi    SP, SP, -STACKSIZE
    mflr    r0
    std     r0, FLINK_SAVE(SP)

    /* Save f14-f31 (restored at .L999). */
    stfd    f14,   0(SP)
    stfd    f15,   8(SP)
    stfd    f16,  16(SP)
    stfd    f17,  24(SP)
    stfd    f18,  32(SP)
    stfd    f19,  40(SP)
    stfd    f20,  48(SP)
    stfd    f21,  56(SP)
    stfd    f22,  64(SP)
    stfd    f23,  72(SP)
    stfd    f24,  80(SP)
    stfd    f25,  88(SP)
    stfd    f26,  96(SP)
    stfd    f27, 104(SP)
    stfd    f28, 112(SP)
    stfd    f29, 120(SP)
    stfd    f30, 128(SP)
    stfd    f31, 136(SP)

    /* Save r14-r31. */
    std     r14, 280(SP)
    std     r15, 272(SP)
    std     r16, 264(SP)
    std     r17, 256(SP)
    std     r18, 248(SP)
    std     r19, 240(SP)
    std     r20, 232(SP)
    std     r21, 224(SP)
    std     r22, 216(SP)
    std     r23, 208(SP)
    std     r24, 200(SP)
    std     r25, 192(SP)
    std     r26, 184(SP)
    std     r27, 176(SP)
    std     r28, 168(SP)
    std     r29, 160(SP)
    std     r30, 152(SP)
    std     r31, 144(SP)

    /* Save vs52-vs63. */
    stxv    vs52, 288(SP)
    stxv    vs53, 304(SP)
    stxv    vs54, 320(SP)
    stxv    vs55, 336(SP)
    stxv    vs56, 352(SP)
    stxv    vs57, 368(SP)
    stxv    vs58, 384(SP)
    stxv    vs59, 400(SP)
    stxv    vs60, 416(SP)
    stxv    vs61, 432(SP)
    stxv    vs62, 448(SP)
    stxv    vs63, 464(SP)

#ifdef TRMMKERNEL
    /* OFFSET is read from the caller's frame, just above our save area. */
    ld      OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

    /* LDC *= 4 (presumably element count -> byte stride for 4-byte floats). */
    slwi    LDC, LDC, 2

    /* alpha arrives in f1: convert it to single precision and splat it. */
    xscvdpspn alpha_r, vs1
    xxspltw alpha_r, alpha_r, 0

    /*
     * Build the six 64-bit permute constants, 16 bits at a time:
     * highest/higher fill the low word, the rotate moves them to the upper
     * word, then h/l fill the low word again.
     * permute_mask is the reverse permute mask for big endian:
     *   uint128 = 0xc0d0e0f08090a0b0405060700010203
     */
    lis     T1, perm_const1@highest
    ori     T1, T1, perm_const1@higher
    rldicr  T1, T1, 32, 31
    oris    T1, T1, perm_const1@h
    ori     T1, T1, perm_const1@l

    lis     T2, perm_const2@highest
    ori     T2, T2, perm_const2@higher
    rldicr  T2, T2, 32, 31
    oris    T2, T2, perm_const2@h
    ori     T2, T2, perm_const2@l

    lis     T3, save_permute_12@highest
    ori     T3, T3, save_permute_12@higher
    rldicr  T3, T3, 32, 31
    oris    T3, T3, save_permute_12@h
    ori     T3, T3, save_permute_12@l

    lis     T4, save_permute_11@highest
    ori     T4, T4, save_permute_11@higher
    rldicr  T4, T4, 32, 31
    oris    T4, T4, save_permute_11@h
    ori     T4, T4, save_permute_11@l

    lis     T5, save_permute_22@highest
    ori     T5, T5, save_permute_22@higher
    rldicr  T5, T5, 32, 31
    oris    T5, T5, save_permute_22@h
    ori     T5, T5, save_permute_22@l

    lis     T6, save_permute_21@highest
    ori     T6, T6, save_permute_21@higher
    rldicr  T6, T6, 32, 31
    oris    T6, T6, save_permute_21@h
    ori     T6, T6, save_permute_21@l

    li      r0, 0

    /* Pair the halves into 128-bit vectors (first GPR = upper doubleword). */
    mtvsrdd permute_mask, T2, T1
    mtvsrdd save_permute_1, T3, T4
    mtvsrdd save_permute_2, T5, T6

#include "sgemm_logic_power9.S"

.L999:
    /* Common exit: restore everything the prologue saved. */
    ld      r0, FLINK_SAVE(SP)
    mtlr    r0

    lfd     f14,   0(SP)
    lfd     f15,   8(SP)
    lfd     f16,  16(SP)
    lfd     f17,  24(SP)
    lfd     f18,  32(SP)
    lfd     f19,  40(SP)
    lfd     f20,  48(SP)
    lfd     f21,  56(SP)
    lfd     f22,  64(SP)
    lfd     f23,  72(SP)
    lfd     f24,  80(SP)
    lfd     f25,  88(SP)
    lfd     f26,  96(SP)
    lfd     f27, 104(SP)
    lfd     f28, 112(SP)
    lfd     f29, 120(SP)
    lfd     f30, 128(SP)
    lfd     f31, 136(SP)

    ld      r14, 280(SP)
    ld      r15, 272(SP)
    ld      r16, 264(SP)
    ld      r17, 256(SP)
    ld      r18, 248(SP)
    ld      r19, 240(SP)
    ld      r20, 232(SP)
    ld      r21, 224(SP)
    ld      r22, 216(SP)
    ld      r23, 208(SP)
    ld      r24, 200(SP)
    ld      r25, 192(SP)
    ld      r26, 184(SP)
    ld      r27, 176(SP)
    ld      r28, 168(SP)
    ld      r29, 160(SP)
    ld      r30, 152(SP)
    ld      r31, 144(SP)

    lxv     vs52, 288(SP)
    lxv     vs53, 304(SP)
    lxv     vs54, 320(SP)
    lxv     vs55, 336(SP)
    lxv     vs56, 352(SP)
    lxv     vs57, 368(SP)
    lxv     vs58, 384(SP)
    lxv     vs59, 400(SP)
    lxv     vs60, 416(SP)
    lxv     vs61, 432(SP)
    lxv     vs62, 448(SP)
    lxv     vs63, 464(SP)

    /* Release the save area and return. */
    addi    SP, SP, STACKSIZE
    blr

    EPILOGUE
#endif
| @@ -1,470 +1,470 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ){ | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ){ | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| #endif | |||
| @@ -1,484 +1,484 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i] ; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| BLASLONG i; | |||
| for (i = 0; i < n; i++) { | |||
| *dest++ = *src; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| xbuffer = buffer; | |||
| n1 = n >> 3; | |||
| n2 = n & 7; | |||
| m3 = m & 3; | |||
| m1 = m - m3; | |||
| m2 = (m & (NBMAX - 1)) - m3; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if (inc_x != 1) | |||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||
| else | |||
| xbuffer = x_ptr; | |||
| BLASLONG lda8 = lda << 3; | |||
| if (inc_y == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| ybuffer[4] = 0; | |||
| ybuffer[5] = 0; | |||
| ybuffer[6] = 0; | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[4]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[6]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| } | |||
| if (n2 & 2) { | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if (m3 == 3) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| aj += 2; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 1 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j + 1] += aj[j + 1] * xtemp; | |||
| y_ptr[j + 2] += aj[j + 2] * xtemp; | |||
| y_ptr[j + 3] += aj[j + 3] * xtemp; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| /* | |||
|  * sgemv_kernel_4x8: for k = 0..7, adds alpha * dot(x, ap + k * lda) to y[k]. | |||
|  * Each dot product consumes n / 4 vectors of four floats; x and the eight | |||
|  * lda-strided runs of ap are read through __vector float pointers. | |||
|  * Any n % 4 remainder is not touched here. | |||
|  */ | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
|     /* Start of each of the eight runs, lda elements apart. */ | |||
|     FLOAT *p0 = ap; | |||
|     FLOAT *p1 = p0 + lda; | |||
|     FLOAT *p2 = p1 + lda; | |||
|     FLOAT *p3 = p2 + lda; | |||
|     FLOAT *p4 = p3 + lda; | |||
|     FLOAT *p5 = p4 + lda; | |||
|     FLOAT *p6 = p5 + lda; | |||
|     FLOAT *p7 = p6 + lda; | |||
|     /* Vector views of the runs and of x. */ | |||
|     __vector float *vp0 = (__vector float *) p0; | |||
|     __vector float *vp1 = (__vector float *) p1; | |||
|     __vector float *vp2 = (__vector float *) p2; | |||
|     __vector float *vp3 = (__vector float *) p3; | |||
|     __vector float *vp4 = (__vector float *) p4; | |||
|     __vector float *vp5 = (__vector float *) p5; | |||
|     __vector float *vp6 = (__vector float *) p6; | |||
|     __vector float *vp7 = (__vector float *) p7; | |||
|     __vector float *vx = (__vector float *) x; | |||
|     /* One four-lane accumulator per run. */ | |||
|     register __vector float acc0 = {0,0,0,0}; | |||
|     register __vector float acc1 = {0,0,0,0}; | |||
|     register __vector float acc2 = {0,0,0,0}; | |||
|     register __vector float acc3 = {0,0,0,0}; | |||
|     register __vector float acc4 = {0,0,0,0}; | |||
|     register __vector float acc5 = {0,0,0,0}; | |||
|     register __vector float acc6 = {0,0,0,0}; | |||
|     register __vector float acc7 = {0,0,0,0}; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         acc0 += vx[k] * vp0[k]; | |||
|         acc1 += vx[k] * vp1[k]; | |||
|         acc2 += vx[k] * vp2[k]; | |||
|         acc3 += vx[k] * vp3[k]; | |||
|         acc4 += vx[k] * vp4[k]; | |||
|         acc5 += vx[k] * vp5[k]; | |||
|         acc6 += vx[k] * vp6[k]; | |||
|         acc7 += vx[k] * vp7[k]; | |||
|     } | |||
|     /* Horizontal sum of each accumulator (lanes 0..3, left to right), scaled by alpha. */ | |||
|     y[0] += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]); | |||
|     y[1] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); | |||
|     y[2] += alpha * (acc2[0] + acc2[1] + acc2[2] + acc2[3]); | |||
|     y[3] += alpha * (acc3[0] + acc3[1] + acc3[2] + acc3[3]); | |||
|     y[4] += alpha * (acc4[0] + acc4[1] + acc4[2] + acc4[3]); | |||
|     y[5] += alpha * (acc5[0] + acc5[1] + acc5[2] + acc5[3]); | |||
|     y[6] += alpha * (acc6[0] + acc6[1] + acc6[2] + acc6[3]); | |||
|     y[7] += alpha * (acc7[0] + acc7[1] + acc7[2] + acc7[3]); | |||
| } | |||
| /* | |||
|  * sgemv_kernel_4x4: for k = 0..3, adds alpha * dot(x, ap + k * lda) to y[k]. | |||
|  * n / 4 vectors of four floats are consumed from x and from each run of ap; | |||
|  * any n % 4 remainder is not touched here. | |||
|  */ | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
|     /* Vector views of the four lda-strided runs and of x. */ | |||
|     __vector float *vp0 = (__vector float *) ap; | |||
|     __vector float *vp1 = (__vector float *) (ap + lda); | |||
|     __vector float *vp2 = (__vector float *) (ap + lda + lda); | |||
|     __vector float *vp3 = (__vector float *) (ap + lda + lda + lda); | |||
|     __vector float *vx = (__vector float *) x; | |||
|     register __vector float acc0 = {0,0,0,0}; | |||
|     register __vector float acc1 = {0,0,0,0}; | |||
|     register __vector float acc2 = {0,0,0,0}; | |||
|     register __vector float acc3 = {0,0,0,0}; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         acc0 += vx[k] * vp0[k]; | |||
|         acc1 += vx[k] * vp1[k]; | |||
|         acc2 += vx[k] * vp2[k]; | |||
|         acc3 += vx[k] * vp3[k]; | |||
|     } | |||
|     /* Reduce the four lanes of each accumulator and apply alpha. */ | |||
|     y[0] += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]); | |||
|     y[1] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); | |||
|     y[2] += alpha * (acc2[0] + acc2[1] + acc2[2] + acc2[3]); | |||
|     y[3] += alpha * (acc3[0] + acc3[1] + acc3[2] + acc3[3]); | |||
| } | |||
| /* | |||
|  * sgemv_kernel_4x2: adds alpha * dot(x, ap) to y[0] and | |||
|  * alpha * dot(x, ap + lda) to y[inc_y].  n / 4 vectors of four floats are | |||
|  * consumed from each operand; any n % 4 remainder is not touched here. | |||
|  */ | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
|     __vector float *vp0 = (__vector float *) ap; | |||
|     __vector float *vp1 = (__vector float *) (ap + lda); | |||
|     __vector float *vx = (__vector float *) x; | |||
|     __vector float acc0 = {0,0,0,0}; | |||
|     __vector float acc1 = {0,0,0,0}; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         acc0 += vx[k] * vp0[k]; | |||
|         acc1 += vx[k] * vp1[k]; | |||
|     } | |||
|     /* The two results are inc_y elements apart in y. */ | |||
|     y[0] += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]); | |||
|     y[inc_y] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]); | |||
| } | |||
| /* | |||
|  * sgemv_kernel_4x1: adds alpha * dot(x, ap) to y[0], consuming n / 4 vectors | |||
|  * of four floats from each operand; any n % 4 remainder is not touched here. | |||
|  */ | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
|     __vector float *vp = (__vector float *) ap; | |||
|     __vector float *vx = (__vector float *) x; | |||
|     __vector float acc = {0,0,0,0}; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         acc += vx[k] * vp[k]; | |||
|     } | |||
|     y[0] += alpha * (acc[0] + acc[1] + acc[2] + acc[3]); | |||
| } | |||
| /* | |||
|  * copy_x: gathers n elements of src, taken inc_src elements apart, into the | |||
|  * contiguous array dest.  Does nothing when n <= 0. | |||
|  */ | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
|     BLASLONG k; | |||
|     for (k = 0; k < n; k++) { | |||
|         dest[k] = src[k * inc_src]; | |||
|     } | |||
| } | |||
| /* | |||
|  * Transposed single-precision GEMV entry point.  For 0 <= j < n it performs | |||
|  *     y[j * inc_y] += alpha * sum over 0 <= i < m of a[i + j * lda] * x[i * inc_x] | |||
|  * | |||
|  * The m dimension is walked in panels of at most NBMAX elements (each panel a | |||
|  * multiple of four), handed to the sgemv_kernel_4x{8,4,2,1} helpers; the last | |||
|  * m & 3 elements are folded in with scalar code.  When inc_x != 1 the current | |||
|  * panel of x is first packed into `buffer`.  dummy1 is not used. | |||
|  * Returns 0 in every case, immediately so when m < 1 or n < 1. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
|     FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
|     FLOAT *xbuffer; | |||
|     FLOAT *a_cur; | |||
|     FLOAT *y_cur; | |||
|     BLASLONG blk; | |||
|     BLASLONG k; | |||
|     BLASLONG j; | |||
|     if (m < 1) return (0); | |||
|     if (n < 1) return (0); | |||
|     xbuffer = buffer; | |||
|     const BLASLONG groups8 = n >> 3;                        /* full groups of eight along n */ | |||
|     const BLASLONG n_rest = n & 7;                          /* leftover along n, handled as 4 + 2 + 1 */ | |||
|     const BLASLONG m_tail = m & 3;                          /* elements left for the scalar tail */ | |||
|     const BLASLONG last_panel = (m & (NBMAX - 1)) - m_tail; /* size of the final, partial panel */ | |||
|     BLASLONG m_left = m - m_tail; | |||
|     BLASLONG NB; | |||
|     /* Full NBMAX panels first; the first short panel (if any) ends the loop. */ | |||
|     for (NB = NBMAX; NB == NBMAX; ) { | |||
|         m_left -= NB; | |||
|         if (m_left < 0) { | |||
|             if (last_panel == 0) { | |||
|                 break; | |||
|             } | |||
|             NB = last_panel; | |||
|         } | |||
|         y_cur = y; | |||
|         a_cur = a; | |||
|         /* Unit-stride x is used in place; otherwise pack this panel. */ | |||
|         if (inc_x == 1) { | |||
|             xbuffer = x; | |||
|         } else { | |||
|             copy_x(NB, x, xbuffer, inc_x); | |||
|         } | |||
|         const BLASLONG lda8 = lda << 3; | |||
|         for (blk = 0; blk < groups8; blk++) { | |||
|             if (inc_y == 1) { | |||
|                 sgemv_kernel_4x8(NB, lda, a_cur, xbuffer, y_cur, alpha); | |||
|                 y_cur += 8; | |||
|             } else { | |||
|                 /* Strided y: accumulate into a zeroed scratch, then scatter. */ | |||
|                 for (k = 0; k < 8; k++) { | |||
|                     ybuffer[k] = 0; | |||
|                 } | |||
|                 sgemv_kernel_4x8(NB, lda, a_cur, xbuffer, ybuffer, alpha); | |||
|                 for (k = 0; k < 8; k++) { | |||
|                     *y_cur += ybuffer[k]; | |||
|                     y_cur += inc_y; | |||
|                 } | |||
|             } | |||
|             a_cur += lda8; | |||
|         } | |||
|         if (n_rest & 4) { | |||
|             for (k = 0; k < 4; k++) { | |||
|                 ybuffer[k] = 0; | |||
|             } | |||
|             sgemv_kernel_4x4(NB, lda, a_cur, xbuffer, ybuffer, alpha); | |||
|             a_cur += lda << 2; | |||
|             for (k = 0; k < 4; k++) { | |||
|                 *y_cur += ybuffer[k]; | |||
|                 y_cur += inc_y; | |||
|             } | |||
|         } | |||
|         if (n_rest & 2) { | |||
|             sgemv_kernel_4x2(NB, lda, a_cur, xbuffer, y_cur, alpha, inc_y); | |||
|             a_cur += lda << 1; | |||
|             y_cur += 2 * inc_y; | |||
|         } | |||
|         if (n_rest & 1) { | |||
|             sgemv_kernel_4x1(NB, a_cur, xbuffer, y_cur, alpha); | |||
|         } | |||
|         /* Step to the next panel. */ | |||
|         a += NB; | |||
|         x += NB * inc_x; | |||
|     } | |||
|     if (m_tail == 0) return (0); | |||
|     /* Scalar tail: a and x now point at the last m_tail elements. */ | |||
|     const BLASLONG n4 = n & -4; | |||
|     FLOAT *aj = a; | |||
|     y_cur = y; | |||
|     if (m_tail == 3) { | |||
|         const FLOAT x0 = x[0] * alpha; | |||
|         const FLOAT x1 = x[inc_x] * alpha; | |||
|         const FLOAT x2 = x[2 * inc_x] * alpha; | |||
|         if (inc_y != 1) { | |||
|             for (j = 0; j < n; j++) { | |||
|                 *y_cur += aj[0] * x0 + aj[1] * x1 + aj[2] * x2; | |||
|                 y_cur += inc_y; | |||
|                 aj += lda; | |||
|             } | |||
|         } else if (lda == 3) { | |||
|             /* Densely packed case: four groups of three per iteration. */ | |||
|             for (j = 0; j < n4; j += 4) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1 + aj[2] * x2; | |||
|                 y_cur[j + 1] += aj[3] * x0 + aj[4] * x1 + aj[5] * x2; | |||
|                 y_cur[j + 2] += aj[6] * x0 + aj[7] * x1 + aj[8] * x2; | |||
|                 y_cur[j + 3] += aj[9] * x0 + aj[10] * x1 + aj[11] * x2; | |||
|                 aj += 12; | |||
|             } | |||
|             for (; j < n; j++) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1 + aj[2] * x2; | |||
|                 aj += 3; | |||
|             } | |||
|         } else { | |||
|             const BLASLONG lda2 = lda << 1; | |||
|             const BLASLONG lda3 = lda2 + lda; | |||
|             const BLASLONG lda4 = lda << 2; | |||
|             for (j = 0; j < n4; j += 4) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1 + aj[2] * x2; | |||
|                 y_cur[j + 1] += aj[lda] * x0 + aj[lda + 1] * x1 + aj[lda + 2] * x2; | |||
|                 y_cur[j + 2] += aj[lda2] * x0 + aj[lda2 + 1] * x1 + aj[lda2 + 2] * x2; | |||
|                 y_cur[j + 3] += aj[lda3] * x0 + aj[lda3 + 1] * x1 + aj[lda3 + 2] * x2; | |||
|                 aj += lda4; | |||
|             } | |||
|             for (; j < n; j++) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1 + aj[2] * x2; | |||
|                 aj += lda; | |||
|             } | |||
|         } | |||
|     } else if (m_tail == 2) { | |||
|         const FLOAT x0 = x[0] * alpha; | |||
|         const FLOAT x1 = x[inc_x] * alpha; | |||
|         if (inc_y != 1) { | |||
|             for (j = 0; j < n; j++) { | |||
|                 *y_cur += aj[0] * x0 + aj[1] * x1; | |||
|                 y_cur += inc_y; | |||
|                 aj += lda; | |||
|             } | |||
|         } else if (lda == 2) { | |||
|             for (j = 0; j < n4; j += 4) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1; | |||
|                 y_cur[j + 1] += aj[2] * x0 + aj[3] * x1; | |||
|                 y_cur[j + 2] += aj[4] * x0 + aj[5] * x1; | |||
|                 y_cur[j + 3] += aj[6] * x0 + aj[7] * x1; | |||
|                 aj += 8; | |||
|             } | |||
|             for (; j < n; j++) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1; | |||
|                 aj += 2; | |||
|             } | |||
|         } else { | |||
|             const BLASLONG lda2 = lda << 1; | |||
|             const BLASLONG lda3 = lda2 + lda; | |||
|             const BLASLONG lda4 = lda << 2; | |||
|             for (j = 0; j < n4; j += 4) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1; | |||
|                 y_cur[j + 1] += aj[lda] * x0 + aj[lda + 1] * x1; | |||
|                 y_cur[j + 2] += aj[lda2] * x0 + aj[lda2 + 1] * x1; | |||
|                 y_cur[j + 3] += aj[lda3] * x0 + aj[lda3 + 1] * x1; | |||
|                 aj += lda4; | |||
|             } | |||
|             for (; j < n; j++) { | |||
|                 y_cur[j] += aj[0] * x0 + aj[1] * x1; | |||
|                 aj += lda; | |||
|             } | |||
|         } | |||
|     } else { | |||
|         /* m_tail == 1 */ | |||
|         const FLOAT x0 = x[0] * alpha; | |||
|         if (inc_y != 1) { | |||
|             for (j = 0; j < n; j++) { | |||
|                 *y_cur += aj[0] * x0; | |||
|                 y_cur += inc_y; | |||
|                 aj += lda; | |||
|             } | |||
|         } else if (lda == 1) { | |||
|             for (j = 0; j < n4; j += 4) { | |||
|                 y_cur[j] += aj[j] * x0; | |||
|                 y_cur[j + 1] += aj[j + 1] * x0; | |||
|                 y_cur[j + 2] += aj[j + 2] * x0; | |||
|                 y_cur[j + 3] += aj[j + 3] * x0; | |||
|             } | |||
|             for (; j < n; j++) { | |||
|                 y_cur[j] += aj[j] * x0; | |||
|             } | |||
|         } else { | |||
|             const BLASLONG lda2 = lda << 1; | |||
|             const BLASLONG lda3 = lda2 + lda; | |||
|             const BLASLONG lda4 = lda << 2; | |||
|             for (j = 0; j < n4; j += 4) { | |||
|                 y_cur[j] += aj[0] * x0; | |||
|                 y_cur[j + 1] += aj[lda] * x0; | |||
|                 y_cur[j + 2] += aj[lda2] * x0; | |||
|                 y_cur[j + 3] += aj[lda3] * x0; | |||
|                 aj += lda4; | |||
|             } | |||
|             for (; j < n; j++) { | |||
|                 y_cur[j] += aj[0] * x0; | |||
|                 aj += lda; | |||
|             } | |||
|         } | |||
|     } | |||
|     return (0); | |||
| } | |||
| #endif | |||
| @@ -1,245 +1,245 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER /* defined before common.h is included; NOTE(review): presumably selects the assembler-side view of that header - confirm */ | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* alias for the doubleword load mnemonic */ | |||
| #define STACKSIZE 512 /* bytes the prologue subtracts from SP for the save area */ | |||
| #define FZERO 312+192(SP) /* stack-relative operand; NOTE(review): no use is visible in this file - may be consumed by the included macro/logic files */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 /* register aliases from here on; they are shared with the .S files included by the kernel body */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 /* reloaded from the caller's frame in the prologue on linux/FreeBSD builds */ | |||
| #define OFFSET r7 /* loaded only in TRMMKERNEL builds */ | |||
| #define o0 0 | |||
| #define alpha_r vs30 /* receives vs1 via xxspltd in the prologue */ | |||
| #define alpha_i vs31 /* receives vs2 via xxspltd in the prologue */ | |||
| #define VECSAVE r11 | |||
| #define FRAMEPOINTER r12 /* holds the SP value from before the frame is allocated */ | |||
| #define T10 r14 | |||
| #define L r15 | |||
| #define T8 r16 | |||
| #define T5 r17 | |||
| #define T2 r19 | |||
| #define TEMP_REG r20 | |||
| #define T6 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T7 r27 | |||
| #define T3 r28 | |||
| #define T4 r29 | |||
| #define PRE r30 /* set to 512 before the logic file is included */ | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM /* the kernel body is emitted only when NEEDPARAM is not defined */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| mr FRAMEPOINTER, SP /* keep the incoming SP in r12 before the frame is allocated */ | |||
| addi SP, SP, -STACKSIZE /* allocate the STACKSIZE-byte save area */ | |||
| mflr r0 /* link register -> r0; stored at FLINK_SAVE(SP) further down */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||
| std r31, 144(SP) /* save r14-r31 at 144..280(SP) */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* save vs52-vs63 at 288..464(SP) */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) /* saved LR lands 16 bytes above the incoming SP */ | |||
| #if defined(linux) || defined(__FreeBSD__) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) /* LDC is read from the caller's frame; FRAMESLOT is defined outside this file */ | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) /* TRMM builds also read OFFSET from the caller's frame */ | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
| slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT; NOTE(review): presumably converts an element count to a byte stride - confirm */ | |||
| li PRE, 512 /* PRE = 512; NOTE(review): presumably a prefetch offset used by the included logic - confirm */ | |||
| li r0, 0 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegdp alpha_r,alpha_r | |||
| xvnegdp alpha_i,alpha_i | |||
| #endif | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" /* NOTE(review): the compute loops appear to live in this include - confirm */ | |||
| L999: /* exit path: restore the saved registers, release the frame, return */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) /* restore r14-r31 */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) /* reload the saved link-register value */ | |||
| lxv vs52, 288(SP) /* restore vs52-vs63 (continues after mtlr) */ | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 /* put the return address back in LR */ | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER /* defined before common.h is included; NOTE(review): presumably selects the assembler-side view of that header - confirm */ | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* alias for the doubleword load mnemonic */ | |||
| #define STACKSIZE 512 /* bytes the prologue subtracts from SP for the save area */ | |||
| #define FZERO 312+192(SP) /* stack-relative operand; NOTE(review): no use is visible in this file - may be consumed by the included macro/logic files */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 /* register aliases from here on; they are shared with the .S files included by the kernel body */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 /* reloaded from the caller's frame in the prologue on linux/FreeBSD builds */ | |||
| #define OFFSET r7 /* loaded only in TRMMKERNEL builds */ | |||
| #define o0 0 | |||
| #define alpha_r vs30 /* receives vs1 via xxspltd in the prologue */ | |||
| #define alpha_i vs31 /* receives vs2 via xxspltd in the prologue */ | |||
| #define VECSAVE r11 | |||
| #define FRAMEPOINTER r12 /* holds the SP value from before the frame is allocated */ | |||
| #define T10 r14 | |||
| #define L r15 | |||
| #define T8 r16 | |||
| #define T5 r17 | |||
| #define T2 r19 | |||
| #define TEMP_REG r20 | |||
| #define T6 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T7 r27 | |||
| #define T3 r28 | |||
| #define T4 r29 | |||
| #define PRE r30 /* set to 512 before the logic file is included */ | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM /* the kernel body is emitted only when NEEDPARAM is not defined */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| mr FRAMEPOINTER, SP /* keep the incoming SP in r12 before the frame is allocated */ | |||
| addi SP, SP, -STACKSIZE /* allocate the STACKSIZE-byte save area */ | |||
| mflr r0 /* link register -> r0; stored at FLINK_SAVE(SP) further down */ | |||
| stfd f14, 0(SP) /* save f14-f31 at 0..136(SP) */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||
| std r31, 144(SP) /* save r14-r31 at 144..280(SP) */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* save vs52-vs63 at 288..464(SP) */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) /* saved LR lands 16 bytes above the incoming SP */ | |||
| #if defined(linux) || defined(__FreeBSD__) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) /* LDC is read from the caller's frame; FRAMESLOT is defined outside this file */ | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) /* TRMM builds also read OFFSET from the caller's frame */ | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
| slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT; NOTE(review): presumably converts an element count to a byte stride - confirm */ | |||
| li PRE, 512 /* PRE = 512; NOTE(review): presumably a prefetch offset used by the included logic - confirm */ | |||
| li r0, 0 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegdp alpha_r,alpha_r | |||
| xvnegdp alpha_i,alpha_i | |||
| #endif | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" /* NOTE(review): the compute loops appear to live in this include - confirm */ | |||
| L999: /* exit path: restore the saved registers, release the frame, return */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) /* restore r14-r31 */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) /* reload the saved link-register value */ | |||
| lxv vs52, 288(SP) /* restore vs52-vs63 (continues after mtlr) */ | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 /* put the return address back in LR */ | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,279 +1,279 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RN_m8n4 /* asm text for the m8 x n4 case (per the name - TODO confirm): copy %2 to %3, run GEMM_SUM_REORDER_8x4, re-copy %2 to %3 and advance %2 by 32, then chain the SOLVE_/SUBTRACT_/SAVE_SOLUTION_ m8n2 helpers, which are defined outside this view */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64) | |||
| #define SOLVE_RN_m8n8 /* m8 x n8 variant (per the name - TODO confirm): same chain as SOLVE_RN_m8n4 with a second GEMM_SUM_REORDER_8x4 on 8-11; helper calls for that second group pass %%r12 and 4 as extra arguments (presumably an index register and scale - confirm) */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192) | |||
| #define SOLVE_RN_m8n12 /* m8 x n12 variant (per the name - TODO confirm): three GEMM_SUM_REORDER_8x4 groups (4-7, 8-11, 12-15); helper calls for the second and third groups pass %%r12 with 4 and 8 respectively (presumably index register and scale - confirm) */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192)\ | |||
| SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(12,13,256)\ | |||
| SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(14,15,320) | |||
| #define SOLVE_RN_m4n4 /* asm text for the m4 x n4 case (per the name - TODO confirm): copy %2 to %3, run GEMM_SUM_REORDER_4x4, re-copy %2 to %3 and advance %2 by 16, then chain the SOLVE_/SUBTRACT_/SAVE_SOLUTION_ m4n2 helpers, which are defined outside this view */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1)\ | |||
| SOLVE_ri_m4n2(56,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,32) | |||
| #define SOLVE_RN_m4n8 /* m4 x n8 variant (per the name - TODO confirm): same chain as SOLVE_RN_m4n4 with a second GEMM_SUM_REORDER_4x4 whose last two arguments are 6,7; helper calls on 6 and 7 pass %%r12 and 4 as extra arguments (presumably index register and scale - confirm) */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(7,96) | |||
| #define SOLVE_RN_m4n12 /* m4 x n12 variant (per the name - TODO confirm): three GEMM_SUM_REORDER_4x4 groups ending in 4,5 / 6,7 / 8,9; helper calls on 6,7 pass %%r12 with 4 and those on 8,9 pass %%r12 with 8 (presumably index register and scale - confirm) */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(7,96)\ | |||
| SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(8,128)\ | |||
| SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(9,160) | |||
| #define SOLVE_RN_m2n4 /* asm text for the m2 x n4 case (per the name - TODO confirm): copy %2 to %3, run GEMM_SUM_REORDER_2x4, re-copy %2 to %3 and advance %2 by 8, then invoke SOLVE_col1..col4_ltor_m2n4 at offsets 0/16/32/48 and save; the helpers are defined outside this view */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0) | |||
| #define SOLVE_RN_m2n8 /* m2 x n8 variant (per the name - TODO confirm): each SOLVE_colN step on 4,5 is followed by a SUBTRACT_m2n4 on 6,7, then 6,7 get their own four SOLVE_colN steps; calls on 6,7 pass %%r12 and 4 (presumably index register and scale - confirm) */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32) | |||
| #define SOLVE_RN_m2n12 /* m2 x n12 variant (per the name - TODO confirm): three register pairs (4,5 / 6,7 / 8,9); each SOLVE_colN step is followed by SUBTRACT_m2n4 on the pairs not yet solved; calls on 6,7 pass %%r12 with 4 and those on 8,9 pass %%r12 with 8 (presumably index register and scale - confirm) */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32)\ | |||
| SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(8,9,64) | |||
/*
 * SOLVE_RN_m1n4 / m1n8 / m1n12 -- inline-asm string fragments that COMPUTE pastes
 * directly after GEMM_RN_m1n<ndim> to finish a single-row tile of the RN solve.
 * Operand numbers follow COMPUTE's constraint list: %1 = b_ptr, %2 = c_ptr,
 * %3 = c_tmp; %%r12 holds K * 4.
 * Every variant copies %2 into %3, applies GEMM_SUM_REORDER_1x4 to each register,
 * copies %2 into %3 again and advances %2 by 4 bytes (one float).
 * One register (4, then 5, then 6) carries each 4-column group; it is solved by
 * SOLVE_col1..col4_ltor_m1n4 at byte offsets growing by 16, SUBTRACT_m1n4 updates
 * the registers of the groups still to come, and SAVE_SOLUTION_m1n4 stores it
 * (second argument grows by 16).
 * NOTE(review): helper macros are not defined in this file; the trailing
 * "%%r12,4" / "%%r12,8" arguments presumably select the 2nd / 3rd 4-column panel
 * of packed B -- confirm against the common header.
 */
| #define SOLVE_RN_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,0) | |||
/* Two 4-column groups: register 4 is solved first and subtracted from register 5. */
| #define SOLVE_RN_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(5,16) | |||
/* Three 4-column groups in registers 4, 5, 6, solved in that order. */
| #define SOLVE_RN_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(5,16)\ | |||
| SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(6,32) | |||
/*
 * GEMM_RN_SIMPLE(mdim,ndim) -- asm fragment that accumulates the GEMM update for
 * one mdim x ndim tile before the matching SOLVE_RN_m<mdim>n<ndim> runs.
 * Register roles come from COMPUTE: %%r15 = running A pointer, %%r14 = B base,
 * %%r13 = OFF (iteration count), %%r12 = K * 4.
 * Steps: %0 <- %%r15, then %%r15 += %%r12 * mdim (mdim * K floats, so it already
 * points at the next tile's A panel); %5 <- %%r13; %1 <- %%r14; INIT_m*n* runs;
 * the loop is skipped when %5 is zero, otherwise each pass issues
 * GEMM_KERNEL_k1m*n*, advances %1 by 16 bytes and %0 by mdim*4 bytes, and
 * decrements %5. Local labels are "1<mdim><ndim>1" (loop) and "1<mdim><ndim>2" (exit).
 * mdim is used as the LEA scale factor, so it must be 1, 2, 4 or 8.
 */
| #define GEMM_RN_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) | |||
| #define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) | |||
/*
 * GEMM_RN_m8n12 -- hand-unrolled variant of the loop above for the 8x12 tile.
 * Same setup as GEMM_RN_SIMPLE(8,12); while at least 8 iterations remain in %5 the
 * body runs GEMM_KERNEL_k1m8n12 eight times (each followed by %0 += 32, %1 += 16),
 * issuing "prefetcht0 384(%0)" on every other step, then subtracts 8 from %5.
 * Labels 18123/18124 form the one-step-at-a-time remainder loop.
 */
| #define GEMM_RN_m8n12 \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
/* Remaining tile shapes all use the simple, non-unrolled loop. */
| #define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) | |||
| #define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) | |||
| #define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) | |||
| #define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) | |||
| #define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) | |||
| #define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) | |||
| #define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) | |||
| #define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) | |||
| #define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) | |||
/*
 * COMPUTE(ndim) -- process one ndim-column strip (ndim = 4, 8 or 12) of C for all
 * M rows in a single inline-asm statement, then step the C-level pointers to the
 * next strip. It expands to a braced block and relies on these locals being in
 * scope at the expansion site: a_ptr, b_ptr, c_ptr, c_tmp, ldc_bytes, k_cnt, K,
 * OFF, one, zero, M and ldc.
 * Operands: %0 a_ptr, %1 b_ptr, %2 c_ptr, %3 c_tmp, %4 ldc_bytes, %5 k_cnt,
 * %6 K, %7 OFF, %8 one[0], %9 zero[0], %10 M.
 * Scratch registers: %%r15 = a_ptr, %%r14 = b_ptr, %%r13 = OFF, %%r12 = K << 2,
 * %%r11 = rows left (starts at M).
 * Row tiling: 8-row tiles while %%r11 >= 8, then at most one 4-, 2- and 1-row tile
 * chosen by testing bits 4, 2 and 1 of %%r11; each tile is GEMM_RN_m*n<ndim>
 * followed by SOLVE_RN_m*n<ndim>. Labels are prefixed with ndim.
 * On exit %0 and %1 are reloaded from %%r15 / %%r14 and vzeroupper is issued; the
 * trailing C statement rewinds a_ptr by M*K, advances b_ptr by ndim*K, moves c_ptr
 * to the next strip (ldc*ndim - M) and adds ndim to OFF.
 */
| #define COMPUTE(ndim) {\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ | |||
| } | |||
| static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=0; i<n; i++) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0; j<m; j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); | |||
| return 0; | |||
| } | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
/*
 * SOLVE_RN_m8n4 / m8n8 / m8n12 -- inline-asm string fragments that COMPUTE pastes
 * directly after GEMM_RN_m8n<ndim> to finish one 8-row tile of the RN solve.
 * Operand numbers follow COMPUTE's constraint list: %1 = b_ptr, %2 = c_ptr,
 * %3 = c_tmp; %%r12 holds K * 4.
 * Every variant copies %2 into %3, applies GEMM_SUM_REORDER_8x4 to each group of
 * four registers, copies %2 into %3 again and advances %2 by 32 bytes (8 floats).
 * Columns are then handled two at a time, left to right: a register pair is
 * processed by SOLVE_leri_m8n2 followed by SOLVE_ri_m8n2, SUBTRACT_m8n2 updates the
 * pairs to its right, and SAVE_SOLUTION_m8n2 stores it (third argument grows by 64).
 * NOTE(review): helper macros are defined outside this file's visible part
 * (presumably in strsm_kernel_8x4_haswell_R_common.h); the meaning of the constant
 * 63 and of the trailing "%%r12,4" / "%%r12,8" arguments (presumably the 2nd / 3rd
 * 4-column panel of packed B) should be confirmed there.
 */
| #define SOLVE_RN_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64) | |||
/* Eight columns: pairs (4,5) (6,7) (8,9) (10,11), solved in that order. */
| #define SOLVE_RN_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192) | |||
/* Twelve columns: pairs (4,5) ... (14,15), solved in that order. */
| #define SOLVE_RN_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192)\ | |||
| SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(12,13,256)\ | |||
| SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(14,15,320) | |||
/*
 * SOLVE_RN_m4n4 / m4n8 / m4n12 -- inline-asm string fragments that COMPUTE pastes
 * directly after GEMM_RN_m4n<ndim> to finish one 4-row tile of the RN solve.
 * Operand numbers follow COMPUTE's constraint list: %1 = b_ptr, %2 = c_ptr,
 * %3 = c_tmp; %%r12 holds K * 4.
 * Every variant copies %2 into %3, applies GEMM_SUM_REORDER_4x4 to each group of
 * four accumulator registers, copies %2 into %3 again and advances %2 by 16 bytes
 * (4 floats). Columns are handled two at a time, left to right, one register per
 * column pair: SOLVE_leri_m4n2 then SOLVE_ri_m4n2, SUBTRACT_m4n2 on the registers
 * to the right, SAVE_SOLUTION_m4n2 to store (second argument grows by 32).
 * NOTE(review): GEMM_SUM_REORDER_4x4(a,b,c,d,x,y) presumably folds registers
 * a..d into x and y, which is why the solve steps use registers 4..9 only; the
 * trailing "%%r12,4" / "%%r12,8" arguments presumably select the 2nd / 3rd
 * 4-column panel of packed B -- confirm against the common header.
 */
| #define SOLVE_RN_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1)\ | |||
| SOLVE_ri_m4n2(56,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,32) | |||
/* Eight columns: registers 4, 5, 6, 7 (one per column pair), solved in that order. */
| #define SOLVE_RN_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(7,96) | |||
/* Twelve columns: registers 4..9 (one per column pair), solved in that order. */
| #define SOLVE_RN_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(7,96)\ | |||
| SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(8,128)\ | |||
| SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(9,160) | |||
/*
 * SOLVE_RN_m2n4 / m2n8 / m2n12 -- inline-asm string fragments that COMPUTE pastes
 * directly after GEMM_RN_m2n<ndim> to finish one 2-row tile of the RN solve.
 * Operand numbers are those of COMPUTE's constraint list: %1 = b_ptr, %2 = c_ptr,
 * %3 = c_tmp; %%r12 is loaded by COMPUTE with K shifted left by 2 (K * 4).
 * Every variant copies %2 into %3, applies GEMM_SUM_REORDER_2x4 to each register
 * pair, copies %2 into %3 again and then advances %2 by 8 bytes (two floats).
 * Each 4-column group is handled by SOLVE_col1..col4_ltor_m2n4 at byte offsets that
 * grow by 16, with SUBTRACT_m2n4 applied to the register pairs of the groups still
 * to come, and is finished by SAVE_SOLUTION_m2n4 (third argument grows by 32).
 * NOTE(review): the helper macros are not defined in this file (presumably in
 * strsm_kernel_8x4_haswell_R_common.h); the trailing "%%r12,4" / "%%r12,8"
 * arguments presumably address the 2nd / 3rd 4-column panel of packed B -- confirm.
 */
/* One 4-column group held in register pair (4,5). */
| #define SOLVE_RN_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0) | |||
/* Two 4-column groups: pair (4,5) is solved first and subtracted from (6,7). */
| #define SOLVE_RN_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32) | |||
/* Three 4-column groups in pairs (4,5), (6,7), (8,9), solved in that order. */
| #define SOLVE_RN_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32)\ | |||
| SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(8,9,64) | |||
/*
 * SOLVE_RN_m1n4 / m1n8 / m1n12 -- inline-asm string fragments that COMPUTE pastes
 * directly after GEMM_RN_m1n<ndim> to finish a single-row tile of the RN solve.
 * Operand numbers follow COMPUTE's constraint list: %1 = b_ptr, %2 = c_ptr,
 * %3 = c_tmp; %%r12 holds K * 4.
 * Every variant copies %2 into %3, applies GEMM_SUM_REORDER_1x4 to each register,
 * copies %2 into %3 again and advances %2 by 4 bytes (one float).
 * One register (4, then 5, then 6) carries each 4-column group; it is solved by
 * SOLVE_col1..col4_ltor_m1n4 at byte offsets growing by 16, SUBTRACT_m1n4 updates
 * the registers of the groups still to come, and SAVE_SOLUTION_m1n4 stores it
 * (second argument grows by 16).
 * NOTE(review): helper macros are not defined in this file; the trailing
 * "%%r12,4" / "%%r12,8" arguments presumably select the 2nd / 3rd 4-column panel
 * of packed B -- confirm against the common header.
 */
| #define SOLVE_RN_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,0) | |||
/* Two 4-column groups: register 4 is solved first and subtracted from register 5. */
| #define SOLVE_RN_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(5,16) | |||
/* Three 4-column groups in registers 4, 5, 6, solved in that order. */
| #define SOLVE_RN_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(5,16)\ | |||
| SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(6,32) | |||
/*
 * GEMM_RN_SIMPLE(mdim,ndim) -- asm fragment that accumulates the GEMM update for
 * one mdim x ndim tile before the matching SOLVE_RN_m<mdim>n<ndim> runs.
 * Register roles come from COMPUTE: %%r15 = running A pointer, %%r14 = B base,
 * %%r13 = OFF (iteration count), %%r12 = K * 4.
 * Steps: %0 <- %%r15, then %%r15 += %%r12 * mdim (mdim * K floats, so it already
 * points at the next tile's A panel); %5 <- %%r13; %1 <- %%r14; INIT_m*n* runs;
 * the loop is skipped when %5 is zero, otherwise each pass issues
 * GEMM_KERNEL_k1m*n*, advances %1 by 16 bytes and %0 by mdim*4 bytes, and
 * decrements %5. Local labels are "1<mdim><ndim>1" (loop) and "1<mdim><ndim>2" (exit).
 * mdim is used as the LEA scale factor, so it must be 1, 2, 4 or 8.
 */
| #define GEMM_RN_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) | |||
| #define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) | |||
/*
 * GEMM_RN_m8n12 -- hand-unrolled variant of the loop above for the 8x12 tile.
 * Same setup as GEMM_RN_SIMPLE(8,12); while at least 8 iterations remain in %5 the
 * body runs GEMM_KERNEL_k1m8n12 eight times (each followed by %0 += 32, %1 += 16),
 * issuing "prefetcht0 384(%0)" on every other step, then subtracts 8 from %5.
 * Labels 18123/18124 form the one-step-at-a-time remainder loop.
 */
| #define GEMM_RN_m8n12 \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
/* Remaining tile shapes all use the simple, non-unrolled loop. */
| #define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) | |||
| #define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) | |||
| #define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) | |||
| #define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) | |||
| #define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) | |||
| #define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) | |||
| #define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) | |||
| #define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) | |||
| #define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) | |||
/*
 * COMPUTE(ndim) -- process one ndim-column strip (ndim = 4, 8 or 12) of C for all
 * M rows in a single inline-asm statement, then step the C-level pointers to the
 * next strip. It expands to a braced block and relies on these locals being in
 * scope at the expansion site: a_ptr, b_ptr, c_ptr, c_tmp, ldc_bytes, k_cnt, K,
 * OFF, one, zero, M and ldc.
 * Operands: %0 a_ptr, %1 b_ptr, %2 c_ptr, %3 c_tmp, %4 ldc_bytes, %5 k_cnt,
 * %6 K, %7 OFF, %8 one[0], %9 zero[0], %10 M.
 * Scratch registers: %%r15 = a_ptr, %%r14 = b_ptr, %%r13 = OFF, %%r12 = K << 2,
 * %%r11 = rows left (starts at M).
 * Row tiling: 8-row tiles while %%r11 >= 8, then at most one 4-, 2- and 1-row tile
 * chosen by testing bits 4, 2 and 1 of %%r11; each tile is GEMM_RN_m*n<ndim>
 * followed by SOLVE_RN_m*n<ndim>. Labels are prefixed with ndim.
 * On exit %0 and %1 are reloaded from %%r15 / %%r14 and vzeroupper is issued; the
 * trailing C statement rewinds a_ptr by M*K, advances b_ptr by ndim*K, moves c_ptr
 * to the next strip (ldc*ndim - M) and adds ndim to OFF.
 */
| #define COMPUTE(ndim) {\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ | |||
| } | |||
| static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=0; i<n; i++) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0; j<m; j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); | |||
| return 0; | |||
| } | |||
| @@ -1,281 +1,281 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
/*
 * SOLVE_RT_m8n4 / m8n8 / m8n12 -- inline-asm string fragments for the 8-row tile of
 * the RT solve. Columns are processed right to left: the highest-numbered register
 * pair is handled first (SOLVE_rile_m8n2 then SOLVE_le_m8n2), SUBTRACT_m8n2 updates
 * the lower-numbered pairs, and all byte offsets relative to %1 are negative.
 * Pointer bookkeeping visible here: %2 is copied to %3 and advanced by 32 bytes;
 * "negq %4; leaq (%3,%4,s),%3; negq %4" moves %3 back by s * %4 (s = 2 before the
 * first solve, s = 4 after every SAVE_SOLUTION_m8n2 that is followed by more work).
 * SAVE_SOLUTION_m8n2's third argument decreases by 64 per column pair.
 * NOTE(review): the COMPUTE wrapper of this kernel and the helper macros are
 * outside this view; operand roles presumably match the RN kernel (%1 packed B,
 * %2 / %3 C pointers, %4 ldc in bytes, %%r12 = K * 4) -- confirm.
 */
| #define SOLVE_RT_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-128) | |||
/* Eight columns: pairs (10,11) (8,9) (6,7) (4,5), solved in that order. */
| #define SOLVE_RT_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-256) | |||
/* Twelve columns: pairs (14,15) down to (4,5), solved in that order. */
| #define SOLVE_RT_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ | |||
| SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-176,4,5,%1)\ | |||
| SOLVE_le_m8n2(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-384) | |||
/*
 * SOLVE_RT_m4n4 / m4n8 / m4n12 -- inline-asm string fragments for the 4-row tile of
 * the RT solve. After GEMM_SUM_REORDER_4x4 the column pairs are processed right to
 * left, one register per pair (highest register first): SOLVE_rile_m4n2 then
 * SOLVE_le_m4n2, with SUBTRACT_m4n2 updating the lower-numbered registers; all
 * byte offsets relative to %1 are negative.
 * Pointer bookkeeping visible here: %2 is copied to %3 and advanced by 16 bytes;
 * "negq %4; leaq (%3,%4,s),%3; negq %4" moves %3 back by s * %4 (s = 2 before the
 * first solve, s = 4 after every SAVE_SOLUTION_m4n2 that is followed by more work).
 * SAVE_SOLUTION_m4n2's second argument decreases by 32 per column pair.
 * NOTE(review): the COMPUTE wrapper of this kernel and the helper macros are
 * outside this view; operand roles presumably match the RN kernel (%1 packed B,
 * %2 / %3 C pointers, %4 ldc in bytes, %%r12 = K * 4) -- confirm.
 */
| #define SOLVE_RT_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-64) | |||
/* Eight columns: registers 7, 6, 5, 4 (one per column pair), solved in that order. */
| #define SOLVE_RT_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-128) | |||
/* Twelve columns: registers 9 down to 4 (one per column pair), solved in that order. */
| #define SOLVE_RT_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ | |||
| SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-176,4,%1)\ | |||
| SOLVE_le_m4n2(-192,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-192) | |||
| #define SOLVE_RT_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-32) | |||
| #define SOLVE_RT_m2n8 /* 2x8 tile: C is added into xmm4-5 (columns 1-4) and xmm6-7 (columns 5-8). Columns 5-8 are handled first with B addressed at %1+r12*4, each step paired with a SUBTRACT on xmm4-5 (presumably eliminating the solved column; helper not visible here), then columns 1-4. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-64) | |||
| #define SOLVE_RT_m2n12 /* 2x12 tile: C is added into xmm4-5, xmm6-7 and xmm8-9 (columns 1-4, 5-8, 9-12). The three 4-column groups are handled from last to first, with B addressed at %1+r12*8, %1+r12*4 and %1; SUBTRACT is applied to the groups not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-96) | |||
| #define SOLVE_RT_m1n4 /* 1x4 tile: add the C values read through c_tmp (%3) into xmm4, rewind %3 by 4*ldc, step c_ptr (%2) by 4 bytes, then run the col4..col1 solve steps and save. The SOLVE_col, SAVE_SOLUTION helpers are defined outside this view. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-16) | |||
| #define SOLVE_RT_m1n8 /* 1x8 tile: C is added into xmm4 (columns 1-4) and xmm5 (columns 5-8). Columns 5-8 are handled first with B addressed at %1+r12*4, each step paired with a SUBTRACT on xmm4 (helper not visible here), then columns 1-4. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-32) | |||
| #define SOLVE_RT_m1n12 /* 1x12 tile: C is added into xmm4, xmm5 and xmm6 (columns 1-4, 5-8, 9-12). The three 4-column groups are handled from last to first, with B addressed at %1+r12*8, %1+r12*4 and %1; SUBTRACT is applied to the groups not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-144,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-160,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-176,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-192,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-48) | |||
| /* GEMM_RT_SIMPLE(mdim,ndim): generic k-loop for an mdim x ndim tile. On entry r14 = b_tail, r15 = a_tail, r13 = k-kk. Moves r15 forward by r12*mdim, reloads %0, %1 and %5 from r15, r14 and r13, runs INIT_m<mdim>n<ndim> (zeroes the accumulators), then runs the k1 kernel %5 times, stepping %0 back by mdim*4 bytes and %1 back by 16 bytes before each step. The loop is skipped when %5 is zero. */ | |||
| #define GEMM_RT_SIMPLE(mdim,ndim) \ | |||
| "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RT_m8n4 /* 8-row tiles with 4 or 8 columns use the generic k-loop */ GEMM_RT_SIMPLE(8,4) | |||
| #define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) | |||
| #define GEMM_RT_m8n12 /* 8x12 k-loop with the same setup as GEMM_RT_SIMPLE(8,12): while at least 8 k-steps remain, the main loop runs 8 kernel steps per pass with a prefetcht0 on A before every other step; the remaining k-steps run one at a time. */ \ | |||
| "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RT_m4n4 /* all 4-, 2- and 1-row tiles use the generic k-loop */ GEMM_RT_SIMPLE(4,4) | |||
| #define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) | |||
| #define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) | |||
| #define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) | |||
| #define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) | |||
| #define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) | |||
| #define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) | |||
| #define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) | |||
| #define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) | |||
| #define COMPUTE(ndim) /* Process one ndim-column chunk using the caller's locals by name. Steps b_ptr back by (ndim-4)*K and c_ptr by ndim*ldc; the asm then sets r15=a_ptr, r13=K-OFF, r12=K*4, r14=b_ptr, r11=M and runs GEMM_RT + SOLVE_RT on row tiles of 8 (looped), then 4, 2 and 1 as selected by the bits of r11. Afterwards a_ptr is rewound by M*K, b_ptr by 4*K, c_ptr by M, and OFF is lowered by ndim. */ {\ | |||
| b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ | |||
| } | |||
| /* Scalar RT solve on an m x n tile of C (column stride ldc). | |||
|    Columns are visited from the last to the first. Each entry of the | |||
|    current column is multiplied by the diagonal entry b[col*n+col] | |||
|    (a multiply, so b presumably holds pre-inverted diagonals - confirm | |||
|    against the packing routine), written back to both c and the packed | |||
|    buffer a, and then eliminated from every earlier column. */ | |||
| static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ | |||
|   int col, row, prev; | |||
|   for (col = n - 1; col >= 0; col--) { | |||
|     FLOAT *b_row = b + col * n;    /* row `col` of the n x n triangular block */ | |||
|     FLOAT *c_col = c + col * ldc;  /* column being solved */ | |||
|     FLOAT *a_col = a + col * m;    /* packed destination for that column */ | |||
|     FLOAT diag = b_row[col]; | |||
|     for (row = 0; row < m; row++) { | |||
|       FLOAT solved = c_col[row] * diag; | |||
|       c_col[row] = solved; | |||
|       a_col[row] = solved; | |||
|       /* remove this solved entry from the columns still to be processed */ | |||
|       for (prev = 0; prev < col; prev++) c[prev * ldc + row] -= solved * b_row[prev]; | |||
|     } | |||
|   } | |||
| } | |||
| /* Scalar edge path for an n-column remainder chunk. | |||
|    Walks the m rows in tiles of 8, then 4, 2 and 1. For each tile it first | |||
|    applies the trailing update (GEMM_KERNEL_N with alpha = -1.0 over the | |||
|    k-offset steps beyond `offset`, skipped when there are none) and then | |||
|    calls solve_RT on the n x n block that ends at `offset`. */ | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
|   BLASLONG kk = offset; | |||
|   BLASLONG rows_left = m; | |||
|   FLOAT *a_panel = sa; | |||
|   FLOAT *c_tile = C; | |||
|   while (rows_left >= 8) { | |||
|     if (k > kk) GEMM_KERNEL_N(8, n, k - kk, -1.0, a_panel + kk * 8, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(8, n, a_panel + (kk - n) * 8, sb + (kk - n) * n, c_tile, ldc); | |||
|     a_panel += k * 8; | |||
|     c_tile += 8; | |||
|     rows_left -= 8; | |||
|   } | |||
|   while (rows_left >= 4) { | |||
|     if (k > kk) GEMM_KERNEL_N(4, n, k - kk, -1.0, a_panel + kk * 4, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(4, n, a_panel + (kk - n) * 4, sb + (kk - n) * n, c_tile, ldc); | |||
|     a_panel += k * 4; | |||
|     c_tile += 4; | |||
|     rows_left -= 4; | |||
|   } | |||
|   while (rows_left >= 2) { | |||
|     if (k > kk) GEMM_KERNEL_N(2, n, k - kk, -1.0, a_panel + kk * 2, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(2, n, a_panel + (kk - n) * 2, sb + (kk - n) * n, c_tile, ldc); | |||
|     a_panel += k * 2; | |||
|     c_tile += 2; | |||
|     rows_left -= 2; | |||
|   } | |||
|   if (rows_left >= 1) { | |||
|     /* last row: nothing follows, so the local pointers need not advance */ | |||
|     if (k > kk) GEMM_KERNEL_N(1, n, k - kk, -1.0, a_panel + kk * 1, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(1, n, a_panel + (kk - n) * 1, sb + (kk - n) * n, c_tile, ldc); | |||
|   } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ /* Kernel entry point: walks the n columns from last to first, sending 1- and 2-column remainders through the scalar edge path and the rest through the asm COMPUTE path. Always returns 0; dummy1 is not read in this body. */ | |||
| float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; /* b_ptr and c_ptr start one past the end of B (n*k) and C (n*ldc); every chunk steps them backwards */ | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; /* asm memory operand %8 */ | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; /* asm memory operand %9 */ | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; /* OFF starts at n-offset and is lowered by each chunk's width */ | |||
| BLASLONG n_count = n; | |||
| if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} /* odd n: peel one column */ | |||
| if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} /* bit 1 of n set: peel two columns */ | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) /* remaining columns in chunks of 12, then 8, then 4 */ | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| return 0; | |||
| } | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RT_m8n4 /* 8x4 tile: add the C values read through c_tmp (%3) into ymm4-7, rewind %3 by 2*ldc, step c_ptr (%2) by 32 bytes, then solve and save the two column pairs from the last (ymm6-7) to the first (ymm4-5). The SOLVE_rile, SUBTRACT and SAVE_SOLUTION helpers are defined outside this view. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-128) | |||
| #define SOLVE_RT_m8n8 /* 8x8 tile: same scheme as SOLVE_RT_m8n4 over ymm4-11. Column pairs are handled from ymm10-11 down to ymm4-5; B for columns 5-8 is addressed at %1+r12*4, and SUBTRACT is applied to the pairs not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-256) | |||
| #define SOLVE_RT_m8n12 /* 8x12 tile: same scheme as SOLVE_RT_m8n4 over ymm4-15. Column pairs are handled from ymm14-15 down to ymm4-5; B for columns 9-12 is addressed at %1+r12*8 and for columns 5-8 at %1+r12*4, and SUBTRACT is applied to the pairs not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ | |||
| SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-176,4,5,%1)\ | |||
| SOLVE_le_m8n2(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-384) | |||
| #define SOLVE_RT_m4n4 /* 4x4 tile: add the C values read through c_tmp (%3) into xmm4-7 and pack them into ymm4 and ymm5 (one register per column pair), rewind %3 by 2*ldc, step c_ptr (%2) by 16 bytes, then solve and save the pairs from ymm5 to ymm4. The SOLVE_rile, SUBTRACT and SAVE_SOLUTION helpers are defined outside this view. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-64) | |||
| #define SOLVE_RT_m4n8 /* 4x8 tile: same scheme as SOLVE_RT_m4n4 with column pairs packed into ymm4-7. Pairs are handled from ymm7 down to ymm4; B for columns 5-8 is addressed at %1+r12*4, and SUBTRACT is applied to the pairs not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-128) | |||
| #define SOLVE_RT_m4n12 /* 4x12 tile: same scheme as SOLVE_RT_m4n4 with column pairs packed into ymm4-9. Pairs are handled from ymm9 down to ymm4; B for columns 9-12 is addressed at %1+r12*8 and for columns 5-8 at %1+r12*4, and SUBTRACT is applied to the pairs not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ | |||
| SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-176,4,%1)\ | |||
| SOLVE_le_m4n2(-192,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-192) | |||
| #define SOLVE_RT_m2n4 /* 2x4 tile: add the C values read through c_tmp (%3) into xmm4-5, rewind %3 by 4*ldc, step c_ptr (%2) by 8 bytes, then run the col4..col1 solve steps and save. The SOLVE_col, SAVE_SOLUTION helpers are defined outside this view. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-32) | |||
| #define SOLVE_RT_m2n8 /* 2x8 tile: C is added into xmm4-5 (columns 1-4) and xmm6-7 (columns 5-8). Columns 5-8 are handled first with B addressed at %1+r12*4, each step paired with a SUBTRACT on xmm4-5 (presumably eliminating the solved column; helper not visible here), then columns 1-4. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-64) | |||
| #define SOLVE_RT_m2n12 /* 2x12 tile: C is added into xmm4-5, xmm6-7 and xmm8-9 (columns 1-4, 5-8, 9-12). The three 4-column groups are handled from last to first, with B addressed at %1+r12*8, %1+r12*4 and %1; SUBTRACT is applied to the groups not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-96) | |||
| #define SOLVE_RT_m1n4 /* 1x4 tile: add the C values read through c_tmp (%3) into xmm4, rewind %3 by 4*ldc, step c_ptr (%2) by 4 bytes, then run the col4..col1 solve steps and save. The SOLVE_col, SAVE_SOLUTION helpers are defined outside this view. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-16) | |||
| #define SOLVE_RT_m1n8 /* 1x8 tile: C is added into xmm4 (columns 1-4) and xmm5 (columns 5-8). Columns 5-8 are handled first with B addressed at %1+r12*4, each step paired with a SUBTRACT on xmm4 (helper not visible here), then columns 1-4. */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-32) | |||
| #define SOLVE_RT_m1n12 /* 1x12 tile: C is added into xmm4, xmm5 and xmm6 (columns 1-4, 5-8, 9-12). The three 4-column groups are handled from last to first, with B addressed at %1+r12*8, %1+r12*4 and %1; SUBTRACT is applied to the groups not yet solved (helper not visible here). */ \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-144,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-160,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-176,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-192,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-48) | |||
| /* GEMM_RT_SIMPLE(mdim,ndim): generic k-loop for an mdim x ndim tile. On entry r14 = b_tail, r15 = a_tail, r13 = k-kk. Moves r15 forward by r12*mdim, reloads %0, %1 and %5 from r15, r14 and r13, runs INIT_m<mdim>n<ndim> (zeroes the accumulators), then runs the k1 kernel %5 times, stepping %0 back by mdim*4 bytes and %1 back by 16 bytes before each step. The loop is skipped when %5 is zero. */ | |||
| #define GEMM_RT_SIMPLE(mdim,ndim) \ | |||
| "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RT_m8n4 /* 8-row tiles with 4 or 8 columns use the generic k-loop */ GEMM_RT_SIMPLE(8,4) | |||
| #define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) | |||
| #define GEMM_RT_m8n12 /* 8x12 k-loop with the same setup as GEMM_RT_SIMPLE(8,12): while at least 8 k-steps remain, the main loop runs 8 kernel steps per pass with a prefetcht0 on A before every other step; the remaining k-steps run one at a time. */ \ | |||
| "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RT_m4n4 /* all 4-, 2- and 1-row tiles use the generic k-loop */ GEMM_RT_SIMPLE(4,4) | |||
| #define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) | |||
| #define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) | |||
| #define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) | |||
| #define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) | |||
| #define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) | |||
| #define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) | |||
| #define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) | |||
| #define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) | |||
| #define COMPUTE(ndim) /* Process one ndim-column chunk using the caller's locals by name. Steps b_ptr back by (ndim-4)*K and c_ptr by ndim*ldc; the asm then sets r15=a_ptr, r13=K-OFF, r12=K*4, r14=b_ptr, r11=M and runs GEMM_RT + SOLVE_RT on row tiles of 8 (looped), then 4, 2 and 1 as selected by the bits of r11. Afterwards a_ptr is rewound by M*K, b_ptr by 4*K, c_ptr by M, and OFF is lowered by ndim. */ {\ | |||
| b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ | |||
| } | |||
| /* Scalar RT solve on an m x n tile of C (column stride ldc). | |||
|    Columns are visited from the last to the first. Each entry of the | |||
|    current column is multiplied by the diagonal entry b[col*n+col] | |||
|    (a multiply, so b presumably holds pre-inverted diagonals - confirm | |||
|    against the packing routine), written back to both c and the packed | |||
|    buffer a, and then eliminated from every earlier column. */ | |||
| static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ | |||
|   int col, row, prev; | |||
|   for (col = n - 1; col >= 0; col--) { | |||
|     FLOAT *b_row = b + col * n;    /* row `col` of the n x n triangular block */ | |||
|     FLOAT *c_col = c + col * ldc;  /* column being solved */ | |||
|     FLOAT *a_col = a + col * m;    /* packed destination for that column */ | |||
|     FLOAT diag = b_row[col]; | |||
|     for (row = 0; row < m; row++) { | |||
|       FLOAT solved = c_col[row] * diag; | |||
|       c_col[row] = solved; | |||
|       a_col[row] = solved; | |||
|       /* remove this solved entry from the columns still to be processed */ | |||
|       for (prev = 0; prev < col; prev++) c[prev * ldc + row] -= solved * b_row[prev]; | |||
|     } | |||
|   } | |||
| } | |||
| /* Scalar edge path for an n-column remainder chunk. | |||
|    Walks the m rows in tiles of 8, then 4, 2 and 1. For each tile it first | |||
|    applies the trailing update (GEMM_KERNEL_N with alpha = -1.0 over the | |||
|    k-offset steps beyond `offset`, skipped when there are none) and then | |||
|    calls solve_RT on the n x n block that ends at `offset`. */ | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
|   BLASLONG kk = offset; | |||
|   BLASLONG rows_left = m; | |||
|   FLOAT *a_panel = sa; | |||
|   FLOAT *c_tile = C; | |||
|   while (rows_left >= 8) { | |||
|     if (k > kk) GEMM_KERNEL_N(8, n, k - kk, -1.0, a_panel + kk * 8, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(8, n, a_panel + (kk - n) * 8, sb + (kk - n) * n, c_tile, ldc); | |||
|     a_panel += k * 8; | |||
|     c_tile += 8; | |||
|     rows_left -= 8; | |||
|   } | |||
|   while (rows_left >= 4) { | |||
|     if (k > kk) GEMM_KERNEL_N(4, n, k - kk, -1.0, a_panel + kk * 4, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(4, n, a_panel + (kk - n) * 4, sb + (kk - n) * n, c_tile, ldc); | |||
|     a_panel += k * 4; | |||
|     c_tile += 4; | |||
|     rows_left -= 4; | |||
|   } | |||
|   while (rows_left >= 2) { | |||
|     if (k > kk) GEMM_KERNEL_N(2, n, k - kk, -1.0, a_panel + kk * 2, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(2, n, a_panel + (kk - n) * 2, sb + (kk - n) * n, c_tile, ldc); | |||
|     a_panel += k * 2; | |||
|     c_tile += 2; | |||
|     rows_left -= 2; | |||
|   } | |||
|   if (rows_left >= 1) { | |||
|     /* last row: nothing follows, so the local pointers need not advance */ | |||
|     if (k > kk) GEMM_KERNEL_N(1, n, k - kk, -1.0, a_panel + kk * 1, sb + kk * n, c_tile, ldc); | |||
|     solve_RT(1, n, a_panel + (kk - n) * 1, sb + (kk - n) * n, c_tile, ldc); | |||
|   } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ /* Kernel entry point: walks the n columns from last to first, sending 1- and 2-column remainders through the scalar edge path and the rest through the asm COMPUTE path. Always returns 0; dummy1 is not read in this body. */ | |||
| float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; /* b_ptr and c_ptr start one past the end of B (n*k) and C (n*ldc); every chunk steps them backwards */ | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; /* asm memory operand %8 */ | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; /* asm memory operand %9 */ | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; /* OFF starts at n-offset and is lowered by each chunk's width */ | |||
| BLASLONG n_count = n; | |||
| if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} /* odd n: peel one column */ | |||
| if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} /* bit 1 of n set: peel two columns */ | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) /* remaining columns in chunks of 12, then 8, then 4 */ | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| return 0; | |||
| } | |||
| @@ -1,226 +1,226 @@ | |||
| /* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||
| /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
| /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
| #define init_m8n4(c1,c2,c3,c4) /* zero four ymm accumulators (vpxor of each register with itself); INIT_m8n4/8/12 chain this over ymm4-7, ymm4-11 and ymm4-15 */\ | |||
| "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
| #define INIT_m8n4 init_m8n4(4,5,6,7) | |||
| #define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
| #define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
| #define init_m4n4(c1,c2,c3,c4) /* zero four xmm accumulators; INIT_m4n4/8/12 chain this over xmm4-7, xmm4-11 and xmm4-15 */\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
| #define INIT_m4n4 init_m4n4(4,5,6,7) | |||
| #define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
| #define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
| #define init_m2n4(c1,c2) /* zero two xmm accumulators; INIT_m2n4/8/12 chain this over xmm4-5, xmm4-7 and xmm4-9 */\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
| #define INIT_m2n4 init_m2n4(4,5) | |||
| #define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
| #define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
| #define init_m1n4(c1) /* zero one xmm accumulator; INIT_m1n4/8/12 chain this over xmm4, xmm4-5 and xmm4-6 */ "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
| #define INIT_m1n4 init_m1n4(4) | |||
| #define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
| #define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
| #define GEMM_KERNEL_k1m8n4 /* one k-step for 8 rows: 8 floats of A at %0 are loaded with even (ymm1) and odd (ymm2) elements duplicated, each 8-byte pair of B at %1 is broadcast into ymm3, and the products are subtracted from the accumulators (vfnmadd231ps). The n8 and n12 variants add the B panels at %1+r12*4 and %1+r12*8. */ \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
| "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
| #define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
| "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
| "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
| #define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
| "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
| "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
| #define GEMM_KERNEL_k1m4n4 /* one k-step for 4 rows: same pattern as the 8-row kernel on xmm registers, with vmovddup duplicating each 8-byte pair of B. The n8 and n12 variants add the B panels at %1+r12*4 and %1+r12*8. */ \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
| "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
| "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
| #define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
| "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
| #define GEMM_KERNEL_k1m2n4 /* one k-step for 2 rows: the two A floats at %0 are broadcast into xmm1 and xmm2, four B floats are loaded into xmm3, and the products are subtracted from one accumulator per row. The n8 and n12 variants add the B panels at %1+r12*4 and %1+r12*8. */ \ | |||
| "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
| "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
| "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
| "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
| #define GEMM_KERNEL_k1m1n4 /* one k-step for 1 row: the A float at %0 is broadcast into xmm1 and its product with four B floats is subtracted from xmm4; n8 and n12 do the same for xmm5 and xmm6 with B at %1+r12*4 and %1+r12*8 */ "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
| #define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
| #define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos) /* reads four 8-float columns of C through %3, two at a time and advancing %3 by 2*ldc after each pair, interleaves each pair with the unpack instructions, and adds the results into c1,c2 (first pair) and c3,c4 (second pair); prefpos is the displacement used by the prefetcht1 hints */\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" | |||
| #define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2) /* reads four 4-float columns of C through %3 (two at a time, advancing %3 by 2*ldc after each pair), combines them with the xmm accumulators c1..c4, then merges c1 with c2 into ymm co1 and c3 with c4 into ymm co2 via vperm2f128 */\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ | |||
| "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" | |||
| #define GEMM_SUM_REORDER_2x4(c1,c2)\ | |||
| "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
| "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ | |||
| "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ | |||
| #define GEMM_SUM_REORDER_1x4(c1)\ | |||
| "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_le_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_le_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_ri_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_ri_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ | |||
| "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ | |||
| "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m4n2(c1,a_off)\ | |||
| "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m1n4(c1,a_off)\ | |||
| "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| /* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||
| /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
| /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
| #define init_m8n4(c1,c2,c3,c4)\ | |||
| "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
| #define INIT_m8n4 init_m8n4(4,5,6,7) | |||
| #define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
| #define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
| #define init_m4n4(c1,c2,c3,c4)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
| #define INIT_m4n4 init_m4n4(4,5,6,7) | |||
| #define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
| #define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
| #define init_m2n4(c1,c2)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
| #define INIT_m2n4 init_m2n4(4,5) | |||
| #define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
| #define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
| #define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
| #define INIT_m1n4 init_m1n4(4) | |||
| #define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
| #define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
| #define GEMM_KERNEL_k1m8n4 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
| "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
| #define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
| "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
| "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
| #define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
| "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
| "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
| #define GEMM_KERNEL_k1m4n4 \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
| "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
| "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
| #define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
| "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
| #define GEMM_KERNEL_k1m2n4 \ | |||
| "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
| "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
| "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
| "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
| #define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
| #define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
| #define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" | |||
| #define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ | |||
| "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" | |||
| #define GEMM_SUM_REORDER_2x4(c1,c2)\ | |||
| "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
| "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ | |||
| "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ | |||
| #define GEMM_SUM_REORDER_1x4(c1)\ | |||
| "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_le_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_le_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_ri_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_ri_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ | |||
| "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ | |||
| "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m4n2(c1,a_off)\ | |||
| "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m1n4(c1,a_off)\ | |||
| "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| @@ -49,11 +49,9 @@ | |||
| LAPACKE_dgels (row-major, high-level) Example Program Results | |||
| -- LAPACKE Example routine (version 3.7.0) -- | |||
| -- LAPACKE Example routine -- | |||
| -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| December 2016 | |||
| */ | |||
| /* Calling DGELS using row-major layout */ | |||
| @@ -66,8 +64,8 @@ | |||
| int main (int argc, const char * argv[]) | |||
| { | |||
| /* Locals */ | |||
| double A[5][3] = {1,1,1,2,3,4,3,5,2,4,2,5,5,4,3}; | |||
| double b[5][2] = {-10,-3,12,14,14,12,16,16,18,16}; | |||
| double A[5][3] = {{1,1,1},{2,3,4},{3,5,2},{4,2,5},{5,4,3}}; | |||
| double b[5][2] = {{-10,-3},{12,14},{14,12},{16,16},{18,16}}; | |||
| lapack_int info,m,n,lda,ldb,nrhs; | |||
| /* Initialization */ | |||
| @@ -25,11 +25,9 @@ | |||
| LAPACKE_dgesv (col-major, high-level) Example Program Results | |||
| -- LAPACKE Example routine (version 3.7.0) -- | |||
| -- LAPACKE Example routine -- | |||
| -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| December 2016 | |||
| */ | |||
| /* Includes */ | |||
| #include <stdlib.h> | |||
| @@ -94,7 +92,7 @@ int main(int argc, char **argv) { | |||
| /* Check for the exact singularity */ | |||
| if( info > 0 ) { | |||
| printf( "The diagonal element of the triangular factor of A,\n" ); | |||
| printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); | |||
| printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); | |||
| printf( "the solution could not be computed.\n" ); | |||
| exit( 1 ); | |||
| } | |||
| @@ -25,11 +25,9 @@ | |||
| LAPACKE_dgesv (row-major, high-level) Example Program Results | |||
| -- LAPACKE Example routine (version 3.7.0) -- | |||
| -- LAPACKE Example routine -- | |||
| -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| December 2016 | |||
| */ | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| @@ -91,7 +89,7 @@ int main(int argc, char **argv) { | |||
| /* Check for the exact singularity */ | |||
| if( info > 0 ) { | |||
| printf( "The diagonal element of the triangular factor of A,\n" ); | |||
| printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); | |||
| printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); | |||
| printf( "the solution could not be computed.\n" ); | |||
| exit( 1 ); | |||
| } | |||
| @@ -28,6 +28,6 @@ void print_matrix_colmajor( char* desc, lapack_int m, lapack_int n, double* mat, | |||
| void print_vector( char* desc, lapack_int n, lapack_int* vec ) { | |||
| lapack_int j; | |||
| printf( "\n %s\n", desc ); | |||
| for( j = 0; j < n; j++ ) printf( " %6i", vec[j] ); | |||
| for( j = 0; j < n; j++ ) printf( " %6" LAPACK_IFMT, vec[j] ); | |||
| printf( "\n" ); | |||
| } | |||
| @@ -12,6 +12,7 @@ | |||
| #include <stdlib.h> | |||
| #include <stdarg.h> | |||
| #include <inttypes.h> | |||
| /* It seems all current Fortran compilers put strlen at end. | |||
| * Some historical compilers put strlen after the str argument | |||
| @@ -80,11 +81,26 @@ extern "C" { | |||
| /*----------------------------------------------------------------------------*/ | |||
| #ifndef lapack_int | |||
| #define lapack_int int | |||
| #if defined(LAPACK_ILP64) | |||
| #define lapack_int int64_t | |||
| #else | |||
| #define lapack_int int32_t | |||
| #endif | |||
| #endif | |||
| /* | |||
| * Integer format string | |||
| */ | |||
| #ifndef LAPACK_IFMT | |||
| #if defined(LAPACK_ILP64) | |||
| #define LAPACK_IFMT PRId64 | |||
| #else | |||
| #define LAPACK_IFMT PRId32 | |||
| #endif | |||
| #endif | |||
| #ifndef lapack_logical | |||
| #define lapack_logical lapack_int | |||
| #define lapack_logical lapack_int | |||
| #endif | |||
| /* f2c, hence clapack and MacOS Accelerate, returns double instead of float | |||
| @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) | |||
| ( const lapack_complex_double*, const lapack_complex_double* ); | |||
| #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) | |||
| lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, | |||
| lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, | |||
| lapack_int lca, lapack_int lcb | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| , size_t, size_t | |||
| @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( | |||
| #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) | |||
| #endif | |||
| #define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) | |||
| void LAPACK_ctrsyl3_base( | |||
| char const* trana, char const* tranb, | |||
| lapack_int const* isgn, lapack_int const* m, lapack_int const* n, | |||
| lapack_complex_float const* A, lapack_int const* lda, | |||
| lapack_complex_float const* B, lapack_int const* ldb, | |||
| lapack_complex_float* C, lapack_int const* ldc, float* scale, | |||
| float* swork, lapack_int const *ldswork, | |||
| lapack_int* info | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| , size_t, size_t | |||
| #endif | |||
| ); | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) | |||
| #else | |||
| #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) | |||
| #endif | |||
| #define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) | |||
| void LAPACK_dtrsyl3_base( | |||
| char const* trana, char const* tranb, | |||
| lapack_int const* isgn, lapack_int const* m, lapack_int const* n, | |||
| double const* A, lapack_int const* lda, | |||
| double const* B, lapack_int const* ldb, | |||
| double* C, lapack_int const* ldc, double* scale, | |||
| lapack_int* iwork, lapack_int const* liwork, | |||
| double* swork, lapack_int const *ldswork, | |||
| lapack_int* info | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| , size_t, size_t | |||
| #endif | |||
| ); | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) | |||
| #else | |||
| #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) | |||
| #endif | |||
| #define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) | |||
| void LAPACK_strsyl3_base( | |||
| char const* trana, char const* tranb, | |||
| lapack_int const* isgn, lapack_int const* m, lapack_int const* n, | |||
| float const* A, lapack_int const* lda, | |||
| float const* B, lapack_int const* ldb, | |||
| float* C, lapack_int const* ldc, float* scale, | |||
| lapack_int* iwork, lapack_int const* liwork, | |||
| float* swork, lapack_int const *ldswork, | |||
| lapack_int* info | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| , size_t, size_t | |||
| #endif | |||
| ); | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) | |||
| #else | |||
| #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__) | |||
| #endif | |||
| #define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) | |||
| void LAPACK_ztrsyl3_base( | |||
| char const* trana, char const* tranb, | |||
| lapack_int const* isgn, lapack_int const* m, lapack_int const* n, | |||
| lapack_complex_double const* A, lapack_int const* lda, | |||
| lapack_complex_double const* B, lapack_int const* ldb, | |||
| lapack_complex_double* C, lapack_int const* ldc, double* scale, | |||
| double* swork, lapack_int const *ldswork, | |||
| lapack_int* info | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| , size_t, size_t | |||
| #endif | |||
| ); | |||
| #ifdef LAPACK_FORTRAN_STRLEN_END | |||
| #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) | |||
| #else | |||
| #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) | |||
| #endif | |||
| #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) | |||
| void LAPACK_ctrtri_base( | |||
| char const* uplo, char const* diag, | |||
| @@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n, | |||
| float LAPACKE_slamch( char cmach ); | |||
| double LAPACKE_dlamch( char cmach ); | |||
| float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, const float* ab, | |||
| lapack_int ldab ); | |||
| double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, const double* ab, | |||
| lapack_int ldab ); | |||
| float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, | |||
| const lapack_complex_float* ab, lapack_int ldab ); | |||
| double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, | |||
| const lapack_complex_double* ab, lapack_int ldab ); | |||
| float LAPACKE_slange( int matrix_layout, char norm, lapack_int m, | |||
| lapack_int n, const float* a, lapack_int lda ); | |||
| double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m, | |||
| @@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, | |||
| lapack_complex_double* c, lapack_int ldc, | |||
| double* scale ); | |||
| lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const float* a, lapack_int lda, const float* b, | |||
| lapack_int ldb, float* c, lapack_int ldc, | |||
| float* scale ); | |||
| lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const double* a, lapack_int lda, const double* b, | |||
| lapack_int ldb, double* c, lapack_int ldc, | |||
| double* scale ); | |||
| lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_double* a, lapack_int lda, | |||
| const lapack_complex_double* b, lapack_int ldb, | |||
| lapack_complex_double* c, lapack_int ldc, | |||
| double* scale ); | |||
| lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, | |||
| float* a, lapack_int lda ); | |||
| lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, | |||
| @@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z ); | |||
| float LAPACKE_slamch_work( char cmach ); | |||
| double LAPACKE_dlamch_work( char cmach ); | |||
| float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, const float* ab, | |||
| lapack_int ldab, float* work ); | |||
| double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, const double* ab, | |||
| lapack_int ldab, double* work ); | |||
| float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, | |||
| const lapack_complex_float* ab, lapack_int ldab, | |||
| float* work ); | |||
| double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, | |||
| const lapack_complex_double* ab, lapack_int ldab, | |||
| double* work ); | |||
| float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m, | |||
| lapack_int n, const float* a, lapack_int lda, | |||
| float* work ); | |||
| @@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, | |||
| lapack_complex_double* c, lapack_int ldc, | |||
| double* scale ); | |||
| lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const float* a, lapack_int lda, | |||
| const float* b, lapack_int ldb, | |||
| float* c, lapack_int ldc, float* scale, | |||
| lapack_int* iwork, lapack_int liwork, | |||
| float* swork, lapack_int ldswork ); | |||
| lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const double* a, lapack_int lda, | |||
| const double* b, lapack_int ldb, | |||
| double* c, lapack_int ldc, double* scale, | |||
| lapack_int* iwork, lapack_int liwork, | |||
| double* swork, lapack_int ldswork ); | |||
| lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_float* a, lapack_int lda, | |||
| const lapack_complex_float* b, lapack_int ldb, | |||
| lapack_complex_float* c, lapack_int ldc, | |||
| float* scale, float* swork, | |||
| lapack_int ldswork ); | |||
| lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_double* a, lapack_int lda, | |||
| const lapack_complex_double* b, lapack_int ldb, | |||
| lapack_complex_double* c, lapack_int ldc, | |||
| double* scale, double* swork, | |||
| lapack_int ldswork ); | |||
| lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, | |||
| lapack_int n, float* a, lapack_int lda ); | |||
| lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, | |||
| @@ -42,17 +42,29 @@ extern "C" { | |||
| #include <stdlib.h> | |||
| #include <stdint.h> | |||
| #include <inttypes.h> | |||
| #ifndef lapack_int | |||
| #if defined(LAPACK_ILP64) | |||
| #define lapack_int int64_t | |||
| #define lapack_int int64_t | |||
| #else | |||
| #define lapack_int int32_t | |||
| #define lapack_int int32_t | |||
| #endif | |||
| #endif | |||
| /* | |||
| * Integer format string | |||
| */ | |||
| #ifndef LAPACK_IFMT | |||
| #if defined(LAPACK_ILP64) | |||
| #define LAPACK_IFMT PRId64 | |||
| #else | |||
| #define LAPACK_IFMT PRId32 | |||
| #endif | |||
| #endif | |||
| #ifndef lapack_logical | |||
| #define lapack_logical lapack_int | |||
| #define lapack_logical lapack_int | |||
| #endif | |||
| #ifndef LAPACK_COMPLEX_CUSTOM | |||
| @@ -358,6 +358,8 @@ lapacke_clacrm.o \ | |||
| lapacke_clacrm_work.o \ | |||
| lapacke_clag2z.o \ | |||
| lapacke_clag2z_work.o \ | |||
| lapacke_clangb.o \ | |||
| lapacke_clangb_work.o \ | |||
| lapacke_clange.o \ | |||
| lapacke_clange_work.o \ | |||
| lapacke_clanhe.o \ | |||
| @@ -842,6 +844,8 @@ lapacke_dlag2s.o \ | |||
| lapacke_dlag2s_work.o \ | |||
| lapacke_dlamch.o \ | |||
| lapacke_dlamch_work.o \ | |||
| lapacke_dlangb.o \ | |||
| lapacke_dlangb_work.o \ | |||
| lapacke_dlange.o \ | |||
| lapacke_dlange_work.o \ | |||
| lapacke_dlansy.o \ | |||
| @@ -1414,6 +1418,8 @@ lapacke_slacpy.o \ | |||
| lapacke_slacpy_work.o \ | |||
| lapacke_slamch.o \ | |||
| lapacke_slamch_work.o \ | |||
| lapacke_slangb.o \ | |||
| lapacke_slangb_work.o \ | |||
| lapacke_slange.o \ | |||
| lapacke_slange_work.o \ | |||
| lapacke_slansy.o \ | |||
| @@ -2116,6 +2122,8 @@ lapacke_zlacrm.o \ | |||
| lapacke_zlacrm_work.o \ | |||
| lapacke_zlag2c.o \ | |||
| lapacke_zlag2c_work.o \ | |||
| lapacke_zlangb.o \ | |||
| lapacke_zlangb_work.o \ | |||
| lapacke_zlange.o \ | |||
| lapacke_zlange_work.o \ | |||
| lapacke_zlanhe.o \ | |||
| @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, | |||
| lapack_int lrwork = -1; | |||
| float* rwork = NULL; | |||
| float rwork_query; | |||
| lapack_int i; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); | |||
| return -1; | |||
| @@ -0,0 +1,73 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native high-level C interface to LAPACK function clangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, | |||
| const lapack_complex_float* ab, lapack_int ldab ) | |||
| { | |||
| lapack_int info = 0; | |||
| float res = 0.; | |||
| float* work = NULL; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_clangb", -1 ); | |||
| return -1; | |||
| } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| if( LAPACKE_cgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { | |||
| return -6; | |||
| } | |||
| } | |||
| #endif | |||
| /* Allocate memory for working array(s) */ | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); | |||
| if( work == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| } | |||
| /* Call middle-level interface */ | |||
| res = LAPACKE_clangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); | |||
| /* Release memory and exit */ | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| LAPACKE_free( work ); | |||
| } | |||
| exit_level_0: | |||
| if( info == LAPACK_WORK_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_clangb", info ); | |||
| } | |||
| return res; | |||
| } | |||
| @@ -0,0 +1,84 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native middle-level C interface to LAPACK function clangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, | |||
| const lapack_complex_float* ab, lapack_int ldab, | |||
| float* work ) | |||
| { | |||
| lapack_int info = 0; | |||
| float res = 0.; | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Call LAPACK function and adjust info */ | |||
| res = LAPACK_clangb( &norm, &n, &kl, &ku, ab, &ldab, work ); | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| char norm_lapack; | |||
| float* work_lapack = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( ldab < kl+ku+1 ) { | |||
| info = -7; | |||
| LAPACKE_xerbla( "LAPACKE_clangb_work", info ); | |||
| return info; | |||
| } | |||
| if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { | |||
| norm_lapack = 'i'; | |||
| } else if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| norm_lapack = '1'; | |||
| } else { | |||
| norm_lapack = norm; | |||
| } | |||
| /* Allocate memory for work array(s) */ | |||
| if( LAPACKE_lsame( norm_lapack, 'i' ) ) { | |||
| work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); | |||
| if( work_lapack == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| } | |||
| /* Call LAPACK function */ | |||
| res = LAPACK_clangb( &norm, &n, &ku, &kl, ab, &ldab, work ); | |||
| /* Release memory and exit */ | |||
| if( work_lapack ) { | |||
| LAPACKE_free( work_lapack ); | |||
| } | |||
| exit_level_0: | |||
| if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_clangb_work", info ); | |||
| } | |||
| } else { | |||
| info = -1; | |||
| LAPACKE_xerbla( "LAPACKE_clangb_work", info ); | |||
| } | |||
| return res; | |||
| } | |||
| @@ -50,16 +50,24 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| lapack_int lda_t = MAX(1,k); | |||
| lapack_int nrowsA, ncolsA, nrowsV; | |||
| if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } | |||
| else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } | |||
| else { | |||
| info = -2; | |||
| LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| lapack_int lda_t = MAX(1,nrowsA); | |||
| lapack_int ldb_t = MAX(1,m); | |||
| lapack_int ldt_t = MAX(1,ldt); | |||
| lapack_int ldv_t = MAX(1,ldv); | |||
| lapack_int ldt_t = MAX(1,nb); | |||
| lapack_int ldv_t = MAX(1,nrowsV); | |||
| lapack_complex_float* v_t = NULL; | |||
| lapack_complex_float* t_t = NULL; | |||
| lapack_complex_float* a_t = NULL; | |||
| lapack_complex_float* b_t = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( lda < m ) { | |||
| if( lda < ncolsA ) { | |||
| info = -14; | |||
| LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); | |||
| return info; | |||
| @@ -69,7 +77,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, | |||
| LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| if( ldt < nb ) { | |||
| if( ldt < k ) { | |||
| info = -12; | |||
| LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); | |||
| return info; | |||
| @@ -87,13 +95,13 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_0; | |||
| } | |||
| t_t = (lapack_complex_float*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,nb) ); | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,k) ); | |||
| if( t_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_1; | |||
| } | |||
| a_t = (lapack_complex_float*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,ncolsA) ); | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_2; | |||
| @@ -105,10 +113,10 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_3; | |||
| } | |||
| /* Transpose input matrices */ | |||
| LAPACKE_cge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_cge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); | |||
| LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); | |||
| LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); | |||
| LAPACKE_cge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_ctpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, | |||
| &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); | |||
| @@ -116,7 +124,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| /* Transpose output matrices */ | |||
| LAPACKE_cge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); | |||
| LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); | |||
| LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); | |||
| /* Release memory and exit */ | |||
| LAPACKE_free( b_t ); | |||
| @@ -0,0 +1,56 @@ | |||
| #include "lapacke_utils.h" | |||
| lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_float* a, lapack_int lda, | |||
| const lapack_complex_float* b, lapack_int ldb, | |||
| lapack_complex_float* c, lapack_int ldc, | |||
| float* scale ) | |||
| { | |||
| lapack_int info = 0; | |||
| float swork_query[2]; | |||
| float* swork = NULL; | |||
| lapack_int ldswork = -1; | |||
| lapack_int swork_size = -1; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); | |||
| return -1; | |||
| } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { | |||
| return -7; | |||
| } | |||
| if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { | |||
| return -9; | |||
| } | |||
| if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -11; | |||
| } | |||
| } | |||
| #endif | |||
| /* Query optimal working array sizes */ | |||
| info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, | |||
| b, ldb, c, ldc, scale, swork_query, ldswork ); | |||
| if( info != 0 ) { | |||
| goto exit_level_0; | |||
| } | |||
| ldswork = swork_query[0]; | |||
| swork_size = ldswork * swork_query[1]; | |||
| swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); | |||
| if( swork == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| /* Call middle-level interface */ | |||
| info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, | |||
| lda, b, ldb, c, ldc, scale, swork, ldswork ); | |||
| /* Release memory and exit */ | |||
| LAPACKE_free( swork ); | |||
| exit_level_0: | |||
| if( info == LAPACK_WORK_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); | |||
| } | |||
| return info; | |||
| } | |||
| @@ -0,0 +1,88 @@ | |||
| #include "lapacke_utils.h" | |||
| lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_float* a, lapack_int lda, | |||
| const lapack_complex_float* b, lapack_int ldb, | |||
| lapack_complex_float* c, lapack_int ldc, | |||
| float* scale, float* swork, | |||
| lapack_int ldswork ) | |||
| { | |||
| lapack_int info = 0; | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, | |||
| scale, swork, &ldswork, &info ); | |||
| if( info < 0 ) { | |||
| info = info - 1; | |||
| } | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| lapack_int lda_t = MAX(1,m); | |||
| lapack_int ldb_t = MAX(1,n); | |||
| lapack_int ldc_t = MAX(1,m); | |||
| lapack_complex_float* a_t = NULL; | |||
| lapack_complex_float* b_t = NULL; | |||
| lapack_complex_float* c_t = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( lda < m ) { | |||
| info = -8; | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); | |||
| return info; | |||
| } | |||
| if( ldb < n ) { | |||
| info = -10; | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); | |||
| return info; | |||
| } | |||
| if( ldc < n ) { | |||
| info = -12; | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); | |||
| return info; | |||
| } | |||
| /* Allocate memory for temporary array(s) */ | |||
| a_t = (lapack_complex_float*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| b_t = (lapack_complex_float*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); | |||
| if( b_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_1; | |||
| } | |||
| c_t = (lapack_complex_float*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); | |||
| if( c_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_2; | |||
| } | |||
| /* Transpose input matrices */ | |||
| LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); | |||
| LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); | |||
| LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, | |||
| c_t, &ldc_t, scale, swork, &ldswork, &info ); | |||
| if( info < 0 ) { | |||
| info = info - 1; | |||
| } | |||
| /* Transpose output matrices */ | |||
| LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); | |||
| /* Release memory and exit */ | |||
| LAPACKE_free( c_t ); | |||
| exit_level_2: | |||
| LAPACKE_free( b_t ); | |||
| exit_level_1: | |||
| LAPACKE_free( a_t ); | |||
| exit_level_0: | |||
| if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); | |||
| } | |||
| } else { | |||
| info = -1; | |||
| LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); | |||
| } | |||
| return info; | |||
| } | |||
| @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, | |||
| lapack_int lrwork = -1; | |||
| double* rwork = NULL; | |||
| double rwork_query; | |||
| lapack_int i; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); | |||
| return -1; | |||
| @@ -0,0 +1,73 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native high-level C interface to LAPACK function dlangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, const double* ab, | |||
| lapack_int ldab ) | |||
| { | |||
| lapack_int info = 0; | |||
| double res = 0.; | |||
| double* work = NULL; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_dlangb", -1 ); | |||
| return -1; | |||
| } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Optionally check input matrices for NaNs */ | |||
| if( LAPACKE_dgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { | |||
| return -6; | |||
| } | |||
| } | |||
| #endif | |||
| /* Allocate memory for working array(s) */ | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); | |||
| if( work == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| } | |||
| /* Call middle-level interface */ | |||
| res = LAPACKE_dlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); | |||
| /* Release memory and exit */ | |||
| if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| LAPACKE_free( work ); | |||
| } | |||
| exit_level_0: | |||
| if( info == LAPACK_WORK_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_dlangb", info ); | |||
| } | |||
| return res; | |||
| } | |||
| @@ -0,0 +1,83 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native middle-level C interface to LAPACK function dlangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, | |||
| lapack_int kl, lapack_int ku, const double* ab, | |||
| lapack_int ldab, double* work ) | |||
| { | |||
| lapack_int info = 0; | |||
| double res = 0.; | |||
| if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Call LAPACK function and adjust info */ | |||
| res = LAPACK_dlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| char norm_lapack; | |||
| double* work_lapack = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( ldab < kl+ku+1 ) { | |||
| info = -7; | |||
| LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); | |||
| return info; | |||
| } | |||
| if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { | |||
| norm_lapack = 'i'; | |||
| } else if( LAPACKE_lsame( norm, 'i' ) ) { | |||
| norm_lapack = '1'; | |||
| } else { | |||
| norm_lapack = norm; | |||
| } | |||
| /* Allocate memory for work array(s) */ | |||
| if( LAPACKE_lsame( norm_lapack, 'i' ) ) { | |||
| work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); | |||
| if( work_lapack == NULL ) { | |||
| info = LAPACK_WORK_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| } | |||
| /* Call LAPACK function */ | |||
| res = LAPACK_dlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); | |||
| /* Release memory and exit */ | |||
| if( work_lapack ) { | |||
| LAPACKE_free( work_lapack ); | |||
| } | |||
| exit_level_0: | |||
| if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); | |||
| } | |||
| } else { | |||
| info = -1; | |||
| LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); | |||
| } | |||
| return res; | |||
| } | |||
| @@ -48,16 +48,24 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| lapack_int lda_t = MAX(1,k); | |||
| lapack_int nrowsA, ncolsA, nrowsV; | |||
| if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } | |||
| else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } | |||
| else { | |||
| info = -2; | |||
| LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| lapack_int lda_t = MAX(1,nrowsA); | |||
| lapack_int ldb_t = MAX(1,m); | |||
| lapack_int ldt_t = MAX(1,ldt); | |||
| lapack_int ldv_t = MAX(1,ldv); | |||
| lapack_int ldt_t = MAX(1,nb); | |||
| lapack_int ldv_t = MAX(1,nrowsV); | |||
| double* v_t = NULL; | |||
| double* t_t = NULL; | |||
| double* a_t = NULL; | |||
| double* b_t = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( lda < m ) { | |||
| if( lda < ncolsA ) { | |||
| info = -14; | |||
| LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); | |||
| return info; | |||
| @@ -67,7 +75,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, | |||
| LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| if( ldt < nb ) { | |||
| if( ldt < k ) { | |||
| info = -12; | |||
| LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); | |||
| return info; | |||
| @@ -83,12 +91,12 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,nb) ); | |||
| t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,k) ); | |||
| if( t_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_1; | |||
| } | |||
| a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); | |||
| a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,ncolsA) ); | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_2; | |||
| @@ -99,10 +107,10 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_3; | |||
| } | |||
| /* Transpose input matrices */ | |||
| LAPACKE_dge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_dge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); | |||
| LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); | |||
| LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); | |||
| LAPACKE_dge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_dtpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, | |||
| &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); | |||
| @@ -110,7 +118,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| /* Transpose output matrices */ | |||
| LAPACKE_dge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); | |||
| LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); | |||
| LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); | |||
| /* Release memory and exit */ | |||
| LAPACKE_free( b_t ); | |||
| @@ -0,0 +1,68 @@ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * High-level C interface to LAPACK routine dtrsyl3. | |||
|  * | |||
|  * Steps: validate matrix_layout, optionally scan a (m x m), b (n x n) and | |||
|  * c (m x n) for NaNs, ask LAPACKE_dtrsyl3_work for the workspace sizes, | |||
|  * allocate the real and integer workspaces, then run the computation. | |||
|  * | |||
|  * Returns: | |||
|  *   -1                        unknown matrix_layout (reported via LAPACKE_xerbla) | |||
|  *   -7 / -9 / -11             NaN check tripped on a / b / c | |||
|  *   LAPACK_WORK_MEMORY_ERROR  a workspace allocation failed (reported via | |||
|  *                             LAPACKE_xerbla) | |||
|  *   otherwise                 whatever LAPACKE_dtrsyl3_work returned | |||
|  */ | |||
| lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, | |||
|                             lapack_int isgn, lapack_int m, lapack_int n, | |||
|                             const double* a, lapack_int lda, const double* b, | |||
|                             lapack_int ldb, double* c, lapack_int ldc, | |||
|                             double* scale ) | |||
| { | |||
|     lapack_int status = 0; | |||
|     /* Outputs of the workspace query */ | |||
|     lapack_int int_ws_probe; | |||
|     double real_ws_probe[2]; | |||
|     /* Sizes start at -1 so that the first call only queries the sizes */ | |||
|     lapack_int int_ws_len = -1; | |||
|     lapack_int real_ws_ld = -1; | |||
|     lapack_int real_ws_len = -1; | |||
|     lapack_int* int_ws = NULL; | |||
|     double* real_ws = NULL; | |||
|     if( !( matrix_layout == LAPACK_COL_MAJOR || | |||
|            matrix_layout == LAPACK_ROW_MAJOR ) ) { | |||
|         LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); | |||
|         return -1; | |||
|     } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
|     if( LAPACKE_get_nancheck() ) { | |||
|         /* Input screening is optional and reports the offending argument */ | |||
|         if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { | |||
|             return -7; | |||
|         } | |||
|         if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { | |||
|             return -9; | |||
|         } | |||
|         if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
|             return -11; | |||
|         } | |||
|     } | |||
| #endif | |||
|     /* Workspace size query */ | |||
|     status = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, | |||
|                                    lda, b, ldb, c, ldc, scale, &int_ws_probe, | |||
|                                    int_ws_len, real_ws_probe, real_ws_ld ); | |||
|     if( status == 0 ) { | |||
|         /* The query yields two values: leading dimension and second extent */ | |||
|         real_ws_ld = real_ws_probe[0]; | |||
|         real_ws_len = real_ws_ld * real_ws_probe[1]; | |||
|         real_ws = (double*)LAPACKE_malloc( sizeof(double) * real_ws_len); | |||
|         if( real_ws == NULL ) { | |||
|             status = LAPACK_WORK_MEMORY_ERROR; | |||
|         } else { | |||
|             int_ws_len = int_ws_probe; | |||
|             int_ws = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * int_ws_len ); | |||
|             if( int_ws == NULL ) { | |||
|                 status = LAPACK_WORK_MEMORY_ERROR; | |||
|             } else { | |||
|                 /* Actual computation through the middle-level interface */ | |||
|                 status = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, | |||
|                                                isgn, m, n, a, lda, b, ldb, c, | |||
|                                                ldc, scale, int_ws, int_ws_len, | |||
|                                                real_ws, real_ws_ld ); | |||
|                 LAPACKE_free( int_ws ); | |||
|             } | |||
|             LAPACKE_free( real_ws ); | |||
|         } | |||
|     } | |||
|     if( status == LAPACK_WORK_MEMORY_ERROR ) { | |||
|         LAPACKE_xerbla( "LAPACKE_dtrsyl3", status ); | |||
|     } | |||
|     return status; | |||
| } | |||
| @@ -0,0 +1,86 @@ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * Middle-level C interface to LAPACK routine dtrsyl3. | |||
|  * | |||
|  * Column-major input is passed to LAPACK as is.  Row-major input is checked | |||
|  * (lda >= m, ldb >= n, ldc >= n), copied into transposed temporaries, passed | |||
|  * to LAPACK, and the resulting c is transposed back into the caller's array. | |||
|  * | |||
|  * Returns: | |||
|  *   -1                             unknown matrix_layout | |||
|  *   -8 / -10 / -12                 lda / ldb / ldc too small (row-major only) | |||
|  *   LAPACK_TRANSPOSE_MEMORY_ERROR  a temporary could not be allocated | |||
|  *   otherwise                      LAPACK's info; negative values are | |||
|  *                                  decremented by one | |||
|  * Every error listed above is also reported through LAPACKE_xerbla. | |||
|  */ | |||
| lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, | |||
|                                  lapack_int isgn, lapack_int m, lapack_int n, | |||
|                                  const double* a, lapack_int lda, | |||
|                                  const double* b, lapack_int ldb, double* c, | |||
|                                  lapack_int ldc, double* scale, | |||
|                                  lapack_int* iwork, lapack_int liwork, | |||
|                                  double* swork, lapack_int ldswork ) | |||
| { | |||
|     lapack_int status = 0; | |||
|     if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
|         /* Native layout: call LAPACK directly */ | |||
|         LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, | |||
|                         &ldc, scale, iwork, &liwork, swork, &ldswork, | |||
|                         &status ); | |||
|         if( status < 0 ) { | |||
|             status -= 1; | |||
|         } | |||
|     } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
|         /* Leading dimensions of the transposed temporaries */ | |||
|         lapack_int a_cm_ld = MAX(1,m); | |||
|         lapack_int b_cm_ld = MAX(1,n); | |||
|         lapack_int c_cm_ld = MAX(1,m); | |||
|         double* a_cm = NULL; | |||
|         double* b_cm = NULL; | |||
|         double* c_cm = NULL; | |||
|         /* Reject leading dimensions that are too small */ | |||
|         if( lda < m ) { | |||
|             status = -8; | |||
|             LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", status ); | |||
|             return status; | |||
|         } | |||
|         if( ldb < n ) { | |||
|             status = -10; | |||
|             LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", status ); | |||
|             return status; | |||
|         } | |||
|         if( ldc < n ) { | |||
|             status = -12; | |||
|             LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", status ); | |||
|             return status; | |||
|         } | |||
|         /* Each nesting level owns one temporary and frees it on the way out */ | |||
|         a_cm = (double*)LAPACKE_malloc( sizeof(double) * a_cm_ld * MAX(1,m) ); | |||
|         if( a_cm == NULL ) { | |||
|             status = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
|         } else { | |||
|             b_cm = (double*)LAPACKE_malloc( sizeof(double) * b_cm_ld * MAX(1,n) ); | |||
|             if( b_cm == NULL ) { | |||
|                 status = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
|             } else { | |||
|                 c_cm = (double*)LAPACKE_malloc( sizeof(double) * c_cm_ld * MAX(1,n) ); | |||
|                 if( c_cm == NULL ) { | |||
|                     status = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
|                 } else { | |||
|                     /* Transpose the inputs into the temporaries */ | |||
|                     LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_cm, a_cm_ld ); | |||
|                     LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_cm, b_cm_ld ); | |||
|                     LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_cm, c_cm_ld ); | |||
|                     /* Run LAPACK on the transposed copies */ | |||
|                     LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_cm, | |||
|                                     &a_cm_ld, b_cm, &b_cm_ld, c_cm, &c_cm_ld, | |||
|                                     scale, iwork, &liwork, swork, &ldswork, | |||
|                                     &status ); | |||
|                     if( status < 0 ) { | |||
|                         status -= 1; | |||
|                     } | |||
|                     /* Only c is an output; copy it back in the caller's layout */ | |||
|                     LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_cm, c_cm_ld, c, ldc ); | |||
|                     LAPACKE_free( c_cm ); | |||
|                 } | |||
|                 LAPACKE_free( b_cm ); | |||
|             } | |||
|             LAPACKE_free( a_cm ); | |||
|         } | |||
|         if( status == LAPACK_TRANSPOSE_MEMORY_ERROR ) { | |||
|             LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", status ); | |||
|         } | |||
|     } else { | |||
|         status = -1; | |||
|         LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", status ); | |||
|     } | |||
|     return status; | |||
| } | |||
| @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, | |||
| lapack_int lrwork = -1; | |||
| float* rwork = NULL; | |||
| float rwork_query; | |||
| lapack_int i; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); | |||
| return -1; | |||
| @@ -0,0 +1,73 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native high-level C interface to LAPACK function slangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * High-level C interface to LAPACK function slangb. | |||
|  * | |||
|  * Validates matrix_layout, optionally scans the band matrix ab for NaNs, | |||
|  * allocates a scratch array of MAX(1,n) floats when norm is 'i' (compared | |||
|  * with LAPACKE_lsame), and delegates to LAPACKE_slangb_work. | |||
|  * | |||
|  * Returns the value produced by LAPACKE_slangb_work.  Failures are returned | |||
|  * in the same float: -1 for an unknown layout, -6 when the NaN check trips, | |||
|  * and 0 when the scratch array cannot be allocated (that case is reported | |||
|  * through LAPACKE_xerbla with LAPACK_WORK_MEMORY_ERROR). | |||
|  */ | |||
| float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, | |||
|                       lapack_int kl, lapack_int ku, const float* ab, | |||
|                       lapack_int ldab ) | |||
| { | |||
|     float norm_value = 0.; | |||
|     float* scratch = NULL; | |||
|     lapack_logical needs_scratch; | |||
|     if( !( matrix_layout == LAPACK_COL_MAJOR || | |||
|            matrix_layout == LAPACK_ROW_MAJOR ) ) { | |||
|         LAPACKE_xerbla( "LAPACKE_slangb", -1 ); | |||
|         return -1; | |||
|     } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
|     if( LAPACKE_get_nancheck() ) { | |||
|         /* Input screening is optional */ | |||
|         if( LAPACKE_sgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { | |||
|             return -6; | |||
|         } | |||
|     } | |||
| #endif | |||
|     /* Scratch space is only set up for the 'i' norm */ | |||
|     needs_scratch = LAPACKE_lsame( norm, 'i' ); | |||
|     if( needs_scratch ) { | |||
|         scratch = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); | |||
|         if( scratch == NULL ) { | |||
|             LAPACKE_xerbla( "LAPACKE_slangb", LAPACK_WORK_MEMORY_ERROR ); | |||
|             return norm_value; | |||
|         } | |||
|     } | |||
|     /* Delegate to the middle-level interface */ | |||
|     norm_value = LAPACKE_slangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, | |||
|                                       scratch ); | |||
|     if( needs_scratch ) { | |||
|         LAPACKE_free( scratch ); | |||
|     } | |||
|     return norm_value; | |||
| } | |||
| @@ -0,0 +1,83 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native middle-level C interface to LAPACK function slangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * Middle-level C interface to LAPACK function slangb. | |||
|  * | |||
|  * Column-major: ab, norm and the caller's work array are passed to LAPACK | |||
|  * unchanged. | |||
|  * | |||
|  * Row-major: the array is handed to LAPACK with kl and ku exchanged, so the | |||
|  * norm selector is exchanged as well: '1'/'o' becomes 'i' and 'i' becomes | |||
|  * '1'; any other selector is passed through.  When the translated selector | |||
|  * is 'i', a scratch array of MAX(1,n) floats is allocated here; the caller's | |||
|  * work array is not used on this path. | |||
|  * | |||
|  * Returns the value computed by LAPACK.  Failures are returned in the same | |||
|  * float: -7 when ldab < kl+ku+1 (row-major only), and 0 when the layout is | |||
|  * unknown or the scratch allocation fails.  All failures are reported | |||
|  * through LAPACKE_xerbla. | |||
|  */ | |||
| float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, | |||
|                            lapack_int kl, lapack_int ku, const float* ab, | |||
|                            lapack_int ldab, float* work ) | |||
| { | |||
|     lapack_int info = 0; | |||
|     float res = 0.; | |||
|     if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
|         /* Call LAPACK function */ | |||
|         res = LAPACK_slangb( &norm, &n, &kl, &ku, ab, &ldab, work ); | |||
|     } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
|         char norm_lapack; | |||
|         float* work_lapack = NULL; | |||
|         /* Check leading dimension(s) */ | |||
|         if( ldab < kl+ku+1 ) { | |||
|             info = -7; | |||
|             LAPACKE_xerbla( "LAPACKE_slangb_work", info ); | |||
|             return info; | |||
|         } | |||
|         /* Exchange the one-norm and infinity-norm selectors */ | |||
|         if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { | |||
|             norm_lapack = 'i'; | |||
|         } else if( LAPACKE_lsame( norm, 'i' ) ) { | |||
|             norm_lapack = '1'; | |||
|         } else { | |||
|             norm_lapack = norm; | |||
|         } | |||
|         /* Allocate memory for work array(s) */ | |||
|         if( LAPACKE_lsame( norm_lapack, 'i' ) ) { | |||
|             work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); | |||
|             if( work_lapack == NULL ) { | |||
|                 info = LAPACK_WORK_MEMORY_ERROR; | |||
|                 goto exit_level_0; | |||
|             } | |||
|         } | |||
|         /* Call LAPACK with the translated selector and the scratch array | |||
|          * sized for it; passing norm and work here would compute the other | |||
|          * norm and leave the scratch array unused */ | |||
|         res = LAPACK_slangb( &norm_lapack, &n, &ku, &kl, ab, &ldab, | |||
|                              work_lapack ); | |||
|         /* Release memory and exit */ | |||
|         if( work_lapack ) { | |||
|             LAPACKE_free( work_lapack ); | |||
|         } | |||
| exit_level_0: | |||
|         /* Must match the code set on allocation failure above */ | |||
|         if( info == LAPACK_WORK_MEMORY_ERROR ) { | |||
|             LAPACKE_xerbla( "LAPACKE_slangb_work", info ); | |||
|         } | |||
|     } else { | |||
|         info = -1; | |||
|         LAPACKE_xerbla( "LAPACKE_slangb_work", info ); | |||
|     } | |||
|     return res; | |||
| } | |||
| @@ -48,16 +48,24 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| lapack_int lda_t = MAX(1,k); | |||
| lapack_int nrowsA, ncolsA, nrowsV; | |||
| if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } | |||
| else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } | |||
| else { | |||
| info = -2; | |||
| LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| lapack_int lda_t = MAX(1,nrowsA); | |||
| lapack_int ldb_t = MAX(1,m); | |||
| lapack_int ldt_t = MAX(1,ldt); | |||
| lapack_int ldv_t = MAX(1,ldv); | |||
| lapack_int ldt_t = MAX(1,nb); | |||
| lapack_int ldv_t = MAX(1,nrowsV); | |||
| float* v_t = NULL; | |||
| float* t_t = NULL; | |||
| float* a_t = NULL; | |||
| float* b_t = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( lda < m ) { | |||
| if( lda < ncolsA ) { | |||
| info = -14; | |||
| LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); | |||
| return info; | |||
| @@ -67,7 +75,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, | |||
| LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| if( ldt < nb ) { | |||
| if( ldt < k ) { | |||
| info = -12; | |||
| LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); | |||
| return info; | |||
| @@ -83,12 +91,12 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_0; | |||
| } | |||
| t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,nb) ); | |||
| t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,k) ); | |||
| if( t_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_1; | |||
| } | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); | |||
| a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,ncolsA) ); | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_2; | |||
| @@ -99,10 +107,10 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_3; | |||
| } | |||
| /* Transpose input matrices */ | |||
| LAPACKE_sge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_sge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); | |||
| LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); | |||
| LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); | |||
| LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_stpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, | |||
| &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); | |||
| @@ -110,7 +118,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| /* Transpose output matrices */ | |||
| LAPACKE_sge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); | |||
| LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); | |||
| LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); | |||
| /* Release memory and exit */ | |||
| LAPACKE_free( b_t ); | |||
| @@ -0,0 +1,68 @@ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * High-level C interface to LAPACK routine strsyl3. | |||
|  * | |||
|  * Steps: validate matrix_layout, optionally scan a (m x m), b (n x n) and | |||
|  * c (m x n) for NaNs, ask LAPACKE_strsyl3_work for the workspace sizes, | |||
|  * allocate the real and integer workspaces, then run the computation. | |||
|  * | |||
|  * Returns: | |||
|  *   -1                        unknown matrix_layout (reported via LAPACKE_xerbla) | |||
|  *   -7 / -9 / -11             NaN check tripped on a / b / c | |||
|  *   LAPACK_WORK_MEMORY_ERROR  a workspace allocation failed (reported via | |||
|  *                             LAPACKE_xerbla) | |||
|  *   otherwise                 whatever LAPACKE_strsyl3_work returned | |||
|  */ | |||
| lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, | |||
|                             lapack_int isgn, lapack_int m, lapack_int n, | |||
|                             const float* a, lapack_int lda, const float* b, | |||
|                             lapack_int ldb, float* c, lapack_int ldc, | |||
|                             float* scale ) | |||
| { | |||
|     lapack_int status = 0; | |||
|     /* Outputs of the workspace query */ | |||
|     lapack_int int_ws_probe; | |||
|     float real_ws_probe[2]; | |||
|     /* Sizes start at -1 so that the first call only queries the sizes */ | |||
|     lapack_int int_ws_len = -1; | |||
|     lapack_int real_ws_ld = -1; | |||
|     lapack_int real_ws_len = -1; | |||
|     lapack_int* int_ws = NULL; | |||
|     float* real_ws = NULL; | |||
|     if( !( matrix_layout == LAPACK_COL_MAJOR || | |||
|            matrix_layout == LAPACK_ROW_MAJOR ) ) { | |||
|         LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); | |||
|         return -1; | |||
|     } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
|     if( LAPACKE_get_nancheck() ) { | |||
|         /* Input screening is optional and reports the offending argument */ | |||
|         if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { | |||
|             return -7; | |||
|         } | |||
|         if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { | |||
|             return -9; | |||
|         } | |||
|         if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
|             return -11; | |||
|         } | |||
|     } | |||
| #endif | |||
|     /* Workspace size query */ | |||
|     status = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, | |||
|                                    lda, b, ldb, c, ldc, scale, &int_ws_probe, | |||
|                                    int_ws_len, real_ws_probe, real_ws_ld ); | |||
|     if( status == 0 ) { | |||
|         /* The query yields two values: leading dimension and second extent */ | |||
|         real_ws_ld = real_ws_probe[0]; | |||
|         real_ws_len = real_ws_ld * real_ws_probe[1]; | |||
|         real_ws = (float*)LAPACKE_malloc( sizeof(float) * real_ws_len); | |||
|         if( real_ws == NULL ) { | |||
|             status = LAPACK_WORK_MEMORY_ERROR; | |||
|         } else { | |||
|             int_ws_len = int_ws_probe; | |||
|             int_ws = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * int_ws_len ); | |||
|             if( int_ws == NULL ) { | |||
|                 status = LAPACK_WORK_MEMORY_ERROR; | |||
|             } else { | |||
|                 /* Actual computation through the middle-level interface */ | |||
|                 status = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, | |||
|                                                isgn, m, n, a, lda, b, ldb, c, | |||
|                                                ldc, scale, int_ws, int_ws_len, | |||
|                                                real_ws, real_ws_ld ); | |||
|                 LAPACKE_free( int_ws ); | |||
|             } | |||
|             LAPACKE_free( real_ws ); | |||
|         } | |||
|     } | |||
|     if( status == LAPACK_WORK_MEMORY_ERROR ) { | |||
|         LAPACKE_xerbla( "LAPACKE_strsyl3", status ); | |||
|     } | |||
|     return status; | |||
| } | |||
| @@ -0,0 +1,86 @@ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * Middle-level C interface to LAPACK routine strsyl3. | |||
|  * | |||
|  * Column-major input is passed to LAPACK as is.  Row-major input is checked | |||
|  * (lda >= m, ldb >= n, ldc >= n), copied into transposed temporaries, passed | |||
|  * to LAPACK, and the resulting c is transposed back into the caller's array. | |||
|  * | |||
|  * Returns: | |||
|  *   -1                             unknown matrix_layout | |||
|  *   -8 / -10 / -12                 lda / ldb / ldc too small (row-major only) | |||
|  *   LAPACK_TRANSPOSE_MEMORY_ERROR  a temporary could not be allocated | |||
|  *   otherwise                      LAPACK's info; negative values are | |||
|  *                                  decremented by one | |||
|  * Every error listed above is also reported through LAPACKE_xerbla. | |||
|  */ | |||
| lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, | |||
|                                  lapack_int isgn, lapack_int m, lapack_int n, | |||
|                                  const float* a, lapack_int lda, | |||
|                                  const float* b, lapack_int ldb, float* c, | |||
|                                  lapack_int ldc, float* scale, | |||
|                                  lapack_int* iwork, lapack_int liwork, | |||
|                                  float* swork, lapack_int ldswork ) | |||
| { | |||
|     lapack_int status = 0; | |||
|     if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
|         /* Native layout: call LAPACK directly */ | |||
|         LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, | |||
|                         &ldc, scale, iwork, &liwork, swork, &ldswork, | |||
|                         &status ); | |||
|         if( status < 0 ) { | |||
|             status -= 1; | |||
|         } | |||
|     } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
|         /* Leading dimensions of the transposed temporaries */ | |||
|         lapack_int a_cm_ld = MAX(1,m); | |||
|         lapack_int b_cm_ld = MAX(1,n); | |||
|         lapack_int c_cm_ld = MAX(1,m); | |||
|         float* a_cm = NULL; | |||
|         float* b_cm = NULL; | |||
|         float* c_cm = NULL; | |||
|         /* Reject leading dimensions that are too small */ | |||
|         if( lda < m ) { | |||
|             status = -8; | |||
|             LAPACKE_xerbla( "LAPACKE_strsyl3_work", status ); | |||
|             return status; | |||
|         } | |||
|         if( ldb < n ) { | |||
|             status = -10; | |||
|             LAPACKE_xerbla( "LAPACKE_strsyl3_work", status ); | |||
|             return status; | |||
|         } | |||
|         if( ldc < n ) { | |||
|             status = -12; | |||
|             LAPACKE_xerbla( "LAPACKE_strsyl3_work", status ); | |||
|             return status; | |||
|         } | |||
|         /* Each nesting level owns one temporary and frees it on the way out */ | |||
|         a_cm = (float*)LAPACKE_malloc( sizeof(float) * a_cm_ld * MAX(1,m) ); | |||
|         if( a_cm == NULL ) { | |||
|             status = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
|         } else { | |||
|             b_cm = (float*)LAPACKE_malloc( sizeof(float) * b_cm_ld * MAX(1,n) ); | |||
|             if( b_cm == NULL ) { | |||
|                 status = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
|             } else { | |||
|                 c_cm = (float*)LAPACKE_malloc( sizeof(float) * c_cm_ld * MAX(1,n) ); | |||
|                 if( c_cm == NULL ) { | |||
|                     status = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
|                 } else { | |||
|                     /* Transpose the inputs into the temporaries */ | |||
|                     LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_cm, a_cm_ld ); | |||
|                     LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_cm, b_cm_ld ); | |||
|                     LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_cm, c_cm_ld ); | |||
|                     /* Run LAPACK on the transposed copies */ | |||
|                     LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_cm, | |||
|                                     &a_cm_ld, b_cm, &b_cm_ld, c_cm, &c_cm_ld, | |||
|                                     scale, iwork, &liwork, swork, &ldswork, | |||
|                                     &status ); | |||
|                     if( status < 0 ) { | |||
|                         status -= 1; | |||
|                     } | |||
|                     /* Only c is an output; copy it back in the caller's layout */ | |||
|                     LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_cm, c_cm_ld, c, ldc ); | |||
|                     LAPACKE_free( c_cm ); | |||
|                 } | |||
|                 LAPACKE_free( b_cm ); | |||
|             } | |||
|             LAPACKE_free( a_cm ); | |||
|         } | |||
|         if( status == LAPACK_TRANSPOSE_MEMORY_ERROR ) { | |||
|             LAPACKE_xerbla( "LAPACKE_strsyl3_work", status ); | |||
|         } | |||
|     } else { | |||
|         status = -1; | |||
|         LAPACKE_xerbla( "LAPACKE_strsyl3_work", status ); | |||
|     } | |||
|     return status; | |||
| } | |||
| @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, | |||
| lapack_int lrwork = -1; | |||
| double* rwork = NULL; | |||
| double rwork_query; | |||
| lapack_int i; | |||
| if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { | |||
| LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); | |||
| return -1; | |||
| @@ -0,0 +1,73 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native high-level C interface to LAPACK function zlangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * High-level C interface to LAPACK function zlangb. | |||
|  * | |||
|  * Validates matrix_layout, optionally scans the complex band matrix ab for | |||
|  * NaNs, allocates a scratch array of MAX(1,n) doubles when norm is 'i' | |||
|  * (compared with LAPACKE_lsame), and delegates to LAPACKE_zlangb_work. | |||
|  * | |||
|  * Returns the value produced by LAPACKE_zlangb_work.  Failures are returned | |||
|  * in the same double: -1 for an unknown layout, -6 when the NaN check trips, | |||
|  * and 0 when the scratch array cannot be allocated (that case is reported | |||
|  * through LAPACKE_xerbla with LAPACK_WORK_MEMORY_ERROR). | |||
|  */ | |||
| double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, | |||
|                        lapack_int kl, lapack_int ku, | |||
|                        const lapack_complex_double* ab, lapack_int ldab ) | |||
| { | |||
|     double norm_value = 0.; | |||
|     double* scratch = NULL; | |||
|     lapack_logical needs_scratch; | |||
|     if( !( matrix_layout == LAPACK_COL_MAJOR || | |||
|            matrix_layout == LAPACK_ROW_MAJOR ) ) { | |||
|         LAPACKE_xerbla( "LAPACKE_zlangb", -1 ); | |||
|         return -1; | |||
|     } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
|     if( LAPACKE_get_nancheck() ) { | |||
|         /* Input screening is optional */ | |||
|         if( LAPACKE_zgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { | |||
|             return -6; | |||
|         } | |||
|     } | |||
| #endif | |||
|     /* Scratch space is only set up for the 'i' norm */ | |||
|     needs_scratch = LAPACKE_lsame( norm, 'i' ); | |||
|     if( needs_scratch ) { | |||
|         scratch = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); | |||
|         if( scratch == NULL ) { | |||
|             LAPACKE_xerbla( "LAPACKE_zlangb", LAPACK_WORK_MEMORY_ERROR ); | |||
|             return norm_value; | |||
|         } | |||
|     } | |||
|     /* Delegate to the middle-level interface */ | |||
|     norm_value = LAPACKE_zlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, | |||
|                                       scratch ); | |||
|     if( needs_scratch ) { | |||
|         LAPACKE_free( scratch ); | |||
|     } | |||
|     return norm_value; | |||
| } | |||
| @@ -0,0 +1,84 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2022, Intel Corp. | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are met: | |||
| * Redistributions of source code must retain the above copyright notice, | |||
| this list of conditions and the following disclaimer. | |||
| * Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in the | |||
| documentation and/or other materials provided with the distribution. | |||
| * Neither the name of Intel Corporation nor the names of its contributors | |||
| may be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | |||
| THE POSSIBILITY OF SUCH DAMAGE. | |||
| ***************************************************************************** | |||
| * Contents: Native middle-level C interface to LAPACK function zlangb | |||
| * Author: Simon Märtens | |||
| *****************************************************************************/ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
|  * Middle-level C interface to LAPACK function zlangb. | |||
|  * | |||
|  * Column-major: ab, norm and the caller's work array are passed to LAPACK | |||
|  * unchanged. | |||
|  * | |||
|  * Row-major: the array is handed to LAPACK with kl and ku exchanged, so the | |||
|  * norm selector is exchanged as well: '1'/'o' becomes 'i' and 'i' becomes | |||
|  * '1'; any other selector is passed through.  When the translated selector | |||
|  * is 'i', a scratch array of MAX(1,n) doubles is allocated here; the | |||
|  * caller's work array is not used on this path. | |||
|  * | |||
|  * Returns the value computed by LAPACK.  Failures are returned in the same | |||
|  * double: -7 when ldab < kl+ku+1 (row-major only), and 0 when the layout is | |||
|  * unknown or the scratch allocation fails.  All failures are reported | |||
|  * through LAPACKE_xerbla. | |||
|  */ | |||
| double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, | |||
|                             lapack_int kl, lapack_int ku, | |||
|                             const lapack_complex_double* ab, lapack_int ldab, | |||
|                             double* work ) | |||
| { | |||
|     lapack_int info = 0; | |||
|     double res = 0.; | |||
|     if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
|         /* Call LAPACK function */ | |||
|         res = LAPACK_zlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); | |||
|     } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
|         char norm_lapack; | |||
|         double* work_lapack = NULL; | |||
|         /* Check leading dimension(s) */ | |||
|         if( ldab < kl+ku+1 ) { | |||
|             info = -7; | |||
|             LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); | |||
|             return info; | |||
|         } | |||
|         /* Exchange the one-norm and infinity-norm selectors */ | |||
|         if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { | |||
|             norm_lapack = 'i'; | |||
|         } else if( LAPACKE_lsame( norm, 'i' ) ) { | |||
|             norm_lapack = '1'; | |||
|         } else { | |||
|             norm_lapack = norm; | |||
|         } | |||
|         /* Allocate memory for work array(s) */ | |||
|         if( LAPACKE_lsame( norm_lapack, 'i' ) ) { | |||
|             work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); | |||
|             if( work_lapack == NULL ) { | |||
|                 info = LAPACK_WORK_MEMORY_ERROR; | |||
|                 goto exit_level_0; | |||
|             } | |||
|         } | |||
|         /* Call LAPACK with the translated selector and the scratch array | |||
|          * sized for it; passing norm and work here would compute the other | |||
|          * norm and leave the scratch array unused */ | |||
|         res = LAPACK_zlangb( &norm_lapack, &n, &ku, &kl, ab, &ldab, | |||
|                              work_lapack ); | |||
|         /* Release memory and exit */ | |||
|         if( work_lapack ) { | |||
|             LAPACKE_free( work_lapack ); | |||
|         } | |||
| exit_level_0: | |||
|         /* Must match the code set on allocation failure above */ | |||
|         if( info == LAPACK_WORK_MEMORY_ERROR ) { | |||
|             LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); | |||
|         } | |||
|     } else { | |||
|         info = -1; | |||
|         LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); | |||
|     } | |||
|     return res; | |||
| } | |||
| @@ -50,16 +50,24 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| } else if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| lapack_int lda_t = MAX(1,k); | |||
| lapack_int nrowsA, ncolsA, nrowsV; | |||
| if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } | |||
| else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } | |||
| else { | |||
| info = -2; | |||
| LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| lapack_int lda_t = MAX(1,nrowsA); | |||
| lapack_int ldb_t = MAX(1,m); | |||
| lapack_int ldt_t = MAX(1,ldt); | |||
| lapack_int ldv_t = MAX(1,ldv); | |||
| lapack_int ldt_t = MAX(1,nb); | |||
| lapack_int ldv_t = MAX(1,nrowsV); | |||
| lapack_complex_double* v_t = NULL; | |||
| lapack_complex_double* t_t = NULL; | |||
| lapack_complex_double* a_t = NULL; | |||
| lapack_complex_double* b_t = NULL; | |||
| /* Check leading dimension(s) */ | |||
| if( lda < m ) { | |||
| if( lda < ncolsA ) { | |||
| info = -14; | |||
| LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); | |||
| return info; | |||
| @@ -69,7 +77,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, | |||
| LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); | |||
| return info; | |||
| } | |||
| if( ldt < nb ) { | |||
| if( ldt < k ) { | |||
| info = -12; | |||
| LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); | |||
| return info; | |||
| @@ -87,13 +95,13 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_0; | |||
| } | |||
| t_t = (lapack_complex_double*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,nb) ); | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,k) ); | |||
| if( t_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_1; | |||
| } | |||
| a_t = (lapack_complex_double*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,ncolsA) ); | |||
| if( a_t == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto exit_level_2; | |||
| @@ -105,10 +113,10 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, | |||
| goto exit_level_3; | |||
| } | |||
| /* Transpose input matrices */ | |||
| LAPACKE_zge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_zge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); | |||
| LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); | |||
| /* Call LAPACK function and adjust info */ | |||
| LAPACK_ztpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, | |||
| &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); | |||
| @@ -116,7 +124,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, | |||
| info = info - 1; | |||
| } | |||
| /* Transpose output matrices */ | |||
| LAPACKE_zge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); | |||
| LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); | |||
| LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); | |||
| /* Release memory and exit */ | |||
| LAPACKE_free( b_t ); | |||
| @@ -0,0 +1,56 @@ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
| * LAPACKE_ztrsyl3 - high-level driver: validates the layout, optionally | |||
| * screens a, b and c for NaNs, asks LAPACKE_ztrsyl3_work for the scratch | |||
| * dimensions (ldswork = -1), allocates that scratch and runs the solve. | |||
| * | |||
| * Returns -1 for an invalid layout, -7/-9/-11 when a/b/c contain NaNs, | |||
| * LAPACK_WORK_MEMORY_ERROR when the scratch allocation fails, otherwise the | |||
| * status reported by LAPACKE_ztrsyl3_work. | |||
| */ | |||
| lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_double* a, lapack_int lda, | |||
| const lapack_complex_double* b, lapack_int ldb, | |||
| lapack_complex_double* c, lapack_int ldc, | |||
| double* scale ) | |||
| { | |||
| double query[2]; | |||
| double* scratch = NULL; | |||
| lapack_int ld_scratch = -1; | |||
| lapack_int scratch_len = -1; | |||
| lapack_int status = 0; | |||
| if( !( matrix_layout == LAPACK_COL_MAJOR || | |||
| matrix_layout == LAPACK_ROW_MAJOR ) ) { | |||
| LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); | |||
| return -1; | |||
| } | |||
| #ifndef LAPACK_DISABLE_NAN_CHECK | |||
| if( LAPACKE_get_nancheck() ) { | |||
| /* Reject NaNs in any of the three input matrices */ | |||
| if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { | |||
| return -7; | |||
| } | |||
| if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { | |||
| return -9; | |||
| } | |||
| if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { | |||
| return -11; | |||
| } | |||
| } | |||
| #endif | |||
| /* First pass: workspace query (ld_scratch is still -1) */ | |||
| status = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, | |||
| a, lda, b, ldb, c, ldc, scale, query, | |||
| ld_scratch ); | |||
| if( status == 0 ) { | |||
| /* query[0] is the leading dimension, query[1] the second extent */ | |||
| ld_scratch = query[0]; | |||
| scratch_len = ld_scratch * query[1]; | |||
| scratch = (double*)LAPACKE_malloc( sizeof(double) * scratch_len); | |||
| if( scratch != NULL ) { | |||
| /* Second pass: the actual solve through the middle-level interface */ | |||
| status = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, | |||
| m, n, a, lda, b, ldb, c, ldc, scale, | |||
| scratch, ld_scratch ); | |||
| LAPACKE_free( scratch ); | |||
| } else { | |||
| status = LAPACK_WORK_MEMORY_ERROR; | |||
| } | |||
| } | |||
| if( status == LAPACK_WORK_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_ztrsyl3", status ); | |||
| } | |||
| return status; | |||
| } | |||
| @@ -0,0 +1,88 @@ | |||
| #include "lapacke_utils.h" | |||
| /* | |||
| * LAPACKE_ztrsyl3_work - middle-level wrapper around LAPACK ztrsyl3. | |||
| * | |||
| * Column-major input is forwarded directly. Row-major input is copied into | |||
| * freshly allocated column-major buffers (a: m-by-m, b: n-by-n, c: m-by-n), | |||
| * ztrsyl3 is run on the copies, and c is transposed back into the caller's | |||
| * array. swork/ldswork are passed to ztrsyl3 unchanged in both layouts. | |||
| * | |||
| * Returns -1 for an invalid layout; -8/-10/-12 when lda/ldb/ldc is too small | |||
| * for row-major data; LAPACK_TRANSPOSE_MEMORY_ERROR when a temporary cannot | |||
| * be allocated; otherwise ztrsyl3's info, with negative values decremented | |||
| * by one to account for the leading matrix_layout argument. | |||
| */ | |||
| lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, | |||
| lapack_int isgn, lapack_int m, lapack_int n, | |||
| const lapack_complex_double* a, lapack_int lda, | |||
| const lapack_complex_double* b, lapack_int ldb, | |||
| lapack_complex_double* c, lapack_int ldc, | |||
| double* scale, double* swork, | |||
| lapack_int ldswork ) | |||
| { | |||
| lapack_int info = 0; | |||
| if( matrix_layout == LAPACK_ROW_MAJOR ) { | |||
| /* Leading dimensions of the column-major copies */ | |||
| lapack_int lda_cm = MAX(1,m); | |||
| lapack_int ldb_cm = MAX(1,n); | |||
| lapack_int ldc_cm = MAX(1,m); | |||
| lapack_complex_double* a_cm = NULL; | |||
| lapack_complex_double* b_cm = NULL; | |||
| lapack_complex_double* c_cm = NULL; | |||
| /* Validate the caller's row-major leading dimensions */ | |||
| if( lda < m ) { | |||
| info = -8; | |||
| } else if( ldb < n ) { | |||
| info = -10; | |||
| } else if( ldc < n ) { | |||
| info = -12; | |||
| } | |||
| if( info != 0 ) { | |||
| LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); | |||
| return info; | |||
| } | |||
| /* Acquire the three temporaries; unwind in reverse order on failure */ | |||
| a_cm = (lapack_complex_double*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * lda_cm * MAX(1,m) ); | |||
| if( a_cm == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto report; | |||
| } | |||
| b_cm = (lapack_complex_double*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_cm * MAX(1,n) ); | |||
| if( b_cm == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto release_a; | |||
| } | |||
| c_cm = (lapack_complex_double*) | |||
| LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_cm * MAX(1,n) ); | |||
| if( c_cm == NULL ) { | |||
| info = LAPACK_TRANSPOSE_MEMORY_ERROR; | |||
| goto release_b; | |||
| } | |||
| /* Row-major -> column-major copies of the inputs */ | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, m, a, lda, a_cm, lda_cm ); | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, n, n, b, ldb, b_cm, ldb_cm ); | |||
| LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, n, c, ldc, c_cm, ldc_cm ); | |||
| /* Run ztrsyl3 on the copies; shift argument-error codes by one */ | |||
| LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_cm, &lda_cm, b_cm, | |||
| &ldb_cm, c_cm, &ldc_cm, scale, swork, &ldswork, &info ); | |||
| if( info < 0 ) { | |||
| info -= 1; | |||
| } | |||
| /* Column-major result -> caller's row-major c */ | |||
| LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_cm, ldc_cm, c, ldc ); | |||
| LAPACKE_free( c_cm ); | |||
| release_b: | |||
| LAPACKE_free( b_cm ); | |||
| release_a: | |||
| LAPACKE_free( a_cm ); | |||
| report: | |||
| if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { | |||
| LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); | |||
| } | |||
| } else if( matrix_layout == LAPACK_COL_MAJOR ) { | |||
| /* Native layout: forward directly; shift argument-error codes by one */ | |||
| LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, | |||
| scale, swork, &ldswork, &info ); | |||
| if( info < 0 ) { | |||
| info -= 1; | |||
| } | |||
| } else { | |||
| info = -1; | |||
| LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); | |||
| } | |||
| return info; | |||
| } | |||
| @@ -207,7 +207,7 @@ SLASRC_O = \ | |||
| ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ | |||
| ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ | |||
| ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ | |||
| sgesvdq.o | |||
| sgesvdq.o slarmm.o slatrs3.o strsyl3.o | |||
| endif | |||
| @@ -316,7 +316,7 @@ CLASRC_O = \ | |||
| chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ | |||
| cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ | |||
| chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ | |||
| cgesvdq.o | |||
| cgesvdq.o clatrs3.o ctrsyl3.o | |||
| endif | |||
| ifdef USEXBLAS | |||
| @@ -417,7 +417,7 @@ DLASRC_O = \ | |||
| dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ | |||
| dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ | |||
| dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ | |||
| dgesvdq.o | |||
| dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o | |||
| endif | |||
| ifdef USEXBLAS | |||
| @@ -526,7 +526,7 @@ ZLASRC_O = \ | |||
| zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ | |||
| zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ | |||
| zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ | |||
| zgesvdq.o | |||
| zgesvdq.o zlatrs3.o ztrsyl3.o | |||
| endif | |||
| ifdef USEXBLAS | |||
| @@ -0,0 +1,666 @@ | |||
| *> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. | |||
| * | |||
| * Definition: | |||
| * =========== | |||
| * | |||
| * SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, | |||
| * X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) | |||
| * | |||
| * .. Scalar Arguments .. | |||
| * CHARACTER DIAG, NORMIN, TRANS, UPLO | |||
| * INTEGER INFO, LDA, LWORK, LDX, N, NRHS | |||
| * .. | |||
| * .. Array Arguments .. | |||
| * REAL CNORM( * ), SCALE( * ), WORK( * ) | |||
| * COMPLEX A( LDA, * ), X( LDX, * ) | |||
| * .. | |||
| * | |||
| * | |||
| *> \par Purpose: | |||
| * ============= | |||
| *> | |||
| *> \verbatim | |||
| *> | |||
| *> CLATRS3 solves one of the triangular systems | |||
| *> | |||
| *> A * X = B * diag(scale), A**T * X = B * diag(scale), or | |||
| *> A**H * X = B * diag(scale) | |||
| *> | |||
| *> with scaling to prevent overflow. Here A is an upper or lower | |||
| *> triangular matrix, A**T denotes the transpose of A, A**H denotes the | |||
| *> conjugate transpose of A. X and B are n-by-nrhs matrices and scale | |||
| *> is an nrhs-element vector of scaling factors. A scaling factor scale(j) | |||
| *> is usually less than or equal to 1, chosen such that X(:,j) is less | |||
| *> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 | |||
| *> for some j), then a non-trivial solution to A*X = 0 is returned. If | |||
| *> the system is so badly scaled that the solution cannot be represented | |||
| *> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) = 0 is returned. | |||
| *> | |||
| *> This is a BLAS-3 version of LATRS for solving several right | |||
| *> hand sides simultaneously. | |||
| *> | |||
| *> \endverbatim | |||
| * | |||
| * Arguments: | |||
| * ========== | |||
| * | |||
| *> \param[in] UPLO | |||
| *> \verbatim | |||
| *> UPLO is CHARACTER*1 | |||
| *> Specifies whether the matrix A is upper or lower triangular. | |||
| *> = 'U': Upper triangular | |||
| *> = 'L': Lower triangular | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] TRANS | |||
| *> \verbatim | |||
| *> TRANS is CHARACTER*1 | |||
| *> Specifies the operation applied to A. | |||
| *> = 'N': Solve A * x = s*b (No transpose) | |||
| *> = 'T': Solve A**T* x = s*b (Transpose) | |||
| *> = 'C': Solve A**H* x = s*b (Conjugate transpose) | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] DIAG | |||
| *> \verbatim | |||
| *> DIAG is CHARACTER*1 | |||
| *> Specifies whether or not the matrix A is unit triangular. | |||
| *> = 'N': Non-unit triangular | |||
| *> = 'U': Unit triangular | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] NORMIN | |||
| *> \verbatim | |||
| *> NORMIN is CHARACTER*1 | |||
| *> Specifies whether CNORM has been set or not. | |||
| *> = 'Y': CNORM contains the column norms on entry | |||
| *> = 'N': CNORM is not set on entry. On exit, the norms will | |||
| *> be computed and stored in CNORM. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] N | |||
| *> \verbatim | |||
| *> N is INTEGER | |||
| *> The order of the matrix A. N >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] NRHS | |||
| *> \verbatim | |||
| *> NRHS is INTEGER | |||
| *> The number of columns of X. NRHS >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] A | |||
| *> \verbatim | |||
| *> A is COMPLEX array, dimension (LDA,N) | |||
| *> The triangular matrix A. If UPLO = 'U', the leading n by n | |||
| *> upper triangular part of the array A contains the upper | |||
| *> triangular matrix, and the strictly lower triangular part of | |||
| *> A is not referenced. If UPLO = 'L', the leading n by n lower | |||
| *> triangular part of the array A contains the lower triangular | |||
| *> matrix, and the strictly upper triangular part of A is not | |||
| *> referenced. If DIAG = 'U', the diagonal elements of A are | |||
| *> also not referenced and are assumed to be 1. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LDA | |||
| *> \verbatim | |||
| *> LDA is INTEGER | |||
| *> The leading dimension of the array A. LDA >= max (1,N). | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in,out] X | |||
| *> \verbatim | |||
| *> X is COMPLEX array, dimension (LDX,NRHS) | |||
| *> On entry, the right hand side B of the triangular system. | |||
| *> On exit, X is overwritten by the solution matrix X. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LDX | |||
| *> \verbatim | |||
| *> LDX is INTEGER | |||
| *> The leading dimension of the array X. LDX >= max (1,N). | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[out] SCALE | |||
| *> \verbatim | |||
| *> SCALE is REAL array, dimension (NRHS) | |||
| *> The scaling factor s(k) is for the triangular system | |||
| *> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). | |||
| *> If SCALE = 0, the matrix A is singular or badly scaled. | |||
| *> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) | |||
| *> that is an exact or approximate solution to A*x(:,k) = 0 | |||
| *> is returned. If the system is so badly scaled that the solution | |||
| *> cannot be represented as x(:,k) * 1/s(k), then x(:,k) = 0 | |||
| *> is returned. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in,out] CNORM | |||
| *> \verbatim | |||
| *> CNORM is REAL array, dimension (N) | |||
| *> | |||
| *> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) | |||
| *> contains the norm of the off-diagonal part of the j-th column | |||
| *> of A. If TRANS = 'N', CNORM(j) must be greater than or equal | |||
| *> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) | |||
| *> must be greater than or equal to the 1-norm. | |||
| *> | |||
| *> If NORMIN = 'N', CNORM is an output argument and CNORM(j) | |||
| *> returns the 1-norm of the offdiagonal part of the j-th column | |||
| *> of A. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[out] WORK | |||
| *> \verbatim | |||
| *> WORK is REAL array, dimension (LWORK). | |||
| *> On exit, if INFO = 0, WORK(1) returns the optimal size of | |||
| *> WORK. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LWORK | |||
| *> \verbatim | |||
| *> LWORK is INTEGER | |||
| *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32))), where | |||
| *> NBA = (N + NB - 1)/NB and NB is the optimal block size. | |||
| *> | |||
| *> If LWORK = -1, then a workspace query is assumed; the routine | |||
| *> only calculates the optimal dimensions of the WORK array, returns | |||
| *> this value as the first entry of the WORK array, and no error | |||
| *> message related to LWORK is issued by XERBLA. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[out] INFO | |||
| *> \verbatim | |||
| *> INFO is INTEGER | |||
| *> = 0: successful exit | |||
| *> < 0: if INFO = -k, the k-th argument had an illegal value | |||
| *> \endverbatim | |||
| * | |||
| * Authors: | |||
| * ======== | |||
| * | |||
| *> \author Univ. of Tennessee | |||
| *> \author Univ. of California Berkeley | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup complexOTHERauxiliary | |||
| *> \par Further Details: | |||
| * ===================== | |||
| * \verbatim | |||
| * The algorithm follows the structure of a block triangular solve. | |||
| * The diagonal block is solved with a call to the robust triangular | |||
| * solver LATRS for every right-hand side RHS = 1, ..., NRHS | |||
| * op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), | |||
| * where op( A ) = A or op( A ) = A**T or op( A ) = A**H. | |||
| * The linear block updates operate on block columns of X, | |||
| * B( I, K ) - op(A( I, J )) * X( J, K ) | |||
| * and use GEMM. To avoid overflow in the linear block update, the worst case | |||
| * growth is estimated. For every RHS, a scale factor s <= 1.0 is computed | |||
| * such that | |||
| * || s * B( I, RHS )||_oo | |||
| * + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold | |||
| * | |||
| * Once all columns of a block column have been rescaled (BLAS-1), the linear | |||
| * update is executed with GEMM without overflow. | |||
| * | |||
| * To limit rescaling, local scale factors track the scaling of column segments. | |||
| * There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA | |||
| * per right-hand side column RHS = 1, ..., NRHS. The global scale factor | |||
| * SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) | |||
| * I = 1, ..., NBA. | |||
| * A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) | |||
| * updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The | |||
| * linear update of potentially inconsistently scaled vector segments | |||
| * s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) | |||
| * computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, | |||
| * if necessary, rescales the blocks prior to calling GEMM. | |||
| * | |||
| * \endverbatim | |||
| * ===================================================================== | |||
| * References: | |||
| * C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). | |||
| * Parallel robust solution of triangular linear systems. Concurrency | |||
| * and Computation: Practice and Experience, 31(19), e5064. | |||
| * | |||
| * Contributor: | |||
| * Angelika Schwarz, Umea University, Sweden. | |||
| * | |||
| * ===================================================================== | |||
| SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, | |||
| $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) | |||
| IMPLICIT NONE | |||
| * | |||
| * .. Scalar Arguments .. | |||
| CHARACTER DIAG, TRANS, NORMIN, UPLO | |||
| INTEGER INFO, LDA, LWORK, LDX, N, NRHS | |||
| * .. | |||
| * .. Array Arguments .. | |||
| COMPLEX A( LDA, * ), X( LDX, * ) | |||
| REAL CNORM( * ), SCALE( * ), WORK( * ) | |||
| * .. | |||
| * | |||
| * ===================================================================== | |||
| * | |||
| * .. Parameters .. | |||
| REAL ZERO, ONE | |||
| PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) | |||
| COMPLEX CZERO, CONE | |||
| PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) | |||
| PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) | |||
| INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN | |||
| PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) | |||
| PARAMETER ( NBMIN = 8, NBMAX = 64 ) | |||
| * .. | |||
| * .. Local Arrays .. | |||
| REAL W( NBMAX ), XNRM( NBRHS ) | |||
| * .. | |||
| * .. Local Scalars .. | |||
| LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER | |||
| INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, | |||
| $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, | |||
| $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS | |||
| REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, | |||
| $ SCAMIN, SMLNUM, TMAX | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| REAL SLAMCH, CLANGE, SLARMM | |||
| EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL CLATRS, CSSCAL, XERBLA | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC ABS, MAX, MIN | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| INFO = 0 | |||
| UPPER = LSAME( UPLO, 'U' ) | |||
| NOTRAN = LSAME( TRANS, 'N' ) | |||
| NOUNIT = LSAME( DIAG, 'N' ) | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| * | |||
| * Partition A and X into blocks. | |||
| * | |||
| NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) | |||
| NB = MIN( NBMAX, NB ) | |||
| NBA = MAX( 1, (N + NB - 1) / NB ) | |||
| NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) | |||
| * | |||
| * Compute the workspace | |||
| * | |||
| * The workspace comprises two parts. | |||
| * The first part stores the local scale factors. Each simultaneously | |||
| * computed right-hand side requires one local scale factor per block | |||
| * row. WORK( I + KK * LDS ) is the scale factor of the vector | |||
| * segment associated with the I-th block row and the KK-th vector | |||
| * in the block column. | |||
| LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) | |||
| LDS = NBA | |||
| * The second part stores upper bounds of the triangular A. There are | |||
| * a total of NBA x NBA blocks, of which only the upper triangular | |||
| * part or the lower triangular part is referenced. The upper bound of | |||
| * the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). | |||
| LANRM = NBA * NBA | |||
| AWRK = LSCALE | |||
| WORK( 1 ) = LSCALE + LANRM | |||
| * | |||
| * Test the input parameters. | |||
| * | |||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
| INFO = -1 | |||
| ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. | |||
| $ LSAME( TRANS, 'C' ) ) THEN | |||
| INFO = -2 | |||
| ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN | |||
| INFO = -3 | |||
| ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. | |||
| $ LSAME( NORMIN, 'N' ) ) THEN | |||
| INFO = -4 | |||
| ELSE IF( N.LT.0 ) THEN | |||
| INFO = -5 | |||
| ELSE IF( NRHS.LT.0 ) THEN | |||
| INFO = -6 | |||
| ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
| INFO = -8 | |||
| ELSE IF( LDX.LT.MAX( 1, N ) ) THEN | |||
| INFO = -10 | |||
| ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN | |||
| INFO = -14 | |||
| END IF | |||
| IF( INFO.NE.0 ) THEN | |||
| CALL XERBLA( 'CLATRS3', -INFO ) | |||
| RETURN | |||
| ELSE IF( LQUERY ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Initialize scaling factors | |||
| * | |||
| DO KK = 1, NRHS | |||
| SCALE( KK ) = ONE | |||
| END DO | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( MIN( N, NRHS ).EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Determine machine dependent constant to control overflow. | |||
| * | |||
| BIGNUM = SLAMCH( 'Overflow' ) | |||
| SMLNUM = SLAMCH( 'Safe Minimum' ) | |||
| * | |||
| * Use unblocked code for small problems | |||
| * | |||
| IF( NRHS.LT.NRHSMIN ) THEN | |||
| CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), | |||
| $ SCALE( 1 ), CNORM, INFO ) | |||
| DO K = 2, NRHS | |||
| CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), | |||
| $ SCALE( K ), CNORM, INFO ) | |||
| END DO | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Compute norms of blocks of A excluding diagonal blocks and find | |||
| * the block with the largest norm TMAX. | |||
| * | |||
| TMAX = ZERO | |||
| DO J = 1, NBA | |||
| J1 = (J-1)*NB + 1 | |||
| J2 = MIN( J*NB, N ) + 1 | |||
| IF ( UPPER ) THEN | |||
| IFIRST = 1 | |||
| ILAST = J - 1 | |||
| ELSE | |||
| IFIRST = J + 1 | |||
| ILAST = NBA | |||
| END IF | |||
| DO I = IFIRST, ILAST | |||
| I1 = (I-1)*NB + 1 | |||
| I2 = MIN( I*NB, N ) + 1 | |||
| * | |||
| * Compute upper bound of A( I1:I2-1, J1:J2-1 ). | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) | |||
| WORK( AWRK + I+(J-1)*NBA ) = ANRM | |||
| ELSE | |||
| ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) | |||
| WORK( AWRK + J+(I-1)*NBA ) = ANRM | |||
| END IF | |||
| TMAX = MAX( TMAX, ANRM ) | |||
| END DO | |||
| END DO | |||
| * | |||
| IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN | |||
| * | |||
| * Some matrix entries have huge absolute value. At least one upper | |||
| * bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point | |||
| * number, either due to overflow in LANGE or due to Inf in A. | |||
| * Fall back to LATRS. Set normin = 'N' for every right-hand side to | |||
| * force computation of TSCAL in LATRS to avoid the likely overflow | |||
| * in the computation of the column norms CNORM. | |||
| * | |||
| DO K = 1, NRHS | |||
| CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), | |||
| $ SCALE( K ), CNORM, INFO ) | |||
| END DO | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Every right-hand side requires workspace to store NBA local scale | |||
| * factors. To save workspace, X is computed successively in block columns | |||
| * of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient | |||
| * workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. | |||
| DO K = 1, NBX | |||
| * Loop over block columns (index = K) of X and, for column-wise scalings, | |||
| * over individual columns (index = KK). | |||
| * K1: column index of the first column in X( J, K ) | |||
| * K2: column index of the first column in X( J, K+1 ) | |||
| * so that K2 - K1 is the column count of the block X( J, K ) | |||
| K1 = (K-1)*NBRHS + 1 | |||
| K2 = MIN( K*NBRHS, NRHS ) + 1 | |||
| * | |||
| * Initialize local scaling factors of current block column X( J, K ) | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| DO I = 1, NBA | |||
| WORK( I+KK*LDS ) = ONE | |||
| END DO | |||
| END DO | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| * | |||
| * Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) | |||
| * | |||
| IF( UPPER ) THEN | |||
| JFIRST = NBA | |||
| JLAST = 1 | |||
| JINC = -1 | |||
| ELSE | |||
| JFIRST = 1 | |||
| JLAST = NBA | |||
| JINC = 1 | |||
| END IF | |||
| ELSE | |||
| * | |||
| * Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) | |||
| * where op(A) = A**T or op(A) = A**H | |||
| * | |||
| IF( UPPER ) THEN | |||
| JFIRST = 1 | |||
| JLAST = NBA | |||
| JINC = 1 | |||
| ELSE | |||
| JFIRST = NBA | |||
| JLAST = 1 | |||
| JINC = -1 | |||
| END IF | |||
| END IF | |||
| DO J = JFIRST, JLAST, JINC | |||
| * J1: row index of the first row in A( J, J ) | |||
| * J2: row index of the first row in A( J+1, J+1 ) | |||
| * so that J2 - J1 is the row count of the block A( J, J ) | |||
| J1 = (J-1)*NB + 1 | |||
| J2 = MIN( J*NB, N ) + 1 | |||
| * | |||
| * Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| IF( KK.EQ.1 ) THEN | |||
| CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, | |||
| $ A( J1, J1 ), LDA, X( J1, RHS ), | |||
| $ SCALOC, CNORM, INFO ) | |||
| ELSE | |||
| CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, | |||
| $ A( J1, J1 ), LDA, X( J1, RHS ), | |||
| $ SCALOC, CNORM, INFO ) | |||
| END IF | |||
| * Find largest absolute value entry in the vector segment | |||
| * X( J1:J2-1, RHS ) as an upper bound for the worst case | |||
| * growth in the linear updates. | |||
| XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), | |||
| $ LDX, W ) | |||
| * | |||
| IF( SCALOC .EQ. ZERO ) THEN | |||
| * LATRS found that A is singular through A(j,j) = 0. | |||
| * Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 | |||
| * and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is | |||
| * set by LATRS. | |||
| SCALE( RHS ) = ZERO | |||
| DO II = 1, J1-1 | |||
| X( II, KK ) = CZERO | |||
| END DO | |||
| DO II = J2, N | |||
| X( II, KK ) = CZERO | |||
| END DO | |||
| * Discard the local scale factors. | |||
| DO II = 1, NBA | |||
| WORK( II+KK*LDS ) = ONE | |||
| END DO | |||
| SCALOC = ONE | |||
| ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN | |||
| * LATRS computed a valid scale factor, but combined with | |||
| * the current scaling the solution does not have a | |||
| * scale factor > 0. | |||
| * | |||
| * Set WORK( J+KK*LDS ) to smallest valid scale | |||
| * factor and increase SCALOC accordingly. | |||
| SCAL = WORK( J+KK*LDS ) / SMLNUM | |||
| SCALOC = SCALOC * SCAL | |||
| WORK( J+KK*LDS ) = SMLNUM | |||
| * If LATRS overestimated the growth, x may be | |||
| * rescaled to preserve a valid combined scale | |||
| * factor WORK( J, KK ) > 0. | |||
| RSCAL = ONE / SCALOC | |||
| IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN | |||
| XNRM( KK ) = XNRM( KK ) * RSCAL | |||
| CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) | |||
| SCALOC = ONE | |||
| ELSE | |||
| * The system op(A) * x = b is badly scaled and its | |||
| * solution cannot be represented as (1/scale) * x. | |||
| * Set x to zero. This approach deviates from LATRS | |||
| * where a completely meaningless non-zero vector | |||
| * is returned that is not a solution to op(A) * x = b. | |||
| SCALE( RHS ) = ZERO | |||
| DO II = 1, N | |||
| X( II, KK ) = CZERO | |||
| END DO | |||
| * Discard the local scale factors. | |||
| DO II = 1, NBA | |||
| WORK( II+KK*LDS ) = ONE | |||
| END DO | |||
| SCALOC = ONE | |||
| END IF | |||
| END IF | |||
| SCALOC = SCALOC * WORK( J+KK*LDS ) | |||
| WORK( J+KK*LDS ) = SCALOC | |||
| END DO | |||
| * | |||
| * Linear block updates | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| IF( UPPER ) THEN | |||
| IFIRST = J - 1 | |||
| ILAST = 1 | |||
| IINC = -1 | |||
| ELSE | |||
| IFIRST = J + 1 | |||
| ILAST = NBA | |||
| IINC = 1 | |||
| END IF | |||
| ELSE | |||
| IF( UPPER ) THEN | |||
| IFIRST = J + 1 | |||
| ILAST = NBA | |||
| IINC = 1 | |||
| ELSE | |||
| IFIRST = J - 1 | |||
| ILAST = 1 | |||
| IINC = -1 | |||
| END IF | |||
| END IF | |||
| * | |||
| DO I = IFIRST, ILAST, IINC | |||
| * I1: row index of the first row in X( I, K ) | |||
| * I2: row index of the first row in X( I+1, K ) | |||
| * so that I2 - I1 is the row count of the block X( I, K ) | |||
| I1 = (I-1)*NB + 1 | |||
| I2 = MIN( I*NB, N ) + 1 | |||
| * | |||
| * Prepare the linear update to be executed with GEMM. | |||
| * For each column, compute a consistent scaling, a | |||
| * scaling factor to survive the linear update, and | |||
| * rescale the column segments, if necessary. Then | |||
| * the linear update is safely executed. | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| * Compute consistent scaling | |||
| SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) | |||
| * | |||
| * Compute scaling factor to survive the linear update | |||
| * simulating consistent scaling. | |||
| * | |||
| BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) | |||
| BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) | |||
| XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) | |||
| ANRM = WORK( AWRK + I+(J-1)*NBA ) | |||
| SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) | |||
| * | |||
| * Simultaneously apply the robust update factor and the | |||
| * consistency scaling factor to X( I, KK ) and X( J, KK ). | |||
| * | |||
| SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC | |||
| IF( SCAL.NE.ONE ) THEN | |||
| CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) | |||
| WORK( I+KK*LDS ) = SCAMIN*SCALOC | |||
| END IF | |||
| * | |||
| SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC | |||
| IF( SCAL.NE.ONE ) THEN | |||
| CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) | |||
| WORK( J+KK*LDS ) = SCAMIN*SCALOC | |||
| END IF | |||
| END DO | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| * | |||
| * B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) | |||
| * | |||
| CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, | |||
| $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, | |||
| $ CONE, X( I1, K1 ), LDX ) | |||
| ELSE IF( LSAME( TRANS, 'T' ) ) THEN | |||
| * | |||
| * B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) | |||
| * | |||
| CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, | |||
| $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, | |||
| $ CONE, X( I1, K1 ), LDX ) | |||
| ELSE | |||
| * | |||
| * B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) | |||
| * | |||
| CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, | |||
| $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, | |||
| $ CONE, X( I1, K1 ), LDX ) | |||
| END IF | |||
| END DO | |||
| END DO | |||
| * | |||
| * Reduce local scaling factors | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| DO I = 1, NBA | |||
| SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) | |||
| END DO | |||
| END DO | |||
| * | |||
| * Realize consistent scaling | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN | |||
| DO I = 1, NBA | |||
| I1 = (I-1)*NB + 1 | |||
| I2 = MIN( I*NB, N ) + 1 | |||
| SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) | |||
| IF( SCAL.NE.ONE ) | |||
| $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) | |||
| END DO | |||
| END IF | |||
| END DO | |||
| END DO | |||
| RETURN | |||
| * | |||
| * End of CLATRS3 | |||
| * | |||
| END | |||
| @@ -0,0 +1,605 @@ | |||
| #include <math.h> | |||
| #include <stdlib.h> | |||
| #include <string.h> | |||
| #include <stdio.h> | |||
| #include <complex.h> | |||
| #ifdef complex | |||
| #undef complex | |||
| #endif | |||
| #ifdef I | |||
| #undef I | |||
| #endif | |||
| #if defined(_WIN64) | |||
| typedef long long BLASLONG; | |||
| typedef unsigned long long BLASULONG; | |||
| #else | |||
| typedef long BLASLONG; | |||
| typedef unsigned long BLASULONG; | |||
| #endif | |||
| #ifdef LAPACK_ILP64 | |||
| typedef BLASLONG blasint; | |||
| #if defined(_WIN64) | |||
| #define blasabs(x) llabs(x) | |||
| #else | |||
| #define blasabs(x) labs(x) | |||
| #endif | |||
| #else | |||
| typedef int blasint; | |||
| #define blasabs(x) abs(x) | |||
| #endif | |||
| typedef blasint integer; | |||
| typedef unsigned int uinteger; | |||
| typedef char *address; | |||
| typedef short int shortint; | |||
| typedef float real; | |||
| typedef double doublereal; | |||
| typedef struct { real r, i; } complex; | |||
| typedef struct { doublereal r, i; } doublecomplex; | |||
| #ifdef _MSC_VER | |||
| static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} | |||
| static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} | |||
| static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} | |||
| static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} | |||
| #else | |||
| static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} | |||
| static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} | |||
| static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} | |||
| static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} | |||
| #endif | |||
| #define pCf(z) (*_pCf(z)) | |||
| #define pCd(z) (*_pCd(z)) | |||
| typedef int logical; | |||
| typedef short int shortlogical; | |||
| typedef char logical1; | |||
| typedef char integer1; | |||
| #define TRUE_ (1) | |||
| #define FALSE_ (0) | |||
| /* Extern is for use with -E */ | |||
| #ifndef Extern | |||
| #define Extern extern | |||
| #endif | |||
| /* I/O stuff */ | |||
| typedef int flag; | |||
| typedef int ftnlen; | |||
| typedef int ftnint; | |||
| /*external read, write*/ | |||
| typedef struct | |||
| { flag cierr; | |||
| ftnint ciunit; | |||
| flag ciend; | |||
| char *cifmt; | |||
| ftnint cirec; | |||
| } cilist; | |||
| /*internal read, write*/ | |||
| typedef struct | |||
| { flag icierr; | |||
| char *iciunit; | |||
| flag iciend; | |||
| char *icifmt; | |||
| ftnint icirlen; | |||
| ftnint icirnum; | |||
| } icilist; | |||
| /*open*/ | |||
| typedef struct | |||
| { flag oerr; | |||
| ftnint ounit; | |||
| char *ofnm; | |||
| ftnlen ofnmlen; | |||
| char *osta; | |||
| char *oacc; | |||
| char *ofm; | |||
| ftnint orl; | |||
| char *oblnk; | |||
| } olist; | |||
| /*close*/ | |||
| typedef struct | |||
| { flag cerr; | |||
| ftnint cunit; | |||
| char *csta; | |||
| } cllist; | |||
| /*rewind, backspace, endfile*/ | |||
| typedef struct | |||
| { flag aerr; | |||
| ftnint aunit; | |||
| } alist; | |||
| /* inquire */ | |||
| typedef struct | |||
| { flag inerr; | |||
| ftnint inunit; | |||
| char *infile; | |||
| ftnlen infilen; | |||
| ftnint *inex; /*parameters in standard's order*/ | |||
| ftnint *inopen; | |||
| ftnint *innum; | |||
| ftnint *innamed; | |||
| char *inname; | |||
| ftnlen innamlen; | |||
| char *inacc; | |||
| ftnlen inacclen; | |||
| char *inseq; | |||
| ftnlen inseqlen; | |||
| char *indir; | |||
| ftnlen indirlen; | |||
| char *infmt; | |||
| ftnlen infmtlen; | |||
| char *inform; | |||
| ftnint informlen; | |||
| char *inunf; | |||
| ftnlen inunflen; | |||
| ftnint *inrecl; | |||
| ftnint *innrec; | |||
| char *inblank; | |||
| ftnlen inblanklen; | |||
| } inlist; | |||
| #define VOID void | |||
| union Multitype { /* for multiple entry points */ | |||
| integer1 g; | |||
| shortint h; | |||
| integer i; | |||
| /* longint j; */ | |||
| real r; | |||
| doublereal d; | |||
| complex c; | |||
| doublecomplex z; | |||
| }; | |||
| typedef union Multitype Multitype; | |||
| struct Vardesc { /* for Namelist */ | |||
| char *name; | |||
| char *addr; | |||
| ftnlen *dims; | |||
| int type; | |||
| }; | |||
| typedef struct Vardesc Vardesc; | |||
| struct Namelist { | |||
| char *name; | |||
| Vardesc **vars; | |||
| int nvars; | |||
| }; | |||
| typedef struct Namelist Namelist; | |||
| #define abs(x) ((x) >= 0 ? (x) : -(x)) | |||
| #define dabs(x) (fabs(x)) | |||
| #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) | |||
| #define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) | |||
| #define dmin(a,b) (f2cmin(a,b)) | |||
| #define dmax(a,b) (f2cmax(a,b)) | |||
| #define bit_test(a,b) ((a) >> (b) & 1) | |||
| #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) | |||
| #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) | |||
| #define abort_() { sig_die("Fortran abort routine called", 1); } | |||
| #define c_abs(z) (cabsf(Cf(z))) | |||
| #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } | |||
| #ifdef _MSC_VER | |||
| #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} | |||
| #define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} | |||
| #else | |||
| #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} | |||
| #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} | |||
| #endif | |||
| #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} | |||
| #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} | |||
| #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} | |||
| //#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} | |||
| #define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} | |||
| #define d_abs(x) (fabs(*(x))) | |||
| #define d_acos(x) (acos(*(x))) | |||
| #define d_asin(x) (asin(*(x))) | |||
| #define d_atan(x) (atan(*(x))) | |||
| #define d_atn2(x, y) (atan2(*(x),*(y))) | |||
| #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } | |||
| #define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } | |||
| #define d_cos(x) (cos(*(x))) | |||
| #define d_cosh(x) (cosh(*(x))) | |||
| #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) | |||
| #define d_exp(x) (exp(*(x))) | |||
| #define d_imag(z) (cimag(Cd(z))) | |||
| #define r_imag(z) (cimagf(Cf(z))) | |||
| #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) | |||
| #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) | |||
| #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) | |||
| #define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) | |||
| #define d_log(x) (log(*(x))) | |||
| #define d_mod(x, y) (fmod(*(x), *(y))) | |||
| #define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) | |||
| #define d_nint(x) u_nint(*(x)) | |||
| #define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) | |||
| #define d_sign(a,b) u_sign(*(a),*(b)) | |||
| #define r_sign(a,b) u_sign(*(a),*(b)) | |||
| #define d_sin(x) (sin(*(x))) | |||
| #define d_sinh(x) (sinh(*(x))) | |||
| #define d_sqrt(x) (sqrt(*(x))) | |||
| #define d_tan(x) (tan(*(x))) | |||
| #define d_tanh(x) (tanh(*(x))) | |||
| #define i_abs(x) abs(*(x)) | |||
| #define i_dnnt(x) ((integer)u_nint(*(x))) | |||
| #define i_len(s, n) (n) | |||
| #define i_nint(x) ((integer)u_nint(*(x))) | |||
| #define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) | |||
| #define pow_dd(ap, bp) ( pow(*(ap), *(bp))) | |||
| #define pow_si(B,E) spow_ui(*(B),*(E)) | |||
| #define pow_ri(B,E) spow_ui(*(B),*(E)) | |||
| #define pow_di(B,E) dpow_ui(*(B),*(E)) | |||
| #define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} | |||
| #define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} | |||
| #define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} | |||
| #define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } | |||
| #define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) | |||
| #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } | |||
| #define sig_die(s, kill) { exit(1); } | |||
| #define s_stop(s, n) {exit(0);} | |||
| static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; | |||
| #define z_abs(z) (cabs(Cd(z))) | |||
| #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} | |||
| #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} | |||
| #define myexit_() break; | |||
| #define mycycle_() continue; | |||
| #define myceiling_(w) {ceil(w)} | |||
| #define myhuge_(w) {HUGE_VAL} | |||
| //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} | |||
| #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) | |||
| #define myexp_(w) my_expfunc(w) | |||
| static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} | |||
| /* procedure parameter types for -A and -C++ */ | |||
| #define F2C_proc_par_types 1 | |||
| #ifdef __cplusplus | |||
| typedef logical (*L_fp)(...); | |||
| #else | |||
| typedef logical (*L_fp)(); | |||
| #endif | |||
| /* Single-precision x**n for an integer exponent n, by repeated squaring. */ | |||
| static float spow_ui(float x, integer n) { | |||
|     float result = 1.0; | |||
|     unsigned long int bits; | |||
|     if (n == 0) | |||
|         return result; | |||
|     if (n < 0) { | |||
|         /* negative exponent: continue with the reciprocal base */ | |||
|         n = -n; | |||
|         x = 1/x; | |||
|     } | |||
|     bits = n; | |||
|     do { | |||
|         /* multiply in the current power of x when the low exponent bit is set */ | |||
|         if (bits & 01) | |||
|             result *= x; | |||
|         bits >>= 1; | |||
|         if (bits) | |||
|             x *= x; | |||
|     } while (bits); | |||
|     return result; | |||
| } | |||
| /* Double-precision x**n for an integer exponent n, by repeated squaring. */ | |||
| static double dpow_ui(double x, integer n) { | |||
|     double result = 1.0; | |||
|     unsigned long int bits; | |||
|     if (n == 0) | |||
|         return result; | |||
|     if (n < 0) { | |||
|         /* negative exponent: continue with the reciprocal base */ | |||
|         n = -n; | |||
|         x = 1/x; | |||
|     } | |||
|     bits = n; | |||
|     do { | |||
|         /* multiply in the current power of x when the low exponent bit is set */ | |||
|         if (bits & 01) | |||
|             result *= x; | |||
|         bits >>= 1; | |||
|         if (bits) | |||
|             x *= x; | |||
|     } while (bits); | |||
|     return result; | |||
| } | |||
| /* Single-precision complex x**n for an integer exponent n, by repeated squaring. */ | |||
| #ifdef _MSC_VER | |||
| /* MSVC variant: no arithmetic operators exist for its complex structs, so the */ | |||
| /* complex reciprocal, square and product are expanded on the (r, i) fields. */ | |||
| static _Fcomplex cpow_ui(complex x, integer n) { | |||
|     complex acc={1.0,0.0}; unsigned long int u; | |||
|     real t; | |||
|     if(n != 0) { | |||
|         if(n < 0) { | |||
|             /* 1/x = conj(x) / (x.r^2 + x.i^2) */ | |||
|             t = x.r*x.r + x.i*x.i; | |||
|             n = -n; x.r = x.r/t; x.i = -x.i/t; | |||
|         } | |||
|         for(u = n; ; ) { | |||
|             if(u & 01) { | |||
|                 /* acc *= x */ | |||
|                 t = acc.r*x.r - acc.i*x.i; | |||
|                 acc.i = acc.r*x.i + acc.i*x.r; | |||
|                 acc.r = t; | |||
|             } | |||
|             if(u >>= 1) { | |||
|                 /* x *= x */ | |||
|                 t = x.r*x.r - x.i*x.i; | |||
|                 x.i = 2*x.r*x.i; | |||
|                 x.r = t; | |||
|             } | |||
|             else break; | |||
|         } | |||
|     } | |||
|     _Fcomplex p={acc.r, acc.i}; | |||
|     return p; | |||
| } | |||
| #else | |||
| static _Complex float cpow_ui(_Complex float x, integer n) { | |||
|     _Complex float pow=1.0; unsigned long int u; | |||
|     if(n != 0) { | |||
|         if(n < 0) n = -n, x = 1/x; | |||
|         for(u = n; ; ) { | |||
|             if(u & 01) pow *= x; | |||
|             if(u >>= 1) x *= x; | |||
|             else break; | |||
|         } | |||
|     } | |||
|     return pow; | |||
| } | |||
| #endif | |||
| /* Double-precision complex x**n for an integer exponent n, by repeated squaring. */ | |||
| #ifdef _MSC_VER | |||
| /* MSVC variant: no arithmetic operators exist for _Dcomplex, so the complex */ | |||
| /* reciprocal, square and product are expanded on _Val[0] (re) and _Val[1] (im). */ | |||
| static _Dcomplex zpow_ui(_Dcomplex x, integer n) { | |||
|     _Dcomplex acc={1.0,0.0}; unsigned long int u; | |||
|     double t; | |||
|     if(n != 0) { | |||
|         if(n < 0) { | |||
|             /* 1/x = conj(x) / (re^2 + im^2) */ | |||
|             t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1]; | |||
|             n = -n; x._Val[0] = x._Val[0]/t; x._Val[1] = -x._Val[1]/t; | |||
|         } | |||
|         for(u = n; ; ) { | |||
|             if(u & 01) { | |||
|                 /* acc *= x */ | |||
|                 t = acc._Val[0]*x._Val[0] - acc._Val[1]*x._Val[1]; | |||
|                 acc._Val[1] = acc._Val[0]*x._Val[1] + acc._Val[1]*x._Val[0]; | |||
|                 acc._Val[0] = t; | |||
|             } | |||
|             if(u >>= 1) { | |||
|                 /* x *= x */ | |||
|                 t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1]; | |||
|                 x._Val[1] = 2*x._Val[0]*x._Val[1]; | |||
|                 x._Val[0] = t; | |||
|             } | |||
|             else break; | |||
|         } | |||
|     } | |||
|     _Dcomplex p = {acc._Val[0], acc._Val[1]}; | |||
|     return p; | |||
| } | |||
| #else | |||
| static _Complex double zpow_ui(_Complex double x, integer n) { | |||
|     _Complex double pow=1.0; unsigned long int u; | |||
|     if(n != 0) { | |||
|         if(n < 0) n = -n, x = 1/x; | |||
|         for(u = n; ; ) { | |||
|             if(u & 01) pow *= x; | |||
|             if(u >>= 1) x *= x; | |||
|             else break; | |||
|         } | |||
|     } | |||
|     return pow; | |||
| } | |||
| #endif | |||
| /* Integer power x**n computed in integer arithmetic. */ | |||
| static integer pow_ii(integer x, integer n) { | |||
|     integer result; | |||
|     unsigned long int bits; | |||
|     if (n <= 0) { | |||
|         if (n == 0 || x == 1) | |||
|             return 1; | |||
|         /* negative exponent, x not 1 and not -1: the result is 0, */ | |||
|         /* except that x == 0 evaluates 1/x (an integer division by zero) */ | |||
|         if (x != -1) | |||
|             return x == 0 ? 1/x : 0; | |||
|         /* x == -1: continue with the positive exponent */ | |||
|         n = -n; | |||
|     } | |||
|     result = 1; | |||
|     bits = n; | |||
|     for (;;) { | |||
|         if (bits & 01) | |||
|             result *= x; | |||
|         bits >>= 1; | |||
|         if (!bits) | |||
|             break; | |||
|         x *= x; | |||
|     } | |||
|     return result; | |||
| } | |||
| /* Position (1-based, relative to s) of the first largest entry among w[s-1] .. w[e-1]. */ | |||
| /* The argument n is not used. */ | |||
| static integer dmaxloc_(double *w, integer s, integer e, integer *n) | |||
| { | |||
|     integer best = s; | |||
|     integer k; | |||
|     double top = w[s-1]; | |||
|     for (k = s + 1; k <= e; k++) { | |||
|         /* strict comparison keeps the earliest position on ties */ | |||
|         if (w[k-1] > top) { | |||
|             best = k; | |||
|             top = w[k-1]; | |||
|         } | |||
|     } | |||
|     return best - s + 1; | |||
| } | |||
| /* Position (1-based, relative to s) of the first largest entry among w[s-1] .. w[e-1]. */ | |||
| /* The argument n is not used. */ | |||
| static integer smaxloc_(float *w, integer s, integer e, integer *n) | |||
| { | |||
|     integer best = s; | |||
|     integer k; | |||
|     float top = w[s-1]; | |||
|     for (k = s + 1; k <= e; k++) { | |||
|         /* strict comparison keeps the earliest position on ties */ | |||
|         if (w[k-1] > top) { | |||
|             best = k; | |||
|             top = w[k-1]; | |||
|         } | |||
|     } | |||
|     return best - s + 1; | |||
| } | |||
| /* Conjugated dot product: *z = sum over i in [0, n) of conj(x[i*incx]) * y[i*incy]. */ | |||
| /* Elements are addressed as x[i*incx] and y[i*incy] counting up from index 0. */ | |||
| static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { | |||
|     integer n = *n_, incx = *incx_, incy = *incy_, i; | |||
| #ifdef _MSC_VER | |||
|     /* No operators exist for _Fcomplex, so the product is expanded by hand: */ | |||
|     /* conj(a)*b = (a.r*b.r + a.i*b.i) + (a.r*b.i - a.i*b.r)*I */ | |||
|     _Fcomplex zdotc = {0.0, 0.0}; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */ | |||
|             zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i; | |||
|             zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r; | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */ | |||
|             zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i; | |||
|             zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r; | |||
|         } | |||
|     } | |||
|     pCf(z) = zdotc; | |||
| } | |||
| #else | |||
|     _Complex float zdotc = 0.0; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */ | |||
|             zdotc += conjf(Cf(&x[i])) * Cf(&y[i]); | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */ | |||
|             zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]); | |||
|         } | |||
|     } | |||
|     pCf(z) = zdotc; | |||
| } | |||
| #endif | |||
| /* Conjugated dot product: *z = sum over i in [0, n) of conj(x[i*incx]) * y[i*incy]. */ | |||
| /* Elements are addressed as x[i*incx] and y[i*incy] counting up from index 0. */ | |||
| static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { | |||
|     integer n = *n_, incx = *incx_, incy = *incy_, i; | |||
| #ifdef _MSC_VER | |||
|     /* No operators exist for _Dcomplex, so the product is expanded by hand: */ | |||
|     /* conj(a)*b = (a.r*b.r + a.i*b.i) + (a.r*b.i - a.i*b.r)*I */ | |||
|     _Dcomplex zdotc = {0.0, 0.0}; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ | |||
|             zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i; | |||
|             zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r; | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ | |||
|             zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i; | |||
|             zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r; | |||
|         } | |||
|     } | |||
|     pCd(z) = zdotc; | |||
| } | |||
| #else | |||
|     _Complex double zdotc = 0.0; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ | |||
|             zdotc += conj(Cd(&x[i])) * Cd(&y[i]); | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */ | |||
|             zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]); | |||
|         } | |||
|     } | |||
|     pCd(z) = zdotc; | |||
| } | |||
| #endif | |||
| /* Unconjugated dot product: *z = sum over i in [0, n) of x[i*incx] * y[i*incy]. */ | |||
| /* Elements are addressed as x[i*incx] and y[i*incy] counting up from index 0. */ | |||
| static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { | |||
|     integer n = *n_, incx = *incx_, incy = *incy_, i; | |||
| #ifdef _MSC_VER | |||
|     /* No operators exist for _Fcomplex, so the product is expanded by hand: */ | |||
|     /* a*b = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*I */ | |||
|     _Fcomplex zdotc = {0.0, 0.0}; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i; | |||
|             zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r; | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i; | |||
|             zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r; | |||
|         } | |||
|     } | |||
|     pCf(z) = zdotc; | |||
| } | |||
| #else | |||
|     _Complex float zdotc = 0.0; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc += Cf(&x[i]) * Cf(&y[i]); | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]); | |||
|         } | |||
|     } | |||
|     pCf(z) = zdotc; | |||
| } | |||
| #endif | |||
| /* Unconjugated dot product: *z = sum over i in [0, n) of x[i*incx] * y[i*incy]. */ | |||
| /* Elements are addressed as x[i*incx] and y[i*incy] counting up from index 0. */ | |||
| static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { | |||
|     integer n = *n_, incx = *incx_, incy = *incy_, i; | |||
| #ifdef _MSC_VER | |||
|     /* No operators exist for _Dcomplex, so the product is expanded by hand: */ | |||
|     /* a*b = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*I */ | |||
|     _Dcomplex zdotc = {0.0, 0.0}; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i; | |||
|             zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r; | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i; | |||
|             zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r; | |||
|         } | |||
|     } | |||
|     pCd(z) = zdotc; | |||
| } | |||
| #else | |||
|     _Complex double zdotc = 0.0; | |||
|     if (incx == 1 && incy == 1) { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc += Cd(&x[i]) * Cd(&y[i]); | |||
|         } | |||
|     } else { | |||
|         for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */ | |||
|             zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]); | |||
|         } | |||
|     } | |||
|     pCd(z) = zdotc; | |||
| } | |||
| #endif | |||
| /* -- translated by f2c (version 20000121). | |||
| You must link the resulting object file with the libraries: | |||
| -lf2c -lm (in that order) | |||
| */ | |||
| /* > \brief \b DLARMM */ | |||
| /* Definition: */ | |||
| /* =========== */ | |||
| /* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) */ | |||
| /* DOUBLE PRECISION ANORM, BNORM, CNORM */ | |||
| /* > \par Purpose: */ | |||
| /* ======= */ | |||
| /* > */ | |||
| /* > \verbatim */ | |||
| /* > */ | |||
| /* > DLARMM returns a factor s in (0, 1] such that the linear updates */ | |||
| /* > */ | |||
| /* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ | |||
| /* > */ | |||
| /* > cannot overflow, where A, B, and C are matrices of conforming */ | |||
| /* > dimensions. */ | |||
| /* > */ | |||
| /* > This is an auxiliary routine so there is no argument checking. */ | |||
| /* > \endverbatim */ | |||
| /* Arguments: */ | |||
| /* ========= */ | |||
| /* > \param[in] ANORM */ | |||
| /* > \verbatim */ | |||
| /* > ANORM is DOUBLE PRECISION */ | |||
| /* > The infinity norm of A. ANORM >= 0. */ | |||
| /* > \endverbatim */ | |||
| /* > */ | |||
| /* > \param[in] BNORM */ | |||
| /* > \verbatim */ | |||
| /* > BNORM is DOUBLE PRECISION */ | |||
| /* > The infinity norm of B. BNORM >= 0. */ | |||
| /* > \endverbatim */ | |||
| /* > */ | |||
| /* > \param[in] CNORM */ | |||
| /* > \verbatim */ | |||
| /* > CNORM is DOUBLE PRECISION */ | |||
| /* > The infinity norm of C. CNORM >= 0. */ | |||
| /* > \endverbatim */ | |||
| /* > */ | |||
| /* > */ | |||
| /* ===================================================================== */ | |||
| /* > References: */ | |||
| /* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ | |||
| /* > Robust Solution of Triangular Linear Systems. In: International */ | |||
| /* > Conference on Parallel Processing and Applied Mathematics, pages */ | |||
| /* > 68--78. Springer, 2017. */ | |||
| /* > */ | |||
| /* > \ingroup OTHERauxiliary */ | |||
| /* ===================================================================== */ | |||
| /* Returns a factor s (1, 0.5 or 0.5 / *bnorm) intended to keep the update */ | |||
| /* (s*C) - A*(s*B) from overflowing, given the norms *anorm, *bnorm and *cnorm. */ | |||
| doublereal dlarmm_(doublereal *anorm, doublereal *bnorm, doublereal *cnorm) | |||
| { | |||
|     extern doublereal dlamch_(char *); | |||
|     doublereal smlnum, bignum, headroom; | |||
|     /* Machine dependent thresholds used to control overflow. */ | |||
|     smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); | |||
|     bignum = 1. / smlnum / 4.; | |||
|     /* Magnitude still available below bignum once C is accounted for. */ | |||
|     headroom = bignum - *cnorm; | |||
|     if (*bnorm <= 1.) { | |||
|         /* The product of the norms can be formed directly. */ | |||
|         return (*anorm * *bnorm > headroom) ? .5 : 1.; | |||
|     } | |||
|     /* Large B: divide instead of multiplying the norms. */ | |||
|     return (*anorm > headroom / *bnorm) ? .5 / *bnorm : 1.; | |||
|     /* ==== End of DLARMM ==== */ | |||
| } /* dlarmm_ */ | |||
| @@ -0,0 +1,99 @@ | |||
| *> \brief \b DLARMM | |||
| * | |||
| * Definition: | |||
| * =========== | |||
| * | |||
| * DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) | |||
| * | |||
| * .. Scalar Arguments .. | |||
| * DOUBLE PRECISION ANORM, BNORM, CNORM | |||
| * .. | |||
| * | |||
| *> \par Purpose: | |||
| * ======= | |||
| *> | |||
| *> \verbatim | |||
| *> | |||
| *> DLARMM returns a factor s in (0, 1] such that the linear updates | |||
| *> | |||
| *> (s * C) - A * (s * B) and (s * C) - (s * A) * B | |||
| *> | |||
| *> cannot overflow, where A, B, and C are matrices of conforming | |||
| *> dimensions. | |||
| *> | |||
| *> This is an auxiliary routine so there is no argument checking. | |||
| *> \endverbatim | |||
| * | |||
| * Arguments: | |||
| * ========= | |||
| * | |||
| *> \param[in] ANORM | |||
| *> \verbatim | |||
| *> ANORM is DOUBLE PRECISION | |||
| *> The infinity norm of A. ANORM >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] BNORM | |||
| *> \verbatim | |||
| *> BNORM is DOUBLE PRECISION | |||
| *> The infinity norm of B. BNORM >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] CNORM | |||
| *> \verbatim | |||
| *> CNORM is DOUBLE PRECISION | |||
| *> The infinity norm of C. CNORM >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> | |||
| * ===================================================================== | |||
| *> References: | |||
| *> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for | |||
| *> Robust Solution of Triangular Linear Systems. In: International | |||
| *> Conference on Parallel Processing and Applied Mathematics, pages | |||
| *> 68--78. Springer, 2017. | |||
| *> | |||
| *> \ingroup OTHERauxiliary | |||
| * ===================================================================== | |||
|       DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) | |||
|       IMPLICIT NONE | |||
| *     .. Scalar Arguments .. | |||
|       DOUBLE PRECISION   ANORM, BNORM, CNORM | |||
| *     .. Parameters .. | |||
|       DOUBLE PRECISION   ONE, HALF, FOUR | |||
|       PARAMETER          ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) | |||
| *     .. | |||
| *     .. Local Scalars .. | |||
|       DOUBLE PRECISION   BIGNUM, SMLNUM, ROOM | |||
| *     .. | |||
| *     .. External Functions .. | |||
|       DOUBLE PRECISION   DLAMCH | |||
|       EXTERNAL           DLAMCH | |||
| *     .. | |||
| *     .. Executable Statements .. | |||
| * | |||
| *     Machine dependent thresholds used to control overflow. | |||
| * | |||
|       SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) | |||
|       BIGNUM = ( ONE / SMLNUM ) / FOUR | |||
| * | |||
| *     ROOM is the magnitude left below BIGNUM once C is accounted for. | |||
| * | |||
|       ROOM = BIGNUM - CNORM | |||
| * | |||
| *     The factor stays ONE unless the estimated update exceeds ROOM. | |||
| * | |||
|       DLARMM = ONE | |||
|       IF( BNORM .LE. ONE ) THEN | |||
|          IF( ANORM * BNORM .GT. ROOM ) DLARMM = HALF | |||
|       ELSE IF( ANORM .GT. ROOM / BNORM ) THEN | |||
|          DLARMM = HALF / BNORM | |||
|       END IF | |||
|       RETURN | |||
| * | |||
| *     ==== End of DLARMM ==== | |||
| * | |||
|       END | |||
| @@ -0,0 +1,656 @@ | |||
| *> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. | |||
| * | |||
| * Definition: | |||
| * =========== | |||
| * | |||
| * SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, | |||
| * X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) | |||
| * | |||
| * .. Scalar Arguments .. | |||
| * CHARACTER DIAG, NORMIN, TRANS, UPLO | |||
| * INTEGER INFO, LDA, LWORK, LDX, N, NRHS | |||
| * .. | |||
| * .. Array Arguments .. | |||
| * DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), | |||
| * WORK( * ), X( LDX, * ) | |||
| * .. | |||
| * | |||
| * | |||
| *> \par Purpose: | |||
| * ============= | |||
| *> | |||
| *> \verbatim | |||
| *> | |||
| *> DLATRS3 solves one of the triangular systems | |||
| *> | |||
| *> A * X = B * diag(scale) or A**T * X = B * diag(scale) | |||
| *> | |||
| *> with scaling to prevent overflow. Here A is an upper or lower | |||
| *> triangular matrix, A**T denotes the transpose of A. X and B are | |||
| *> n by nrhs matrices and scale is an nrhs element vector of scaling | |||
| *> factors. A scaling factor scale(j) is usually less than or equal | |||
| *> to 1, chosen such that X(:,j) is less than the overflow threshold. | |||
| *> If the matrix A is singular (A(j,j) = 0 for some j), then | |||
| *> a non-trivial solution to A*X = 0 is returned. If the system is | |||
| *> so badly scaled that the solution cannot be represented as | |||
| *> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) = 0 are returned. | |||
| *> | |||
| *> This is a BLAS-3 version of LATRS for solving several right | |||
| *> hand sides simultaneously. | |||
| *> | |||
| *> \endverbatim | |||
| * | |||
| * Arguments: | |||
| * ========== | |||
| * | |||
| *> \param[in] UPLO | |||
| *> \verbatim | |||
| *> UPLO is CHARACTER*1 | |||
| *> Specifies whether the matrix A is upper or lower triangular. | |||
| *> = 'U': Upper triangular | |||
| *> = 'L': Lower triangular | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] TRANS | |||
| *> \verbatim | |||
| *> TRANS is CHARACTER*1 | |||
| *> Specifies the operation applied to A. | |||
| *> = 'N': Solve A * x = s*b (No transpose) | |||
| *> = 'T': Solve A**T* x = s*b (Transpose) | |||
| *> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] DIAG | |||
| *> \verbatim | |||
| *> DIAG is CHARACTER*1 | |||
| *> Specifies whether or not the matrix A is unit triangular. | |||
| *> = 'N': Non-unit triangular | |||
| *> = 'U': Unit triangular | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] NORMIN | |||
| *> \verbatim | |||
| *> NORMIN is CHARACTER*1 | |||
| *> Specifies whether CNORM has been set or not. | |||
| *> = 'Y': CNORM contains the column norms on entry | |||
| *> = 'N': CNORM is not set on entry. On exit, the norms will | |||
| *> be computed and stored in CNORM. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] N | |||
| *> \verbatim | |||
| *> N is INTEGER | |||
| *> The order of the matrix A. N >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] NRHS | |||
| *> \verbatim | |||
| *> NRHS is INTEGER | |||
| *> The number of columns of X. NRHS >= 0. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] A | |||
| *> \verbatim | |||
| *> A is DOUBLE PRECISION array, dimension (LDA,N) | |||
| *> The triangular matrix A. If UPLO = 'U', the leading n by n | |||
| *> upper triangular part of the array A contains the upper | |||
| *> triangular matrix, and the strictly lower triangular part of | |||
| *> A is not referenced. If UPLO = 'L', the leading n by n lower | |||
| *> triangular part of the array A contains the lower triangular | |||
| *> matrix, and the strictly upper triangular part of A is not | |||
| *> referenced. If DIAG = 'U', the diagonal elements of A are | |||
| *> also not referenced and are assumed to be 1. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LDA | |||
| *> \verbatim | |||
| *> LDA is INTEGER | |||
| *> The leading dimension of the array A. LDA >= max (1,N). | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in,out] X | |||
| *> \verbatim | |||
| *> X is DOUBLE PRECISION array, dimension (LDX,NRHS) | |||
| *> On entry, the right hand side B of the triangular system. | |||
| *> On exit, X is overwritten by the solution matrix X. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LDX | |||
| *> \verbatim | |||
| *> LDX is INTEGER | |||
| *> The leading dimension of the array X. LDX >= max (1,N). | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[out] SCALE | |||
| *> \verbatim | |||
| *> SCALE is DOUBLE PRECISION array, dimension (NRHS) | |||
| *> The scaling factor s(k) is for the triangular system | |||
| *> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). | |||
| *> If SCALE = 0, the matrix A is singular or badly scaled. | |||
| *> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) | |||
| *> that is an exact or approximate solution to A*x(:,k) = 0 | |||
| *> is returned. If the system is so badly scaled that the solution | |||
| *> cannot be represented as x(:,k) * 1/s(k), then x(:,k) = 0 | |||
| *> is returned. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in,out] CNORM | |||
| *> \verbatim | |||
| *> CNORM is DOUBLE PRECISION array, dimension (N) | |||
| *> | |||
| *> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) | |||
| *> contains the norm of the off-diagonal part of the j-th column | |||
| *> of A. If TRANS = 'N', CNORM(j) must be greater than or equal | |||
| *> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) | |||
| *> must be greater than or equal to the 1-norm. | |||
| *> | |||
| *> If NORMIN = 'N', CNORM is an output argument and CNORM(j) | |||
| *> returns the 1-norm of the offdiagonal part of the j-th column | |||
| *> of A. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[out] WORK | |||
| *> \verbatim | |||
| *> WORK is DOUBLE PRECISION array, dimension (LWORK). | |||
| *> On exit, if INFO = 0, WORK(1) returns the optimal size of | |||
| *> WORK. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[in] LWORK | |||
| *> \verbatim | |||
| *> LWORK is INTEGER | |||
| *> The dimension of the array WORK. | |||
| *> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32))), where | |||
| *> NBA = (N + NB - 1)/NB and NB is the optimal block size. | |||
| *> | |||
| *> If LWORK = -1, then a workspace query is assumed; the routine | |||
| *> only calculates the optimal dimensions of the WORK array, returns | |||
| *> this value as the first entry of the WORK array, and no error | |||
| *> message related to LWORK is issued by XERBLA. | |||
| *> \endverbatim | |||
| *> | |||
| *> \param[out] INFO | |||
| *> \verbatim | |||
| *> INFO is INTEGER | |||
| *> = 0: successful exit | |||
| *> < 0: if INFO = -k, the k-th argument had an illegal value | |||
| *> \endverbatim | |||
| * | |||
| * Authors: | |||
| * ======== | |||
| * | |||
| *> \author Univ. of Tennessee | |||
| *> \author Univ. of California Berkeley | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \ingroup doubleOTHERauxiliary | |||
| *> \par Further Details: | |||
| * ===================== | |||
| * \verbatim | |||
| * The algorithm follows the structure of a block triangular solve. | |||
| * The diagonal block is solved with a call to the robust triangular | |||
| * solver LATRS for every right-hand side RHS = 1, ..., NRHS | |||
| * op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), | |||
| * where op( A ) = A or op( A ) = A**T. | |||
| * The linear block updates operate on block columns of X, | |||
| * B( I, K ) - op(A( I, J )) * X( J, K ) | |||
| * and use GEMM. To avoid overflow in the linear block update, the worst case | |||
| * growth is estimated. For every RHS, a scale factor s <= 1.0 is computed | |||
| * such that | |||
| * || s * B( I, RHS )||_oo | |||
| * + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold | |||
| * | |||
| * Once all columns of a block column have been rescaled (BLAS-1), the linear | |||
| * update is executed with GEMM without overflow. | |||
| * | |||
| * To limit rescaling, local scale factors track the scaling of column segments. | |||
| * There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA | |||
| * per right-hand side column RHS = 1, ..., NRHS. The global scale factor | |||
| * SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) | |||
| * I = 1, ..., NBA. | |||
| * A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) | |||
| * updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The | |||
| * linear update of potentially inconsistently scaled vector segments | |||
| * s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) | |||
| * computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, | |||
| * if necessary, rescales the blocks prior to calling GEMM. | |||
| * | |||
| * \endverbatim | |||
| * ===================================================================== | |||
| * References: | |||
| * C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). | |||
| * Parallel robust solution of triangular linear systems. Concurrency | |||
| * and Computation: Practice and Experience, 31(19), e5064. | |||
| * | |||
| * Contributor: | |||
| * Angelika Schwarz, Umea University, Sweden. | |||
| * | |||
| * ===================================================================== | |||
| SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, | |||
| $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) | |||
| IMPLICIT NONE | |||
| * | |||
| * Purpose: solve the triangular systems | |||
| * op(A) * X(:,k) = SCALE(k) * B(:,k), k = 1, ..., NRHS, | |||
| * with op(A) = A or A**T, choosing SCALE(k) <= 1 so that the solve | |||
| * does not overflow. On entry X holds B; on exit it holds the | |||
| * (scaled) solution. Diagonal blocks are solved with DLATRS, the | |||
| * off-diagonal updates are done with DGEMM after rescaling. | |||
| * | |||
| * .. Scalar Arguments .. | |||
| CHARACTER DIAG, TRANS, NORMIN, UPLO | |||
| INTEGER INFO, LDA, LWORK, LDX, N, NRHS | |||
| * .. | |||
| * .. Array Arguments .. | |||
| DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), | |||
| $ SCALE( * ), WORK( * ) | |||
| * .. | |||
| * | |||
| * ===================================================================== | |||
| * | |||
| * .. Parameters .. | |||
| DOUBLE PRECISION ZERO, ONE | |||
| PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) | |||
| INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN | |||
| PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) | |||
| PARAMETER ( NBMIN = 8, NBMAX = 64 ) | |||
| * .. | |||
| * .. Local Arrays .. | |||
| DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) | |||
| * .. | |||
| * .. Local Scalars .. | |||
| LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER | |||
| INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, | |||
| $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, | |||
| $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS | |||
| DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, | |||
| $ SCAMIN, SMLNUM, TMAX | |||
| * .. | |||
| * .. External Functions .. | |||
| LOGICAL LSAME | |||
| INTEGER ILAENV | |||
| DOUBLE PRECISION DLAMCH, DLANGE, DLARMM | |||
| EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME | |||
| * .. | |||
| * .. External Subroutines .. | |||
| EXTERNAL DLATRS, DSCAL, XERBLA | |||
| * .. | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC ABS, MAX, MIN | |||
| * .. | |||
| * .. Executable Statements .. | |||
| * | |||
| INFO = 0 | |||
| UPPER = LSAME( UPLO, 'U' ) | |||
| NOTRAN = LSAME( TRANS, 'N' ) | |||
| NOUNIT = LSAME( DIAG, 'N' ) | |||
| LQUERY = ( LWORK.EQ.-1 ) | |||
| * | |||
| * Partition A and X into blocks | |||
| * | |||
| NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) | |||
| NB = MIN( NBMAX, NB ) | |||
| NBA = MAX( 1, (N + NB - 1) / NB ) | |||
| NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) | |||
| * | |||
| * Compute the workspace | |||
| * | |||
| * The workspace comprises two parts. | |||
| * The first part stores the local scale factors. Each simultaneously | |||
| * computed right-hand side requires one local scale factor per block | |||
| * row. WORK( I+(KK-1)*LDS ) is the scale factor of the vector | |||
| * segment associated with the I-th block row and the KK-th vector | |||
| * in the block column. With I = 1..NBA and KK = 1..MIN(NRHS,NBRHS) | |||
| * these entries occupy WORK( 1:NBA*MIN(NRHS,NBRHS) ), which lies | |||
| * inside WORK( 1:LSCALE ) and does not touch the second part. | |||
| LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) | |||
| LDS = NBA | |||
| * The second part stores upper bounds of the triangular A. There are | |||
| * a total of NBA x NBA blocks, of which only the upper triangular | |||
| * part or the lower triangular part is referenced. For TRANS = 'N' | |||
| * the upper bound of the block A( I, J ) is stored as | |||
| * WORK( AWRK + I + (J-1) * NBA ); otherwise it is stored transposed | |||
| * as WORK( AWRK + J + (I-1) * NBA ). | |||
| LANRM = NBA * NBA | |||
| AWRK = LSCALE | |||
| WORK( 1 ) = LSCALE + LANRM | |||
| * | |||
| * Test the input parameters | |||
| * | |||
| IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
| INFO = -1 | |||
| ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. | |||
| $ LSAME( TRANS, 'C' ) ) THEN | |||
| INFO = -2 | |||
| ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN | |||
| INFO = -3 | |||
| ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. | |||
| $ LSAME( NORMIN, 'N' ) ) THEN | |||
| INFO = -4 | |||
| ELSE IF( N.LT.0 ) THEN | |||
| INFO = -5 | |||
| ELSE IF( NRHS.LT.0 ) THEN | |||
| INFO = -6 | |||
| ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
| INFO = -8 | |||
| ELSE IF( LDX.LT.MAX( 1, N ) ) THEN | |||
| INFO = -10 | |||
| ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN | |||
| INFO = -14 | |||
| END IF | |||
| IF( INFO.NE.0 ) THEN | |||
| CALL XERBLA( 'DLATRS3', -INFO ) | |||
| RETURN | |||
| ELSE IF( LQUERY ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Initialize scaling factors | |||
| * | |||
| DO KK = 1, NRHS | |||
| SCALE( KK ) = ONE | |||
| END DO | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( MIN( N, NRHS ).EQ.0 ) | |||
| $ RETURN | |||
| * | |||
| * Determine machine dependent constant to control overflow. | |||
| * | |||
| BIGNUM = DLAMCH( 'Overflow' ) | |||
| SMLNUM = DLAMCH( 'Safe Minimum' ) | |||
| * | |||
| * Use unblocked code for small problems | |||
| * | |||
| IF( NRHS.LT.NRHSMIN ) THEN | |||
| CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), | |||
| $ SCALE( 1 ), CNORM, INFO ) | |||
| DO K = 2, NRHS | |||
| CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), | |||
| $ SCALE( K ), CNORM, INFO ) | |||
| END DO | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Compute norms of blocks of A excluding diagonal blocks and find | |||
| * the block with the largest norm TMAX. | |||
| * | |||
| TMAX = ZERO | |||
| DO J = 1, NBA | |||
| J1 = (J-1)*NB + 1 | |||
| J2 = MIN( J*NB, N ) + 1 | |||
| IF ( UPPER ) THEN | |||
| IFIRST = 1 | |||
| ILAST = J - 1 | |||
| ELSE | |||
| IFIRST = J + 1 | |||
| ILAST = NBA | |||
| END IF | |||
| DO I = IFIRST, ILAST | |||
| I1 = (I-1)*NB + 1 | |||
| I2 = MIN( I*NB, N ) + 1 | |||
| * | |||
| * Compute upper bound of A( I1:I2-1, J1:J2-1 ). | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) | |||
| WORK( AWRK + I+(J-1)*NBA ) = ANRM | |||
| ELSE | |||
| ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) | |||
| WORK( AWRK + J+(I-1)*NBA ) = ANRM | |||
| END IF | |||
| TMAX = MAX( TMAX, ANRM ) | |||
| END DO | |||
| END DO | |||
| * | |||
| * The test is written as .NOT.( TMAX <= BIGNUM ) so that a NaN in | |||
| * TMAX also takes the fallback branch. | |||
| * | |||
| IF( .NOT. TMAX.LE.BIGNUM ) THEN | |||
| * | |||
| * Some matrix entries have huge absolute value. At least one upper | |||
| * bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point | |||
| * number, either due to overflow in LANGE or due to Inf in A. | |||
| * Fall back to LATRS. Set normin = 'N' for every right-hand side to | |||
| * force computation of TSCAL in LATRS to avoid the likely overflow | |||
| * in the computation of the column norms CNORM. | |||
| * | |||
| DO K = 1, NRHS | |||
| CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), | |||
| $ SCALE( K ), CNORM, INFO ) | |||
| END DO | |||
| RETURN | |||
| END IF | |||
| * | |||
| * Every right-hand side requires workspace to store NBA local scale | |||
| * factors. To save workspace, X is computed successively in block columns | |||
| * of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient | |||
| * workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. | |||
| DO K = 1, NBX | |||
| * Loop over block columns (index = K) of X and, for column-wise scalings, | |||
| * over individual columns (index = KK). | |||
| * K1: column index of the first column in X( J, K ) | |||
| * K2: column index of the first column in X( J, K+1 ) | |||
| * so the K2 - K1 is the column count of the block X( J, K ) | |||
| * Within the block column, KK = 1, ..., K2-K1 is the local column | |||
| * index and RHS = K1 + KK - 1 is the matching column of X and SCALE. | |||
| K1 = (K-1)*NBRHS + 1 | |||
| K2 = MIN( K*NBRHS, NRHS ) + 1 | |||
| * | |||
| * Initialize local scaling factors of current block column X( J, K ) | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| DO I = 1, NBA | |||
| WORK( I+(KK-1)*LDS ) = ONE | |||
| END DO | |||
| END DO | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| * | |||
| * Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) | |||
| * | |||
| IF( UPPER ) THEN | |||
| JFIRST = NBA | |||
| JLAST = 1 | |||
| JINC = -1 | |||
| ELSE | |||
| JFIRST = 1 | |||
| JLAST = NBA | |||
| JINC = 1 | |||
| END IF | |||
| ELSE | |||
| * | |||
| * Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) | |||
| * | |||
| IF( UPPER ) THEN | |||
| JFIRST = 1 | |||
| JLAST = NBA | |||
| JINC = 1 | |||
| ELSE | |||
| JFIRST = NBA | |||
| JLAST = 1 | |||
| JINC = -1 | |||
| END IF | |||
| END IF | |||
| * | |||
| DO J = JFIRST, JLAST, JINC | |||
| * J1: row index of the first row in A( J, J ) | |||
| * J2: row index of the first row in A( J+1, J+1 ) | |||
| * so that J2 - J1 is the row count of the block A( J, J ) | |||
| J1 = (J-1)*NB + 1 | |||
| J2 = MIN( J*NB, N ) + 1 | |||
| * | |||
| * Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) | |||
| * for all right-hand sides in the current block column, | |||
| * one RHS at a time. | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| * The first call computes the column norms of the diagonal | |||
| * block into CNORM; later calls for the same block reuse them. | |||
| IF( KK.EQ.1 ) THEN | |||
| CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, | |||
| $ A( J1, J1 ), LDA, X( J1, RHS ), | |||
| $ SCALOC, CNORM, INFO ) | |||
| ELSE | |||
| CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, | |||
| $ A( J1, J1 ), LDA, X( J1, RHS ), | |||
| $ SCALOC, CNORM, INFO ) | |||
| END IF | |||
| * Find largest absolute value entry in the vector segment | |||
| * X( J1:J2-1, RHS ) as an upper bound for the worst case | |||
| * growth in the linear updates. | |||
| XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), | |||
| $ LDX, W ) | |||
| * | |||
| IF( SCALOC .EQ. ZERO ) THEN | |||
| * LATRS found that A is singular through A(j,j) = 0. | |||
| * Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 | |||
| * and compute A*x = 0 (or A**T*x = 0). Note that | |||
| * X(J1:J2-1, RHS) is set by LATRS. | |||
| * The column of X is addressed by the global index RHS, | |||
| * not by the local index KK. | |||
| SCALE( RHS ) = ZERO | |||
| DO II = 1, J1-1 | |||
| X( II, RHS ) = ZERO | |||
| END DO | |||
| DO II = J2, N | |||
| X( II, RHS ) = ZERO | |||
| END DO | |||
| * Discard the local scale factors. | |||
| DO II = 1, NBA | |||
| WORK( II+(KK-1)*LDS ) = ONE | |||
| END DO | |||
| SCALOC = ONE | |||
| ELSE IF( SCALOC * WORK( J+(KK-1)*LDS ) .EQ. ZERO ) THEN | |||
| * LATRS computed a valid scale factor, but combined with | |||
| * the current scaling the solution does not have a | |||
| * scale factor > 0. | |||
| * | |||
| * Set WORK( J+(KK-1)*LDS ) to smallest valid scale | |||
| * factor and increase SCALOC accordingly. | |||
| SCAL = WORK( J+(KK-1)*LDS ) / SMLNUM | |||
| SCALOC = SCALOC * SCAL | |||
| WORK( J+(KK-1)*LDS ) = SMLNUM | |||
| * If LATRS overestimated the growth, x may be | |||
| * rescaled to preserve a valid combined scale | |||
| * factor WORK( J, KK ) > 0. | |||
| RSCAL = ONE / SCALOC | |||
| IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN | |||
| XNRM( KK ) = XNRM( KK ) * RSCAL | |||
| CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) | |||
| SCALOC = ONE | |||
| ELSE | |||
| * The system op(A) * x = b is badly scaled and its | |||
| * solution cannot be represented as (1/scale) * x. | |||
| * Set x to zero. This approach deviates from LATRS | |||
| * where a completely meaningless non-zero vector | |||
| * is returned that is not a solution to op(A) * x = b. | |||
| SCALE( RHS ) = ZERO | |||
| DO II = 1, N | |||
| X( II, RHS ) = ZERO | |||
| END DO | |||
| * Discard the local scale factors. | |||
| DO II = 1, NBA | |||
| WORK( II+(KK-1)*LDS ) = ONE | |||
| END DO | |||
| SCALOC = ONE | |||
| END IF | |||
| END IF | |||
| SCALOC = SCALOC * WORK( J+(KK-1)*LDS ) | |||
| WORK( J+(KK-1)*LDS ) = SCALOC | |||
| END DO | |||
| * | |||
| * Linear block updates | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| IF( UPPER ) THEN | |||
| IFIRST = J - 1 | |||
| ILAST = 1 | |||
| IINC = -1 | |||
| ELSE | |||
| IFIRST = J + 1 | |||
| ILAST = NBA | |||
| IINC = 1 | |||
| END IF | |||
| ELSE | |||
| IF( UPPER ) THEN | |||
| IFIRST = J + 1 | |||
| ILAST = NBA | |||
| IINC = 1 | |||
| ELSE | |||
| IFIRST = J - 1 | |||
| ILAST = 1 | |||
| IINC = -1 | |||
| END IF | |||
| END IF | |||
| * | |||
| DO I = IFIRST, ILAST, IINC | |||
| * I1: row index of the first row in X( I, K ) | |||
| * I2: row index of the first row in X( I+1, K ) | |||
| * so the I2 - I1 is the row count of the block X( I, K ) | |||
| I1 = (I-1)*NB + 1 | |||
| I2 = MIN( I*NB, N ) + 1 | |||
| * | |||
| * Prepare the linear update to be executed with GEMM. | |||
| * For each column, compute a consistent scaling, a | |||
| * scaling factor to survive the linear update, and | |||
| * rescale the column segments, if necessary. Then | |||
| * the linear update is safely executed. | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| * Compute consistent scaling | |||
| SCAMIN = MIN( WORK( I+(KK-1)*LDS ), | |||
| $ WORK( J+(KK-1)*LDS ) ) | |||
| * | |||
| * Compute scaling factor to survive the linear update | |||
| * simulating consistent scaling. | |||
| * | |||
| BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) | |||
| BNRM = BNRM*( SCAMIN / WORK( I+(KK-1)*LDS ) ) | |||
| XNRM( KK ) = XNRM( KK ) | |||
| $ *( SCAMIN / WORK( J+(KK-1)*LDS ) ) | |||
| ANRM = WORK( AWRK + I+(J-1)*NBA ) | |||
| SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) | |||
| * | |||
| * Simultaneously apply the robust update factor and the | |||
| * consistency scaling factor to B( I, KK ) and B( J, KK ). | |||
| * | |||
| SCAL = ( SCAMIN / WORK( I+(KK-1)*LDS ) )*SCALOC | |||
| IF( SCAL.NE.ONE ) THEN | |||
| CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) | |||
| WORK( I+(KK-1)*LDS ) = SCAMIN*SCALOC | |||
| END IF | |||
| * | |||
| SCAL = ( SCAMIN / WORK( J+(KK-1)*LDS ) )*SCALOC | |||
| IF( SCAL.NE.ONE ) THEN | |||
| CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) | |||
| WORK( J+(KK-1)*LDS ) = SCAMIN*SCALOC | |||
| END IF | |||
| END DO | |||
| * | |||
| IF( NOTRAN ) THEN | |||
| * | |||
| * B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) | |||
| * | |||
| CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, | |||
| $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, | |||
| $ ONE, X( I1, K1 ), LDX ) | |||
| ELSE | |||
| * | |||
| * B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) | |||
| * | |||
| CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, | |||
| $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, | |||
| $ ONE, X( I1, K1 ), LDX ) | |||
| END IF | |||
| END DO | |||
| END DO | |||
| * | |||
| * Reduce local scaling factors | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| DO I = 1, NBA | |||
| SCALE( RHS ) = MIN( SCALE( RHS ), | |||
| $ WORK( I+(KK-1)*LDS ) ) | |||
| END DO | |||
| END DO | |||
| * | |||
| * Realize consistent scaling | |||
| * | |||
| DO KK = 1, K2-K1 | |||
| RHS = K1 + KK - 1 | |||
| IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN | |||
| DO I = 1, NBA | |||
| I1 = (I-1)*NB + 1 | |||
| I2 = MIN( I*NB, N ) + 1 | |||
| SCAL = SCALE( RHS ) / WORK( I+(KK-1)*LDS ) | |||
| IF( SCAL.NE.ONE ) | |||
| $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) | |||
| END DO | |||
| END IF | |||
| END DO | |||
| END DO | |||
| * | |||
| * WORK( 1 ) held a local scale factor during the blocked solve. | |||
| * Restore the workspace size so that it is returned on exit. | |||
| * | |||
| WORK( 1 ) = LSCALE + LANRM | |||
| RETURN | |||
| * | |||
| * End of DLATRS3 | |||
| * | |||
| END | |||